diff options
author | Zoltan Herczeg <hzmester@freemail.hu> | 2023-12-06 12:17:20 +0000 |
---|---|---|
committer | Zoltan Herczeg <hzmester@freemail.hu> | 2023-12-06 12:17:20 +0000 |
commit | 4e8fdb3e06d015c998f69ad3ea4f826b836aea0f (patch) | |
tree | 839f0a0e9004bcdadf7c04b06dd06ae7535d716e | |
parent | c3529d0227edcb614eec195eff309d4eb914dce5 (diff) | |
download | pcre-4e8fdb3e06d015c998f69ad3ea4f826b836aea0f.tar.gz |
JIT compiler update
-rw-r--r-- | src/pcre2_jit_simd_inc.h | 31 | ||||
-rw-r--r-- | src/sljit/sljitConfigInternal.h | 112 | ||||
-rw-r--r-- | src/sljit/sljitLir.c | 124 | ||||
-rw-r--r-- | src/sljit/sljitLir.h | 28 | ||||
-rw-r--r-- | src/sljit/sljitNativeARM_32.c | 104 | ||||
-rw-r--r-- | src/sljit/sljitNativeARM_64.c | 269 | ||||
-rw-r--r-- | src/sljit/sljitNativeARM_T2_32.c | 102 | ||||
-rw-r--r-- | src/sljit/sljitNativeLOONGARCH_64.c | 21 | ||||
-rw-r--r-- | src/sljit/sljitNativeMIPS_common.c | 33 | ||||
-rw-r--r-- | src/sljit/sljitNativeRISCV_32.c | 1 | ||||
-rw-r--r-- | src/sljit/sljitNativeRISCV_64.c | 4 | ||||
-rw-r--r-- | src/sljit/sljitNativeRISCV_common.c | 12 | ||||
-rw-r--r-- | src/sljit/sljitNativeS390X.c | 481 | ||||
-rw-r--r-- | src/sljit/sljitNativeX86_32.c | 44 | ||||
-rw-r--r-- | src/sljit/sljitNativeX86_64.c | 16 | ||||
-rw-r--r-- | src/sljit/sljitNativeX86_common.c | 366 |
16 files changed, 1293 insertions, 455 deletions
diff --git a/src/pcre2_jit_simd_inc.h b/src/pcre2_jit_simd_inc.h index c178d320..26d5e2e0 100644 --- a/src/pcre2_jit_simd_inc.h +++ b/src/pcre2_jit_simd_inc.h @@ -483,11 +483,7 @@ sljit_s32 cmp2a_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR3); sljit_s32 cmp1b_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR4); sljit_s32 cmp2b_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR5); sljit_s32 tmp1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR6); -#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) -sljit_s32 tmp2_ind = 0; -#else /* !SLJIT_CONFIG_X86_32 */ -sljit_s32 tmp2_ind = 4; -#endif /* SLJIT_CONFIG_X86_32 */ +sljit_s32 tmp2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_TMP_FR0); struct sljit_label *start; #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 struct sljit_label *restart; @@ -660,19 +656,7 @@ for (i = 0; i < 4; i++) fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind); } -/* PAND xmm1, xmm2/m128 */ -if (reg_type == SLJIT_SIMD_REG_256) - { - instruction[0] = 0xc5; - instruction[1] = (sljit_u8)(0xfd ^ (data1_ind << 3)); - } - -/* instruction[0] = 0x66 / 0xc5; */ -/* instruction[1] = 0x0f; */ -instruction[2] = 0xdb; -instruction[3] = 0xc0 | (data1_ind << 3) | data2_ind; -sljit_emit_op_custom(compiler, instruction, 4); - +sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR1); sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0); /* Ignore matches before the first STR_PTR. */ @@ -700,16 +684,7 @@ for (i = 0; i < 4; i++) fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind); } -/* PAND xmm1, xmm2/m128 */ -if (reg_type == SLJIT_SIMD_REG_256) - instruction[1] = (sljit_u8)(0xfd ^ (data1_ind << 3)); - -/* instruction[0] = 0x66 / 0xc5; */ -/* instruction[1] = 0x0f; */ -instruction[2] = 0xdb; -instruction[3] = 0xc0 | (data1_ind << 3) | data2_ind; -sljit_emit_op_custom(compiler, instruction, 4); - +sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR1); sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0); CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start); diff --git a/src/sljit/sljitConfigInternal.h b/src/sljit/sljitConfigInternal.h index 3d0e3da4..d224248c 100644 --- a/src/sljit/sljitConfigInternal.h +++ b/src/sljit/sljitConfigInternal.h @@ -72,6 +72,8 @@ extern "C" { SLJIT_NUMBER_OF_FLOAT_REGISTERS : number of available floating point registers SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS : number of available floating point scratch registers SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS : number of available floating point saved registers + SLJIT_NUMBER_OF_TEMPORARY_REGISTERS : number of available temporary registers + SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS : number of available temporary floating point registers SLJIT_WORD_SHIFT : the shift required to apply when accessing a sljit_sw/sljit_uw array by index SLJIT_F32_SHIFT : the shift required to apply when accessing a single precision floating point array by index @@ -81,8 +83,21 @@ extern "C" { the scratch register index of ecx is stored in this variable SLJIT_LOCALS_OFFSET : local space starting offset (SLJIT_SP + SLJIT_LOCALS_OFFSET) SLJIT_RETURN_ADDRESS_OFFSET : a return instruction always adds this offset to the return address + SLJIT_CONV_MAX_FLOAT : result when a floating point value is converted to integer + and the floating point value is higher than the maximum integer value + (possible values: SLJIT_CONV_RESULT_MAX_INT or SLJIT_CONV_RESULT_MIN_INT) + SLJIT_CONV_MIN_FLOAT : result when a floating point value is converted to integer + and the floating point value is lower than the minimum integer value + (possible values: SLJIT_CONV_RESULT_MAX_INT or SLJIT_CONV_RESULT_MIN_INT) + SLJIT_CONV_NAN_FLOAT : result when a NaN floating point value is converted to integer + (possible values: SLJIT_CONV_RESULT_MAX_INT, SLJIT_CONV_RESULT_MIN_INT, + or SLJIT_CONV_RESULT_ZERO) Other macros: + SLJIT_TMP_R0 .. R9 : accessing temporary registers + SLJIT_TMP_R(i) : accessing temporary registers + SLJIT_TMP_FR0 .. FR9 : accessing temporary floating point registers + SLJIT_TMP_FR(i) : accessing temporary floating point registers SLJIT_FUNC : calling convention attribute for both calling JIT from C and C calling back from JIT SLJIT_W(number) : defining 64 bit constants on 64 bit architectures (platform independent helper) SLJIT_F64_SECOND(reg) : provides the register index of the second 32 bit part of a 64 bit @@ -356,6 +371,38 @@ typedef double sljit_f64; #define SLJIT_F32_SHIFT 2 #define SLJIT_F64_SHIFT 3 +#define SLJIT_CONV_RESULT_MAX_INT 0 +#define SLJIT_CONV_RESULT_MIN_INT 1 +#define SLJIT_CONV_RESULT_ZERO 2 + +#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) +#define SLJIT_CONV_MAX_FLOAT SLJIT_CONV_RESULT_MIN_INT +#define SLJIT_CONV_MIN_FLOAT SLJIT_CONV_RESULT_MIN_INT +#define SLJIT_CONV_NAN_FLOAT SLJIT_CONV_RESULT_MIN_INT +#elif (defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM) +#define SLJIT_CONV_MAX_FLOAT SLJIT_CONV_RESULT_MAX_INT +#define SLJIT_CONV_MIN_FLOAT SLJIT_CONV_RESULT_MIN_INT +#define SLJIT_CONV_NAN_FLOAT SLJIT_CONV_RESULT_ZERO +#elif (defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS) +#define SLJIT_CONV_MAX_FLOAT SLJIT_CONV_RESULT_MAX_INT +#define SLJIT_CONV_MIN_FLOAT SLJIT_CONV_RESULT_MAX_INT +#define SLJIT_CONV_NAN_FLOAT SLJIT_CONV_RESULT_MAX_INT +#elif (defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC) +#define SLJIT_CONV_MAX_FLOAT SLJIT_CONV_RESULT_MAX_INT +#define SLJIT_CONV_MIN_FLOAT SLJIT_CONV_RESULT_MIN_INT +#define SLJIT_CONV_NAN_FLOAT SLJIT_CONV_RESULT_MIN_INT +#elif (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV) +#define SLJIT_CONV_MAX_FLOAT SLJIT_CONV_RESULT_MAX_INT +#define SLJIT_CONV_MIN_FLOAT SLJIT_CONV_RESULT_MIN_INT +#define SLJIT_CONV_NAN_FLOAT SLJIT_CONV_RESULT_MAX_INT +#elif (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X) +#define SLJIT_CONV_MAX_FLOAT SLJIT_CONV_RESULT_MAX_INT +#define SLJIT_CONV_MIN_FLOAT SLJIT_CONV_RESULT_MIN_INT +#define SLJIT_CONV_NAN_FLOAT SLJIT_CONV_RESULT_MIN_INT +#else +#error "Result for float to integer conversion is not defined" +#endif + #ifndef SLJIT_W /* Defining long constants. */ @@ -528,8 +575,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr); #define SLJIT_NUMBER_OF_REGISTERS 12 #define SLJIT_NUMBER_OF_SAVED_REGISTERS 7 +#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 1 #define SLJIT_NUMBER_OF_FLOAT_REGISTERS 7 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 0 +#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 1 #define SLJIT_LOCALS_OFFSET_BASE (8 * SSIZE_OF(sw)) #define SLJIT_PREF_SHIFT_REG SLJIT_R2 #define SLJIT_MASKED_SHIFT 1 @@ -538,7 +587,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr); #elif (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) #define SLJIT_NUMBER_OF_REGISTERS 13 +#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 2 #define SLJIT_NUMBER_OF_FLOAT_REGISTERS 15 +#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 1 #ifndef _WIN64 #define SLJIT_NUMBER_OF_SAVED_REGISTERS 6 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 0 @@ -556,16 +607,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr); #define SLJIT_NUMBER_OF_REGISTERS 12 #define SLJIT_NUMBER_OF_SAVED_REGISTERS 8 +#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 2 #define SLJIT_NUMBER_OF_FLOAT_REGISTERS 14 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 8 +#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 2 #define SLJIT_LOCALS_OFFSET_BASE 0 #elif (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64) #define SLJIT_NUMBER_OF_REGISTERS 26 #define SLJIT_NUMBER_OF_SAVED_REGISTERS 10 +#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 3 #define SLJIT_NUMBER_OF_FLOAT_REGISTERS 30 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 8 +#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 2 #define SLJIT_LOCALS_OFFSET_BASE (2 * (sljit_s32)sizeof(sljit_sw)) #define SLJIT_MASKED_SHIFT 1 #define SLJIT_MASKED_SHIFT32 1 @@ -574,8 +629,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr); #define SLJIT_NUMBER_OF_REGISTERS 23 #define SLJIT_NUMBER_OF_SAVED_REGISTERS 17 +#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 3 #define SLJIT_NUMBER_OF_FLOAT_REGISTERS 30 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 18 +#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 2 #if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) || (defined _AIX) #define SLJIT_LOCALS_OFFSET_BASE ((6 + 8) * (sljit_s32)sizeof(sljit_sw)) #elif (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32) @@ -598,6 +655,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr); #define SLJIT_NUMBER_OF_FLOAT_REGISTERS 29 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 8 #endif +#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 5 +#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 3 #define SLJIT_MASKED_SHIFT 1 #define SLJIT_MASKED_SHIFT32 1 @@ -605,9 +664,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr); #define SLJIT_NUMBER_OF_REGISTERS 23 #define SLJIT_NUMBER_OF_SAVED_REGISTERS 12 -#define SLJIT_LOCALS_OFFSET_BASE 0 +#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 5 #define SLJIT_NUMBER_OF_FLOAT_REGISTERS 30 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 12 +#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 2 +#define SLJIT_LOCALS_OFFSET_BASE 0 #define SLJIT_MASKED_SHIFT 1 #define SLJIT_MASKED_SHIFT32 1 @@ -636,8 +697,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr); #define SLJIT_NUMBER_OF_REGISTERS 12 #define SLJIT_NUMBER_OF_SAVED_REGISTERS 8 +#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 3 #define SLJIT_NUMBER_OF_FLOAT_REGISTERS 15 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 8 +#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 1 #define SLJIT_LOCALS_OFFSET_BASE SLJIT_S390X_DEFAULT_STACK_FRAME_SIZE #define SLJIT_MASKED_SHIFT 1 @@ -645,9 +708,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr); #define SLJIT_NUMBER_OF_REGISTERS 23 #define SLJIT_NUMBER_OF_SAVED_REGISTERS 10 -#define SLJIT_LOCALS_OFFSET_BASE 0 +#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 5 #define SLJIT_NUMBER_OF_FLOAT_REGISTERS 30 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 12 +#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 2 +#define SLJIT_LOCALS_OFFSET_BASE 0 #define SLJIT_MASKED_SHIFT 1 #define SLJIT_MASKED_SHIFT32 1 @@ -656,8 +721,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr); /* Just to have something. */ #define SLJIT_NUMBER_OF_REGISTERS 0 #define SLJIT_NUMBER_OF_SAVED_REGISTERS 0 +#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 0 #define SLJIT_NUMBER_OF_FLOAT_REGISTERS 0 #define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 0 +#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 0 #define SLJIT_LOCALS_OFFSET_BASE 0 #endif @@ -670,6 +737,45 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr); #define SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS \ (SLJIT_NUMBER_OF_FLOAT_REGISTERS - SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS) +/**********************************/ +/* Temporary register management. */ +/**********************************/ + +#define SLJIT_TMP_REGISTER_BASE (SLJIT_NUMBER_OF_REGISTERS + 2) +#define SLJIT_TMP_FREGISTER_BASE (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1) + +/* WARNING: Accessing temporary registers is not recommended, because they + are also used by the JIT compiler for various computations. Using them + might have any side effects including incorrect operations and crashes, + so use them at your own risk. The machine registers themselves might have + limitations, e.g. the r0 register on s390x / ppc cannot be used as + base address for memory operations. */ + +/* Temporary registers */ +#define SLJIT_TMP_R0 (SLJIT_TMP_REGISTER_BASE + 0) +#define SLJIT_TMP_R1 (SLJIT_TMP_REGISTER_BASE + 1) +#define SLJIT_TMP_R2 (SLJIT_TMP_REGISTER_BASE + 2) +#define SLJIT_TMP_R3 (SLJIT_TMP_REGISTER_BASE + 3) +#define SLJIT_TMP_R4 (SLJIT_TMP_REGISTER_BASE + 4) +#define SLJIT_TMP_R5 (SLJIT_TMP_REGISTER_BASE + 5) +#define SLJIT_TMP_R6 (SLJIT_TMP_REGISTER_BASE + 6) +#define SLJIT_TMP_R7 (SLJIT_TMP_REGISTER_BASE + 7) +#define SLJIT_TMP_R8 (SLJIT_TMP_REGISTER_BASE + 8) +#define SLJIT_TMP_R9 (SLJIT_TMP_REGISTER_BASE + 9) +#define SLJIT_TMP_R(i) (SLJIT_TMP_REGISTER_BASE + (i)) + +#define SLJIT_TMP_FR0 (SLJIT_TMP_FREGISTER_BASE + 0) +#define SLJIT_TMP_FR1 (SLJIT_TMP_FREGISTER_BASE + 1) +#define SLJIT_TMP_FR2 (SLJIT_TMP_FREGISTER_BASE + 2) +#define SLJIT_TMP_FR3 (SLJIT_TMP_FREGISTER_BASE + 3) +#define SLJIT_TMP_FR4 (SLJIT_TMP_FREGISTER_BASE + 4) +#define SLJIT_TMP_FR5 (SLJIT_TMP_FREGISTER_BASE + 5) +#define SLJIT_TMP_FR6 (SLJIT_TMP_FREGISTER_BASE + 6) +#define SLJIT_TMP_FR7 (SLJIT_TMP_FREGISTER_BASE + 7) +#define SLJIT_TMP_FR8 (SLJIT_TMP_FREGISTER_BASE + 8) +#define SLJIT_TMP_FR9 (SLJIT_TMP_FREGISTER_BASE + 9) +#define SLJIT_TMP_FR(i) (SLJIT_TMP_FREGISTER_BASE + (i)) + /********************************/ /* CPU status flags management. */ /********************************/ @@ -690,7 +796,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr); #if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) \ || (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32) #define SLJIT_F64_SECOND(reg) \ - ((reg) + SLJIT_FS0) + ((reg) + SLJIT_FS0 + SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS) #else /* !SLJIT_CONFIG_ARM_32 && !SLJIT_CONFIG_MIPS_32 */ #define SLJIT_F64_SECOND(reg) \ (reg) diff --git a/src/sljit/sljitLir.c b/src/sljit/sljitLir.c index 564eb736..6f193000 100644 --- a/src/sljit/sljitLir.c +++ b/src/sljit/sljitLir.c @@ -142,11 +142,15 @@ #define SLJIT_KEPT_SAVEDS_COUNT(options) ((options) & 0x3) /* Getters for simd operations, which returns with log2(size). */ +#define SLJIT_SIMD_GET_OPCODE(type) ((type) & 0xff) #define SLJIT_SIMD_GET_REG_SIZE(type) (((type) >> 12) & 0x3f) #define SLJIT_SIMD_GET_ELEM_SIZE(type) (((type) >> 18) & 0x3f) -#define SLJIT_SIMD_GET_ALIGNMENT(type) (((type) >> 24) & 0x3f) #define SLJIT_SIMD_GET_ELEM2_SIZE(type) (((type) >> 24) & 0x3f) +#define SLJIT_SIMD_CHECK_REG(type) (((type) & 0x3f000) >= SLJIT_SIMD_REG_64 && ((type) & 0x3f000) <= SLJIT_SIMD_REG_512) +#define SLJIT_SIMD_TYPE_MASK(m) ((sljit_s32)0xff000fff & ~(SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST | (m))) +#define SLJIT_SIMD_TYPE_MASK2(m) ((sljit_s32)0xc0000fff & ~(SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST | (m))) + /* Jump flags. */ #define JUMP_LABEL 0x1 #define JUMP_ADDR 0x2 @@ -844,7 +848,8 @@ static sljit_s32 function_check_arguments(sljit_s32 arg_types, sljit_s32 scratch #define FUNCTION_CHECK_IS_REG(r) \ (((r) >= SLJIT_R0 && (r) < (SLJIT_R0 + compiler->scratches)) \ - || ((r) > (SLJIT_S0 - compiler->saveds) && (r) <= SLJIT_S0)) + || ((r) > (SLJIT_S0 - compiler->saveds) && (r) <= SLJIT_S0) \ + || ((r) >= SLJIT_TMP_REGISTER_BASE && (r) < (SLJIT_TMP_REGISTER_BASE + SLJIT_NUMBER_OF_TEMPORARY_REGISTERS))) #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) #define CHECK_IF_VIRTUAL_REGISTER(p) ((p) <= SLJIT_S3 && (p) >= SLJIT_S8) @@ -952,7 +957,8 @@ static sljit_s32 function_check_is_freg(struct sljit_compiler *compiler, sljit_s return 0; return (fr >= SLJIT_FR0 && fr < (SLJIT_FR0 + compiler->fscratches)) - || (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0); + || (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0) + || (fr >= SLJIT_TMP_FREGISTER_BASE && fr < (SLJIT_TMP_FREGISTER_BASE + SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS)); } #define FUNCTION_FCHECK(p, i, is_32) \ @@ -964,7 +970,8 @@ static sljit_s32 function_fcheck(struct sljit_compiler *compiler, sljit_s32 p, s return 0; if ((p >= SLJIT_FR0 && p < (SLJIT_FR0 + compiler->fscratches)) - || (p > (SLJIT_FS0 - compiler->fsaveds) && p <= SLJIT_FS0)) + || (p > (SLJIT_FS0 - compiler->fsaveds) && p <= SLJIT_FS0) + || (p >= SLJIT_TMP_FREGISTER_BASE && p < (SLJIT_TMP_FREGISTER_BASE + SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS))) return (i == 0); return function_check_src_mem(compiler, p, i); @@ -999,26 +1006,30 @@ static void sljit_verbose_reg(struct sljit_compiler *compiler, sljit_s32 r) { if (r < (SLJIT_R0 + compiler->scratches)) fprintf(compiler->verbose, "r%d", r - SLJIT_R0); - else if (r != SLJIT_SP) + else if (r < SLJIT_SP) fprintf(compiler->verbose, "s%d", SLJIT_NUMBER_OF_REGISTERS - r); - else + else if (r == SLJIT_SP) fprintf(compiler->verbose, "sp"); + else + fprintf(compiler->verbose, "t%d", r - SLJIT_TMP_REGISTER_BASE); } static void sljit_verbose_freg(struct sljit_compiler *compiler, sljit_s32 r) { #if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) \ || (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32) - if (r >= (SLJIT_FS0 + SLJIT_FR0) && r <= (SLJIT_FS0 + SLJIT_FS0)) { + if (r >= SLJIT_F64_SECOND(SLJIT_FR0)) { fprintf(compiler->verbose, "^"); - r -= SLJIT_FS0; + r -= SLJIT_F64_SECOND(0); } #endif /* SLJIT_CONFIG_ARM_32 || SLJIT_CONFIG_MIPS_32 */ if (r < (SLJIT_FR0 + compiler->fscratches)) fprintf(compiler->verbose, "fr%d", r - SLJIT_FR0); - else + else if (r < SLJIT_TMP_FREGISTER_BASE) fprintf(compiler->verbose, "fs%d", SLJIT_NUMBER_OF_FLOAT_REGISTERS - r); + else + fprintf(compiler->verbose, "ft%d", r - SLJIT_TMP_FREGISTER_BASE); } static void sljit_verbose_param(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i) @@ -1121,6 +1132,10 @@ static const char* fop2r_names[] = { "copysign" }; +static const char* simd_op2_names[] = { + "and", "or", "xor" +}; + static const char* jump_names[] = { "equal", "not_equal", "less", "greater_equal", @@ -1705,10 +1720,12 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_get_register_index(sljit_s32 t SLJIT_UNUSED_ARG(reg); #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) if (type == SLJIT_GP_REGISTER) { - CHECK_ARGUMENT(reg > 0 && reg <= SLJIT_NUMBER_OF_REGISTERS); + CHECK_ARGUMENT((reg > 0 && reg <= SLJIT_NUMBER_OF_REGISTERS) + || (reg >= SLJIT_TMP_REGISTER_BASE && reg <= (SLJIT_TMP_REGISTER_BASE + SLJIT_NUMBER_OF_TEMPORARY_REGISTERS))); } else { CHECK_ARGUMENT(type == SLJIT_FLOAT_REGISTER || ((type >> 12) == 0 || ((type >> 12) >= 3 && (type >> 12) <= 6))); - CHECK_ARGUMENT(reg > 0 && reg <= SLJIT_NUMBER_OF_FLOAT_REGISTERS); + CHECK_ARGUMENT((reg > 0 && reg <= SLJIT_NUMBER_OF_FLOAT_REGISTERS) + || (reg >= SLJIT_TMP_FREGISTER_BASE && reg <= (SLJIT_TMP_FREGISTER_BASE + SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS))); } #endif CHECK_RETURN_OK; @@ -1980,7 +1997,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_fset64(struct sljit_compi if (SLJIT_UNLIKELY(!!compiler->verbose)) { fprintf(compiler->verbose, " fset64 "); sljit_verbose_freg(compiler, freg); - fprintf(compiler->verbose, ", %lf\n", value); + fprintf(compiler->verbose, ", %f\n", value); } #endif CHECK_RETURN_OK; @@ -2590,10 +2607,10 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_mov(struct sljit_com { #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD)); - CHECK_ARGUMENT((type & (sljit_s32)(0xc0000fff - (SLJIT_SIMD_STORE | SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST))) == 0); - CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512); + CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK2(SLJIT_SIMD_STORE)) == 0); + CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type)); CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) <= SLJIT_SIMD_GET_REG_SIZE(type)); - CHECK_ARGUMENT(SLJIT_SIMD_GET_ALIGNMENT(type) <= (srcdst & SLJIT_MEM) ? SLJIT_SIMD_GET_REG_SIZE(type) : 0); + CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM2_SIZE(type) <= (srcdst & SLJIT_MEM) ? SLJIT_SIMD_GET_REG_SIZE(type) : 0); CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0)); FUNCTION_FCHECK(srcdst, srcdstw, 0); #endif @@ -2615,7 +2632,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_mov(struct sljit_com if ((type & 0x3f000000) == SLJIT_SIMD_MEM_UNALIGNED) fprintf(compiler->verbose, ".unal "); else - fprintf(compiler->verbose, ".al%d ", (8 << SLJIT_SIMD_GET_ALIGNMENT(type))); + fprintf(compiler->verbose, ".al%d ", (8 << SLJIT_SIMD_GET_ELEM2_SIZE(type))); sljit_verbose_freg(compiler, freg); fprintf(compiler->verbose, ", "); @@ -2632,8 +2649,8 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_replicate(struct slj { #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD)); - CHECK_ARGUMENT((type & (sljit_s32)(0xff000fff - (SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST))) == 0); - CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512); + CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK(0)) == 0); + CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type)); CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type)); CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0)); @@ -2679,11 +2696,11 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_lane_mov(struct slji { #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD)); - CHECK_ARGUMENT((type & (sljit_s32)(0xff000fff - (SLJIT_SIMD_STORE | SLJIT_SIMD_LANE_ZERO | SLJIT_SIMD_LANE_SIGNED | SLJIT_32 | SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST))) == 0); + CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK(SLJIT_SIMD_STORE | SLJIT_SIMD_LANE_ZERO | SLJIT_SIMD_LANE_SIGNED | SLJIT_32)) == 0); CHECK_ARGUMENT((type & (SLJIT_SIMD_STORE | SLJIT_SIMD_LANE_ZERO)) != (SLJIT_SIMD_STORE | SLJIT_SIMD_LANE_ZERO)); CHECK_ARGUMENT((type & (SLJIT_SIMD_STORE | SLJIT_SIMD_LANE_SIGNED)) != SLJIT_SIMD_LANE_SIGNED); CHECK_ARGUMENT(!(type & SLJIT_SIMD_FLOAT) || !(type & (SLJIT_SIMD_LANE_SIGNED | SLJIT_32))); - CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512); + CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type)); CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type)); CHECK_ARGUMENT(!(type & SLJIT_32) || SLJIT_SIMD_GET_ELEM_SIZE(type) <= 2); CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0)); @@ -2731,8 +2748,8 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_lane_replicate(struc { #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD)); - CHECK_ARGUMENT((type & (sljit_s32)(0xff000fff - (SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST))) == 0); - CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512); + CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK(0)) == 0); + CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type)); CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type)); CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0)); CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src, 0)); @@ -2767,9 +2784,9 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_extend(struct sljit_ { #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD)); - CHECK_ARGUMENT((type & (sljit_s32)(0xc0000fff - (SLJIT_SIMD_EXTEND_SIGNED | SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST))) == 0); + CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK2(SLJIT_SIMD_EXTEND_SIGNED)) == 0); CHECK_ARGUMENT((type & (SLJIT_SIMD_EXTEND_SIGNED | SLJIT_SIMD_FLOAT)) != (SLJIT_SIMD_EXTEND_SIGNED | SLJIT_SIMD_FLOAT)); - CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512); + CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type)); CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM2_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type)); CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_ELEM2_SIZE(type)); CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0)); @@ -2807,8 +2824,8 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_sign(struct sljit_co { #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD)); - CHECK_ARGUMENT((type & (sljit_s32)(0xff000fff - (SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST | SLJIT_32))) == SLJIT_SIMD_STORE); - CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512); + CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK(SLJIT_32)) == SLJIT_SIMD_STORE); + CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type)); CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type)); CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0)); FUNCTION_CHECK_DST(dst, dstw); @@ -2837,6 +2854,44 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_sign(struct sljit_co CHECK_RETURN_OK; } +static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) +{ +#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS) + CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD)); + CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK(0)) >= SLJIT_SIMD_OP2_AND && (type & SLJIT_SIMD_TYPE_MASK(0)) <= SLJIT_SIMD_OP2_XOR); + CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type)); + CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) <= SLJIT_SIMD_GET_REG_SIZE(type)); + CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(dst_freg, 0)); + CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src1_freg, 0)); + CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src2_freg, 0)); +#endif +#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) + if (SLJIT_UNLIKELY(!!compiler->verbose)) { + if (type & SLJIT_SIMD_TEST) + CHECK_RETURN_OK; + if (sljit_emit_simd_op2(compiler, type | SLJIT_SIMD_TEST, dst_freg, src1_freg, src2_freg) == SLJIT_ERR_UNSUPPORTED) { + fprintf(compiler->verbose, " # simd_op2: unsupported form, no instructions are emitted\n"); + CHECK_RETURN_OK; + } + + fprintf(compiler->verbose, " simd_%s.%d.%s%d ", + simd_op2_names[SLJIT_SIMD_GET_OPCODE(type) - 1], + (8 << SLJIT_SIMD_GET_REG_SIZE(type)), + (type & SLJIT_SIMD_FLOAT) ? "f" : "", + (8 << SLJIT_SIMD_GET_ELEM_SIZE(type))); + + sljit_verbose_freg(compiler, dst_freg); + fprintf(compiler->verbose, ", "); + sljit_verbose_freg(compiler, src1_freg); + fprintf(compiler->verbose, ", "); + sljit_verbose_freg(compiler, src2_freg); + fprintf(compiler->verbose, "\n"); + } +#endif + CHECK_RETURN_OK; +} + static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset) { /* Any offset is allowed. */ @@ -3232,7 +3287,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler #endif /* !SLJIT_CONFIG_ARM_64 && !SLJIT_CONFIG_PPC */ #if !(defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \ - && !(defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM) + && !(defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM) \ + && !(defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X) SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 freg, @@ -3325,6 +3381,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c return SLJIT_ERR_UNSUPPORTED; } +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) +{ + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + SLJIT_UNUSED_ARG(compiler); + SLJIT_UNUSED_ARG(type); + SLJIT_UNUSED_ARG(dst_freg); + SLJIT_UNUSED_ARG(src1_freg); + SLJIT_UNUSED_ARG(src2_freg); + + return SLJIT_ERR_UNSUPPORTED; +} + #endif /* !SLJIT_CONFIG_X86 && !SLJIT_CONFIG_ARM */ #if !(defined(SLJIT_CONFIG_X86) && SLJIT_CONFIG_X86) \ diff --git a/src/sljit/sljitLir.h b/src/sljit/sljitLir.h index 95d59e48..2ba6683c 100644 --- a/src/sljit/sljitLir.h +++ b/src/sljit/sljitLir.h @@ -1869,6 +1869,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler #define SLJIT_SIMD_ELEM_64 (3 << 18) /* Element size is 128 bit long */ #define SLJIT_SIMD_ELEM_128 (4 << 18) +/* Element size is 256 bit long */ +#define SLJIT_SIMD_ELEM_256 (5 << 18) /* The following options are used by sljit_emit_simd_mov(). */ @@ -2039,6 +2041,32 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c sljit_s32 freg, sljit_s32 dst, sljit_sw dstw); +/* The following options are used by sljit_emit_simd_op2(). */ + +/* Binary 'and' operation */ +#define SLJIT_SIMD_OP2_AND 0x000001 +/* Binary 'or' operation */ +#define SLJIT_SIMD_OP2_OR 0x000002 +/* Binary 'xor' operation */ +#define SLJIT_SIMD_OP2_XOR 0x000003 + +/* Perform simd operations using simd registers. + + If the operation is not supported, it returns with + SLJIT_ERR_UNSUPPORTED. If SLJIT_SIMD_TEST is passed, + it does not emit any instructions. + + type must be a combination of SLJIT_SIMD_* and SLJIT_SIMD_OP2_ + options except SLJIT_SIMD_LOAD and SLJIT_SIMD_STORE + dst_freg is the destination register of the operation + src1_freg is the first source register of the operation + src1_freg is the second source register of the operation + + Flags: - (does not modify flags) */ + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg); + /* The sljit_emit_atomic_load and sljit_emit_atomic_store operation pair can perform an atomic read-modify-write operation. First, an unsigned value must be loaded from memory using sljit_emit_atomic_load. Then, diff --git a/src/sljit/sljitNativeARM_32.c b/src/sljit/sljitNativeARM_32.c index 55c62b79..d44616d8 100644 --- a/src/sljit/sljitNativeARM_32.c +++ b/src/sljit/sljitNativeARM_32.c @@ -49,8 +49,8 @@ typedef sljit_u32 sljit_ins; #define TMP_REG2 (SLJIT_NUMBER_OF_REGISTERS + 3) #define TMP_PC (SLJIT_NUMBER_OF_REGISTERS + 4) -#define TMP_FREG1 ((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 1) -#define TMP_FREG2 ((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 2) +#define TMP_FREG1 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1) +#define TMP_FREG2 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) /* In ARM instruction words. Cache lines are usually 32 byte aligned. */ @@ -67,18 +67,20 @@ static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = { 0, 0, 1, 2, 3, 11, 10, 9, 8, 7, 6, 5, 4, 13, 12, 14, 15 }; -static const sljit_u8 freg_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3] = { +static const sljit_u8 freg_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) << 1) + 1] = { 0, 0, 1, 2, 3, 4, 5, 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 0, 1, 2, 3, 4, 5, 15, 14, 13, 12, 11, 10, 9, 8, - 6, 7 + 7, 6 }; -static const sljit_u8 freg_ebit_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3] = { +static const sljit_u8 freg_ebit_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) << 1) + 1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0 + 1, 1 }; #define RM(rm) ((sljit_ins)reg_map[rm]) @@ -144,6 +146,7 @@ static const sljit_u8 freg_ebit_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3] #define UXTH 0xe6ff0070 #define VABS_F32 0xeeb00ac0 #define VADD_F32 0xee300a00 +#define VAND 0xf2000110 #define VCMP_F32 0xeeb40a40 #define VCVT_F32_S32 0xeeb80ac0 #define VCVT_F32_U32 0xeeb80a40 @@ -152,6 +155,7 @@ static const sljit_u8 freg_ebit_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3] #define VDIV_F32 0xee800a00 #define VDUP 0xee800b10 #define VDUP_s 0xf3b00c00 +#define VEOR 0xf3000110 #define VLD1 0xf4200000 #define VLD1_r 0xf4a00c00 #define VLD1_s 0xf4a00000 @@ -190,11 +194,12 @@ static sljit_s32 function_check_is_freg(struct sljit_compiler *compiler, sljit_s if (compiler->scratches == -1) return 0; - if (is_32 && fr >= (SLJIT_FS0 + SLJIT_FR0) && fr <= (SLJIT_FS0 + SLJIT_FS0)) - fr -= SLJIT_FS0; + if (is_32 && fr >= SLJIT_F64_SECOND(SLJIT_FR0)) + fr -= SLJIT_F64_SECOND(0); return (fr >= SLJIT_FR0 && fr < (SLJIT_FR0 + compiler->fscratches)) - || (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0); + || (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0) + || (fr >= SLJIT_TMP_FREGISTER_BASE && fr < (SLJIT_TMP_FREGISTER_BASE + SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS)); } #endif /* SLJIT_ARGUMENT_CHECKS */ @@ -3776,7 +3781,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); - sljit_s32 alignment = SLJIT_SIMD_GET_ALIGNMENT(type); + sljit_s32 alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type); sljit_ins ins; CHECK_ERROR(); @@ -3806,7 +3811,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co ins = VD(freg) | VN(srcdst) | VM(srcdst); if (reg_size == 4) - ins |= (1 << 6); + ins |= (sljit_ins)1 << 6; return push_inst(compiler, VORR | ins); } @@ -3973,7 +3978,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil ins = (sljit_ins)(elem_size << 6); if (reg_size == 4) - ins |= 1 << 5; + ins |= (sljit_ins)1 << 5; return push_inst(compiler, VLD1_r | ins | VD(freg) | RN(src) | 0xf); } @@ -3983,7 +3988,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil ins = ((sljit_ins)freg_ebit_map[src] << (16 + 2 + 1)) | ((sljit_ins)1 << (16 + 2)); if (reg_size == 4) - ins |= 1 << 6; + ins |= (sljit_ins)1 << 6; return push_inst(compiler, VDUP_s | ins | VD(freg) | (sljit_ins)freg_map[src]); } @@ -4018,7 +4023,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil } if (reg_size == 4) - ins |= 1 << 21; + ins |= (sljit_ins)1 << 21; return push_inst(compiler, VDUP | ins | VN(freg) | RD(src)); } @@ -4064,8 +4069,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile } if (srcdst == freg || (elem_size == 3 && srcdst == (freg + SLJIT_QUAD_OTHER_HALF(freg)))) { - FAIL_IF(push_inst(compiler, VORR | ins | VD(TMP_FREG1) | VN(freg) | VM(freg))); - srcdst = TMP_FREG1; + FAIL_IF(push_inst(compiler, VORR | ins | VD(TMP_FREG2) | VN(freg) | VM(freg))); + srcdst = TMP_FREG2; srcdstw = 0; } } @@ -4184,7 +4189,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c ins = ((((sljit_ins)src_lane_index << 1) | 1) << (16 + elem_size)); if (reg_size == 4) - ins |= 1 << 6; + ins |= (sljit_ins)1 << 6; return push_inst(compiler, VDUP_s | ins | VD(freg) | VM(src)); } @@ -4226,7 +4231,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler src = simd_get_quad_reg_index(src); if (!(type & SLJIT_SIMD_FLOAT)) { - dst_reg = (reg_size == 4) ? freg : TMP_FREG1; + dst_reg = (reg_size == 4) ? freg : TMP_FREG2; do { FAIL_IF(push_inst(compiler, VSHLL | ((type & SLJIT_SIMD_EXTEND_SIGNED) ? 0 : (1 << 24)) @@ -4234,8 +4239,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler src = dst_reg; } while (++elem_size < elem2_size); - if (dst_reg == TMP_FREG1) - return push_inst(compiler, VORR | VD(freg) | VN(TMP_FREG1) | VM(TMP_FREG1)); + if (dst_reg == TMP_FREG2) + return push_inst(compiler, VORR | VD(freg) | VN(TMP_FREG2) | VM(TMP_FREG2)); return SLJIT_SUCCESS; } @@ -4298,30 +4303,30 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c if (reg_size == 4) { freg = simd_get_quad_reg_index(freg); - ins |= (1 << 6); + ins |= (sljit_ins)1 << 6; } - SLJIT_ASSERT((freg_map[TMP_FREG1] & 0x1) == 0); - FAIL_IF(push_inst(compiler, ins | VD(TMP_FREG1) | VM(freg))); + SLJIT_ASSERT((freg_map[TMP_FREG2] & 0x1) == 0); + FAIL_IF(push_inst(compiler, ins | VD(TMP_FREG2) | VM(freg))); if (reg_size == 4 && elem_size > 0) - FAIL_IF(push_inst(compiler, VMOVN | ((sljit_ins)(elem_size - 1) << 18) | VD(TMP_FREG1) | VM(TMP_FREG1))); + FAIL_IF(push_inst(compiler, VMOVN | ((sljit_ins)(elem_size - 1) << 18) | VD(TMP_FREG2) | VM(TMP_FREG2))); ins = (reg_size == 4 && elem_size == 0) ? (1 << 6) : 0; while (imms >= 0x100) { - FAIL_IF(push_inst(compiler, VSRA | (1 << 24) | ins | ((imms & 0xff) << 16) | VD(TMP_FREG1) | VM(TMP_FREG1))); + FAIL_IF(push_inst(compiler, VSRA | (1 << 24) | ins | ((imms & 0xff) << 16) | VD(TMP_FREG2) | VM(TMP_FREG2))); imms >>= 8; } - FAIL_IF(push_inst(compiler, VSRA | (1 << 24) | ins | (1 << 7) | (imms << 16) | VD(TMP_FREG1) | VM(TMP_FREG1))); + FAIL_IF(push_inst(compiler, VSRA | (1 << 24) | ins | (1 << 7) | (imms << 16) | VD(TMP_FREG2) | VM(TMP_FREG2))); dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1; - FAIL_IF(push_inst(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RD(dst_r) | VN(TMP_FREG1))); + FAIL_IF(push_inst(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RD(dst_r) | VN(TMP_FREG2))); if (reg_size == 4 && elem_size == 0) { - SLJIT_ASSERT(freg_map[TMP_FREG1] + 1 == freg_map[TMP_FREG2]); - FAIL_IF(push_inst(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RD(TMP_REG2) | VN(TMP_FREG2))); + SLJIT_ASSERT(freg_map[TMP_FREG2] + 1 == freg_map[TMP_FREG1]); + FAIL_IF(push_inst(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RD(TMP_REG2) | VN(TMP_FREG1))); FAIL_IF(push_inst(compiler, ORR | RD(dst_r) | RN(dst_r) | RM(TMP_REG2) | (0x8 << 7))); } @@ -4331,6 +4336,47 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c return SLJIT_SUCCESS; } +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_ins ins = 0; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + + if (reg_size != 3 && reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3)) + return SLJIT_ERR_UNSUPPORTED; + + switch (SLJIT_SIMD_GET_OPCODE(type)) { + case SLJIT_SIMD_OP2_AND: + ins = VAND; + break; + case SLJIT_SIMD_OP2_OR: + ins = VORR; + break; + case SLJIT_SIMD_OP2_XOR: + ins = VEOR; + break; + } + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + if (reg_size == 4) { + dst_freg = simd_get_quad_reg_index(dst_freg); + src1_freg = simd_get_quad_reg_index(src1_freg); + src2_freg = simd_get_quad_reg_index(src2_freg); + ins |= (sljit_ins)1 << 6; + } + + return push_inst(compiler, ins | VD(dst_freg) | VN(src1_freg) | VM(src2_freg)); +} + #undef FPU_LOAD SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op, diff --git a/src/sljit/sljitNativeARM_64.c b/src/sljit/sljitNativeARM_64.c index 8194670f..b268582f 100644 --- a/src/sljit/sljitNativeARM_64.c +++ b/src/sljit/sljitNativeARM_64.c @@ -67,121 +67,123 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = { /* Instrucion forms */ /* --------------------------------------------------------------------- */ -#define ADC 0x9a000000 -#define ADD 0x8b000000 -#define ADDE 0x8b200000 -#define ADDI 0x91000000 -#define AND 0x8a000000 -#define ANDI 0x92000000 -#define ASRV 0x9ac02800 -#define B 0x14000000 -#define B_CC 0x54000000 -#define BL 0x94000000 -#define BLR 0xd63f0000 -#define BR 0xd61f0000 -#define BRK 0xd4200000 -#define CAS 0xc8a07c00 -#define CASB 0x08a07c00 -#define CASH 0x48a07c00 -#define CBZ 0xb4000000 -#define CCMPI 0xfa400800 -#define CLZ 0xdac01000 -#define CSEL 0x9a800000 -#define CSINC 0x9a800400 -#define DUP_e 0x0e000400 -#define DUP_g 0x0e000c00 -#define EOR 0xca000000 -#define EORI 0xd2000000 -#define EXTR 0x93c00000 -#define FABS 0x1e60c000 -#define FADD 0x1e602800 -#define FCMP 0x1e602000 -#define FCSEL 0x1e600c00 -#define FCVT 0x1e224000 -#define FCVTL 0x0e217800 -#define FCVTZS 0x9e780000 -#define FDIV 0x1e601800 -#define FMOV 0x1e604000 -#define FMOV_R 0x9e660000 -#define FMOV_I 0x1e601000 -#define FMUL 0x1e600800 -#define FNEG 0x1e614000 -#define FSUB 0x1e603800 -#define INS 0x4e001c00 -#define INS_e 0x6e000400 -#define LD1 0x0c407000 -#define LD1_s 0x0d400000 -#define LD1R 0x0d40c000 -#define LDRI 0xf9400000 -#define LDRI_F64 0xfd400000 -#define LDRI_POST 0xf8400400 -#define LDP 0xa9400000 -#define LDP_F64 0x6d400000 -#define LDP_POST 0xa8c00000 -#define LDR_PRE 0xf8400c00 -#define LDXR 0xc85f7c00 -#define LDXRB 0x085f7c00 -#define LDXRH 0x485f7c00 -#define LSLV 0x9ac02000 -#define LSRV 0x9ac02400 -#define MADD 0x9b000000 -#define MOVI 0x0f000400 -#define MOVK 0xf2800000 -#define MOVN 0x92800000 -#define MOVZ 0xd2800000 -#define NOP 0xd503201f -#define ORN 0xaa200000 -#define ORR 0xaa000000 -#define ORR_v 0x0ea01c00 -#define ORRI 0xb2000000 -#define RBIT 0xdac00000 -#define RET 0xd65f0000 -#define REV 0xdac00c00 -#define REV16 0xdac00400 -#define RORV 0x9ac02c00 -#define SBC 0xda000000 -#define SBFM 0x93400000 -#define SCVTF 0x9e620000 -#define SDIV 0x9ac00c00 -#define SMADDL 0x9b200000 -#define SMOV 0x0e002c00 -#define SMULH 0x9b403c00 -#define SSHLL 0x0f00a400 -#define ST1 0x0c007000 -#define ST1_s 0x0d000000 -#define STP 0xa9000000 -#define STP_F64 0x6d000000 -#define STP_PRE 0xa9800000 -#define STRB 0x38206800 -#define STRBI 0x39000000 -#define STRI 0xf9000000 -#define STRI_F64 0xfd000000 -#define STR_FI 0x3d000000 -#define STR_FR 0x3c206800 -#define STUR_FI 0x3c000000 -#define STURBI 0x38000000 -#define STXR 0xc8007c00 -#define STXRB 0x8007c00 -#define STXRH 0x48007c00 -#define SUB 0xcb000000 -#define SUBI 0xd1000000 -#define SUBS 0xeb000000 -#define TBZ 0x36000000 -#define UBFM 0xd3400000 -#define UCVTF 0x9e630000 -#define UDIV 0x9ac00800 -#define UMOV 0x0e003c00 -#define UMULH 0x9bc03c00 -#define USHLL 0x2f00a400 -#define USHR 0x2f000400 -#define USRA 0x2f001400 -#define XTN 0x0e212800 - -#define CSET (CSINC | RM(TMP_ZERO) | RN(TMP_ZERO)) -#define LDR (STRI | (1 << 22)) -#define LDRB (STRBI | (1 << 22)) -#define LDRH (LDRB | (1 << 30)) -#define MOV (ORR | RN(TMP_ZERO)) +#define ADC 0x9a000000 +#define ADD 0x8b000000 +#define ADDE 0x8b200000 +#define ADDI 0x91000000 +#define AND 0x8a000000 +#define ANDI 0x92000000 +#define AND_v 0x0e201c00 +#define ASRV 0x9ac02800 +#define B 0x14000000 +#define B_CC 0x54000000 +#define BL 0x94000000 +#define BLR 0xd63f0000 +#define BR 0xd61f0000 +#define BRK 0xd4200000 +#define CAS 0xc8a07c00 +#define CASB 0x08a07c00 +#define CASH 0x48a07c00 +#define CBZ 0xb4000000 +#define CCMPI 0xfa400800 +#define CLZ 0xdac01000 +#define CSEL 0x9a800000 +#define CSINC 0x9a800400 +#define DUP_e 0x0e000400 +#define DUP_g 0x0e000c00 +#define EOR 0xca000000 +#define EOR_v 0x2e201c00 +#define EORI 0xd2000000 +#define EXTR 0x93c00000 +#define FABS 0x1e60c000 +#define FADD 0x1e602800 +#define FCMP 0x1e602000 +#define FCSEL 0x1e600c00 +#define FCVT 0x1e224000 +#define FCVTL 0x0e217800 +#define FCVTZS 0x9e780000 +#define FDIV 0x1e601800 +#define FMOV 0x1e604000 +#define FMOV_R 0x9e660000 +#define FMOV_I 0x1e601000 +#define FMUL 0x1e600800 +#define FNEG 0x1e614000 +#define FSUB 0x1e603800 +#define INS 0x4e001c00 +#define INS_e 0x6e000400 +#define LD1 0x0c407000 +#define LD1_s 0x0d400000 +#define LD1R 0x0d40c000 +#define LDRI 0xf9400000 +#define LDRI_F64 0xfd400000 +#define LDRI_POST 0xf8400400 +#define LDP 0xa9400000 +#define LDP_F64 0x6d400000 +#define LDP_POST 0xa8c00000 +#define LDR_PRE 0xf8400c00 +#define LDXR 0xc85f7c00 +#define LDXRB 0x085f7c00 +#define LDXRH 0x485f7c00 +#define LSLV 0x9ac02000 +#define LSRV 0x9ac02400 +#define MADD 0x9b000000 +#define MOVI 0x0f000400 +#define MOVK 0xf2800000 +#define MOVN 0x92800000 +#define MOVZ 0xd2800000 +#define NOP 0xd503201f +#define ORN 0xaa200000 +#define ORR 0xaa000000 +#define ORR_v 0x0ea01c00 +#define ORRI 0xb2000000 +#define RBIT 0xdac00000 +#define RET 0xd65f0000 +#define REV 0xdac00c00 +#define REV16 0xdac00400 +#define RORV 0x9ac02c00 +#define SBC 0xda000000 +#define SBFM 0x93400000 +#define SCVTF 0x9e620000 +#define SDIV 0x9ac00c00 +#define SMADDL 0x9b200000 +#define SMOV 0x0e002c00 +#define SMULH 0x9b403c00 +#define SSHLL 0x0f00a400 +#define ST1 0x0c007000 +#define ST1_s 0x0d000000 +#define STP 0xa9000000 +#define STP_F64 0x6d000000 +#define STP_PRE 0xa9800000 +#define STRB 0x38206800 +#define STRBI 0x39000000 +#define STRI 0xf9000000 +#define STRI_F64 0xfd000000 +#define STR_FI 0x3d000000 +#define STR_FR 0x3c206800 +#define STUR_FI 0x3c000000 +#define STURBI 0x38000000 +#define STXR 0xc8007c00 +#define STXRB 0x8007c00 +#define STXRH 0x48007c00 +#define SUB 0xcb000000 +#define SUBI 0xd1000000 +#define SUBS 0xeb000000 +#define TBZ 0x36000000 +#define UBFM 0xd3400000 +#define UCVTF 0x9e630000 +#define UDIV 0x9ac00800 +#define UMOV 0x0e003c00 +#define UMULH 0x9bc03c00 +#define USHLL 0x2f00a400 +#define USHR 0x2f000400 +#define USRA 0x2f001400 +#define XTN 0x0e212800 + +#define CSET (CSINC | RM(TMP_ZERO) | RN(TMP_ZERO)) +#define LDR (STRI | (1 << 22)) +#define LDRB (STRBI | (1 << 22)) +#define LDRH (LDRB | (1 << 30)) +#define MOV (ORR | RN(TMP_ZERO)) static sljit_s32 push_inst(struct sljit_compiler *compiler, sljit_ins ins) { @@ -3044,6 +3046,43 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c return SLJIT_SUCCESS; } +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_ins ins = 0; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + + if (reg_size != 3 && reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3)) + return SLJIT_ERR_UNSUPPORTED; + + switch (SLJIT_SIMD_GET_OPCODE(type)) { + case SLJIT_SIMD_OP2_AND: + ins = AND_v; + break; + case SLJIT_SIMD_OP2_OR: + ins = ORR_v; + break; + case SLJIT_SIMD_OP2_XOR: + ins = EOR_v; + break; + } + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + if (reg_size == 4) + ins |= (sljit_ins)1 << 30; + + return push_inst(compiler, ins | VD(dst_freg) | VN(src1_freg) | VM(src2_freg)); +} + SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst_reg, sljit_s32 mem_reg) diff --git a/src/sljit/sljitNativeARM_T2_32.c b/src/sljit/sljitNativeARM_T2_32.c index f914eb70..c27c50dd 100644 --- a/src/sljit/sljitNativeARM_T2_32.c +++ b/src/sljit/sljitNativeARM_T2_32.c @@ -41,26 +41,28 @@ typedef sljit_u32 sljit_ins; #define TMP_REG2 (SLJIT_NUMBER_OF_REGISTERS + 3) #define TMP_PC (SLJIT_NUMBER_OF_REGISTERS + 4) -#define TMP_FREG1 ((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 1) -#define TMP_FREG2 ((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 2) +#define TMP_FREG1 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1) +#define TMP_FREG2 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) /* See sljit_emit_enter and sljit_emit_op0 if you want to change them. */ static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = { 0, 0, 1, 2, 3, 11, 10, 9, 8, 7, 6, 5, 4, 13, 12, 14, 15 }; -static const sljit_u8 freg_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3] = { +static const sljit_u8 freg_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) << 1) + 1] = { 0, 0, 1, 2, 3, 4, 5, 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 0, 1, 2, 3, 4, 5, 15, 14, 13, 12, 11, 10, 9, 8, - 6, 7 + 7, 6 }; -static const sljit_u8 freg_ebit_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3] = { +static const sljit_u8 freg_ebit_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) << 1) + 1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0 + 1, 1 }; #define COPY_BITS(src, from, to, bits) \ @@ -217,6 +219,7 @@ static const sljit_u8 freg_ebit_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3] #define UXTH_W 0xfa1ff080 #define VABS_F32 0xeeb00ac0 #define VADD_F32 0xee300a00 +#define VAND 0xef000110 #define VCMP_F32 0xeeb40a40 #define VCVT_F32_S32 0xeeb80ac0 #define VCVT_F32_U32 0xeeb80a40 @@ -225,6 +228,7 @@ static const sljit_u8 freg_ebit_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3] #define VDIV_F32 0xee800a00 #define VDUP 0xee800b10 #define VDUP_s 0xffb00c00 +#define VEOR 0xff000110 #define VLD1 0xf9200000 #define VLD1_r 0xf9a00c00 #define VLD1_s 0xf9a00000 @@ -256,11 +260,12 @@ static sljit_s32 function_check_is_freg(struct sljit_compiler *compiler, sljit_s if (compiler->scratches == -1) return 0; - if (is_32 && fr >= (SLJIT_FS0 + SLJIT_FR0) && fr <= (SLJIT_FS0 + SLJIT_FS0)) - fr -= SLJIT_FS0; + if (is_32 && fr >= SLJIT_F64_SECOND(SLJIT_FR0)) + fr -= SLJIT_F64_SECOND(0); return (fr >= SLJIT_FR0 && fr < (SLJIT_FR0 + compiler->fscratches)) - || (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0); + || (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0) + || (fr >= SLJIT_TMP_FREGISTER_BASE && fr < (SLJIT_TMP_FREGISTER_BASE + SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS)); } #endif /* SLJIT_ARGUMENT_CHECKS */ @@ -3426,7 +3431,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); - sljit_s32 alignment = SLJIT_SIMD_GET_ALIGNMENT(type); + sljit_s32 alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type); sljit_ins ins; CHECK_ERROR(); @@ -3456,7 +3461,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co ins = VD4(freg) | VN4(srcdst) | VM4(srcdst); if (reg_size == 4) - ins |= (1 << 6); + ins |= (sljit_ins)1 << 6; return push_inst32(compiler, VORR | ins); } @@ -3633,7 +3638,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil ins = ((sljit_ins)freg_ebit_map[src] << (16 + 2 + 1)) | ((sljit_ins)1 << (16 + 2)); if (reg_size == 4) - ins |= 1 << 6; + ins |= (sljit_ins)1 << 6; return push_inst32(compiler, VDUP_s | ins | VD4(freg) | (sljit_ins)freg_map[src]); } @@ -3668,7 +3673,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil } if (reg_size == 4) - ins |= 1 << 21; + ins |= (sljit_ins)1 << 21; return push_inst32(compiler, VDUP | ins | VN4(freg) | RT4(src)); } @@ -3714,8 +3719,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile } if (srcdst == freg || (elem_size == 3 && srcdst == (freg + SLJIT_QUAD_OTHER_HALF(freg)))) { - FAIL_IF(push_inst32(compiler, VORR | ins | VD4(TMP_FREG1) | VN4(freg) | VM4(freg))); - srcdst = TMP_FREG1; + FAIL_IF(push_inst32(compiler, VORR | ins | VD4(TMP_FREG2) | VN4(freg) | VM4(freg))); + srcdst = TMP_FREG2; srcdstw = 0; } } @@ -3834,7 +3839,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c ins = ((((sljit_ins)src_lane_index << 1) | 1) << (16 + elem_size)); if (reg_size == 4) - ins |= 1 << 6; + ins |= (sljit_ins)1 << 6; return push_inst32(compiler, VDUP_s | ins | VD4(freg) | VM4(src)); } @@ -3876,7 +3881,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler src = simd_get_quad_reg_index(src); if (!(type & SLJIT_SIMD_FLOAT)) { - dst_reg = (reg_size == 4) ? freg : TMP_FREG1; + dst_reg = (reg_size == 4) ? freg : TMP_FREG2; do { FAIL_IF(push_inst32(compiler, VSHLL | ((type & SLJIT_SIMD_EXTEND_SIGNED) ? 0 : (1 << 28)) @@ -3884,8 +3889,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler src = dst_reg; } while (++elem_size < elem2_size); - if (dst_reg == TMP_FREG1) - return push_inst32(compiler, VORR | VD4(freg) | VN4(TMP_FREG1) | VM4(TMP_FREG1)); + if (dst_reg == TMP_FREG2) + return push_inst32(compiler, VORR | VD4(freg) | VN4(TMP_FREG2) | VM4(TMP_FREG2)); return SLJIT_SUCCESS; } @@ -3948,30 +3953,30 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c if (reg_size == 4) { freg = simd_get_quad_reg_index(freg); - ins |= (1 << 6); + ins |= (sljit_ins)1 << 6; } - SLJIT_ASSERT((freg_map[TMP_FREG1] & 0x1) == 0); - FAIL_IF(push_inst32(compiler, ins | VD4(TMP_FREG1) | VM4(freg))); + SLJIT_ASSERT((freg_map[TMP_FREG2] & 0x1) == 0); + FAIL_IF(push_inst32(compiler, ins | VD4(TMP_FREG2) | VM4(freg))); if (reg_size == 4 && elem_size > 0) - FAIL_IF(push_inst32(compiler, VMOVN | ((sljit_ins)(elem_size - 1) << 18) | VD4(TMP_FREG1) | VM4(TMP_FREG1))); + FAIL_IF(push_inst32(compiler, VMOVN | ((sljit_ins)(elem_size - 1) << 18) | VD4(TMP_FREG2) | VM4(TMP_FREG2))); ins = (reg_size == 4 && elem_size == 0) ? (1 << 6) : 0; while (imms >= 0x100) { - FAIL_IF(push_inst32(compiler, VSRA | (1 << 28) | ins | ((imms & 0xff) << 16) | VD4(TMP_FREG1) | VM4(TMP_FREG1))); + FAIL_IF(push_inst32(compiler, VSRA | (1 << 28) | ins | ((imms & 0xff) << 16) | VD4(TMP_FREG2) | VM4(TMP_FREG2))); imms >>= 8; } - FAIL_IF(push_inst32(compiler, VSRA | (1 << 28) | ins | (1 << 7) | (imms << 16) | VD4(TMP_FREG1) | VM4(TMP_FREG1))); + FAIL_IF(push_inst32(compiler, VSRA | (1 << 28) | ins | (1 << 7) | (imms << 16) | VD4(TMP_FREG2) | VM4(TMP_FREG2))); dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1; - FAIL_IF(push_inst32(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RT4(dst_r) | VN4(TMP_FREG1))); + FAIL_IF(push_inst32(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RT4(dst_r) | VN4(TMP_FREG2))); if (reg_size == 4 && elem_size == 0) { - SLJIT_ASSERT(freg_map[TMP_FREG1] + 1 == freg_map[TMP_FREG2]); - FAIL_IF(push_inst32(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RT4(TMP_REG2) | VN4(TMP_FREG2))); + SLJIT_ASSERT(freg_map[TMP_FREG2] + 1 == freg_map[TMP_FREG1]); + FAIL_IF(push_inst32(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RT4(TMP_REG2)| VN4(TMP_FREG1))); FAIL_IF(push_inst32(compiler, ORR_W | RD4(dst_r) | RN4(dst_r) | RM4(TMP_REG2) | (0x2 << 12))); } @@ -3981,6 +3986,47 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c return SLJIT_SUCCESS; } +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_ins ins = 0; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + + if (reg_size != 3 && reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3)) + return SLJIT_ERR_UNSUPPORTED; + + switch (SLJIT_SIMD_GET_OPCODE(type)) { + case SLJIT_SIMD_OP2_AND: + ins = VAND; + break; + case SLJIT_SIMD_OP2_OR: + ins = VORR; + break; + case SLJIT_SIMD_OP2_XOR: + ins = VEOR; + break; + } + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + if (reg_size == 4) { + dst_freg = simd_get_quad_reg_index(dst_freg); + src1_freg = simd_get_quad_reg_index(src1_freg); + src2_freg = simd_get_quad_reg_index(src2_freg); + ins |= (sljit_ins)1 << 6; + } + + return push_inst32(compiler, ins | VD4(dst_freg) | VN4(src1_freg) | VM4(src2_freg)); +} + #undef FPU_LOAD SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op, diff --git a/src/sljit/sljitNativeLOONGARCH_64.c b/src/sljit/sljitNativeLOONGARCH_64.c index fe6fad47..dbd76054 100644 --- a/src/sljit/sljitNativeLOONGARCH_64.c +++ b/src/sljit/sljitNativeLOONGARCH_64.c @@ -26,9 +26,7 @@ SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void) { -#if (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64) return "LOONGARCH" SLJIT_CPUINFO; -#endif /* SLJIT_CONFIG_LOONGARCH_64 */ } typedef sljit_u32 sljit_ins; @@ -61,7 +59,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = { /* LoongArch instructions are 32 bits wide, belonging to 9 basic instruction formats (and variants of them): -| Format name | Composition | +| Format name | Composition | | 2R | Opcode + Rj + Rd | | 3R | Opcode + Rk + Rj + Rd | | 4R | Opcode + Ra + Rk + Rj + Rd | @@ -338,10 +336,10 @@ lower parts in the instruction word, denoted by the “L” and “H” suffixes #define INST(inst, type) ((sljit_ins)((type & SLJIT_32) ? inst##_W : inst##_D)) /* LoongArch CPUCFG register for feature detection */ -#define LOONGARCH_CFG2 0x02 +#define LOONGARCH_CFG2 0x02 #define LOONGARCH_FEATURE_LAMCAS (1 << 28) -sljit_u32 cpu_feature_list = 0; +static sljit_u32 cpu_feature_list = 0; static SLJIT_INLINE sljit_u32 get_cpu_features(void) { @@ -1610,9 +1608,10 @@ static sljit_s32 emit_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s3 compiler->cache_argw = 0; } - if (dst == TMP_REG2) { + if (dst == 0) { SLJIT_ASSERT(HAS_FLAGS(op)); flags |= UNUSED_DEST; + dst = TMP_REG2; } else if (FAST_IS_REG(dst)) { dst_r = dst; @@ -1891,7 +1890,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compil CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w)); SLJIT_SKIP_CHECKS(compiler); - return sljit_emit_op2(compiler, op, TMP_REG2, 0, src1, src1w, src2, src2w); + return sljit_emit_op2(compiler, op, 0, 0, src1, src1w, src2, src2w); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op, @@ -2392,14 +2391,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compil } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2r(struct sljit_compiler *compiler, sljit_s32 op, - sljit_s32 dst, + sljit_s32 dst_freg, sljit_s32 src1, sljit_sw src1w, sljit_s32 src2, sljit_sw src2w) { sljit_s32 reg; CHECK_ERROR(); - CHECK(check_sljit_emit_fop2r(compiler, op, dst, src1, src1w, src2, src2w)); + CHECK(check_sljit_emit_fop2r(compiler, op, dst_freg, src1, src1w, src2, src2w)); ADJUST_LOCAL_OFFSET(src1, src1w); ADJUST_LOCAL_OFFSET(src2, src2w); @@ -2409,12 +2408,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2r(struct sljit_compiler *compi } if (src1 & SLJIT_MEM) { - reg = (dst == src2) ? TMP_FREG1 : dst; + reg = (dst_freg == src2) ? TMP_FREG1 : dst_freg; FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, reg, src1, src1w, 0, 0)); src1 = reg; } - return push_inst(compiler, FINST(FCOPYSIGN, op) | FRD(dst) | FRJ(src1) | FRK(src2)); + return push_inst(compiler, FINST(FCOPYSIGN, op) | FRD(dst_freg) | FRJ(src1) | FRK(src2)); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fset32(struct sljit_compiler *compiler, diff --git a/src/sljit/sljitNativeMIPS_common.c b/src/sljit/sljitNativeMIPS_common.c index d80a75c2..807b3474 100644 --- a/src/sljit/sljitNativeMIPS_common.c +++ b/src/sljit/sljitNativeMIPS_common.c @@ -94,29 +94,26 @@ typedef sljit_u32 sljit_ins; #define EQUAL_FLAG 3 #define OTHER_FLAG 1 -static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = { - 0, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 23, 22, 21, 20, 19, 18, 17, 16, 29, 4, 25, 31 +static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 7] = { + 0, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 23, 22, 21, 20, 19, 18, 17, 16, 29, 4, 25, 31, 3, 1 }; -#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32) +#define TMP_FREG1 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1) +#define TMP_FREG2 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) +#define TMP_FREG3 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3) -#define TMP_FREG1 ((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 1) -#define TMP_FREG2 ((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 2) -#define TMP_FREG3 ((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3) +#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32) -static const sljit_u8 freg_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 4] = { +static const sljit_u8 freg_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3) << 1) + 1] = { 0, 0, 14, 2, 4, 6, 8, 18, 30, 28, 26, 24, 22, 20, - 1, 15, 3, 5, 7, 9, 19, 31, 29, 27, 25, 23, 21, 12, 10, 16, + 1, 15, 3, 5, 7, 9, 19, 31, 29, 27, 25, 23, 21, + 13, 11, 17 }; #else /* !SLJIT_CONFIG_MIPS_32 */ -#define TMP_FREG1 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1) -#define TMP_FREG2 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) -#define TMP_FREG3 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3) - static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 4] = { 0, 0, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 1, 2, 3, 4, 5, 6, 7, 8, 9, 31, 30, 29, 28, 27, 26, 25, 24, 12, 11, 10 }; @@ -381,11 +378,12 @@ static sljit_s32 function_check_is_freg(struct sljit_compiler *compiler, sljit_s if (compiler->scratches == -1) return 0; - if (is_32 && fr >= (SLJIT_FS0 + SLJIT_FR0) && fr <= (SLJIT_FS0 + SLJIT_FS0)) - fr -= SLJIT_FS0; + if (is_32 && fr >= SLJIT_F64_SECOND(SLJIT_FR0)) + fr -= SLJIT_F64_SECOND(0); return (fr >= SLJIT_FR0 && fr < (SLJIT_FR0 + compiler->fscratches)) - || (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0); + || (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0) + || (fr >= SLJIT_TMP_FREGISTER_BASE && fr < (SLJIT_TMP_FREGISTER_BASE + SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS)); } #endif /* SLJIT_CONFIG_MIPS_32 && SLJIT_ARGUMENT_CHECKS */ @@ -2292,9 +2290,10 @@ static sljit_s32 emit_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s3 compiler->cache_argw = 0; } - if (dst == TMP_REG2) { + if (dst == 0) { SLJIT_ASSERT(HAS_FLAGS(op)); flags |= UNUSED_DEST; + dst = TMP_REG2; } else if (FAST_IS_REG(dst)) { dst_r = dst; @@ -2655,7 +2654,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compil CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w)); SLJIT_SKIP_CHECKS(compiler); - return sljit_emit_op2(compiler, op, TMP_REG2, 0, src1, src1w, src2, src2w); + return sljit_emit_op2(compiler, op, 0, 0, src1, src1w, src2, src2w); } #if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64) diff --git a/src/sljit/sljitNativeRISCV_32.c b/src/sljit/sljitNativeRISCV_32.c index 68b2a349..396c956c 100644 --- a/src/sljit/sljitNativeRISCV_32.c +++ b/src/sljit/sljitNativeRISCV_32.c @@ -27,7 +27,6 @@ static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst_r, sljit_sw imm, sljit_s32 tmp_r) { SLJIT_UNUSED_ARG(tmp_r); - SLJIT_ASSERT(dst_r != tmp_r); if (imm <= SIMM_MAX && imm >= SIMM_MIN) return push_inst(compiler, ADDI | RD(dst_r) | RS1(TMP_ZERO) | IMM_I(imm)); diff --git a/src/sljit/sljitNativeRISCV_64.c b/src/sljit/sljitNativeRISCV_64.c index 18c2d59f..7fcf2c52 100644 --- a/src/sljit/sljitNativeRISCV_64.c +++ b/src/sljit/sljitNativeRISCV_64.c @@ -28,8 +28,6 @@ static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst_r { sljit_sw high; - SLJIT_ASSERT(dst_r != tmp_r); - if (imm <= SIMM_MAX && imm >= SIMM_MIN) return push_inst(compiler, ADDI | RD(dst_r) | RS1(TMP_ZERO) | IMM_I(imm)); @@ -81,6 +79,8 @@ static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst_r return SLJIT_SUCCESS; } + SLJIT_ASSERT(dst_r != tmp_r); + high = imm >> 32; imm = (sljit_s32)imm; diff --git a/src/sljit/sljitNativeRISCV_common.c b/src/sljit/sljitNativeRISCV_common.c index 3b54ab92..64bd411d 100644 --- a/src/sljit/sljitNativeRISCV_common.c +++ b/src/sljit/sljitNativeRISCV_common.c @@ -348,13 +348,12 @@ static SLJIT_INLINE void load_addr_to_reg(void *dst, sljit_u32 reg) if ((addr & 0x80000000l) != 0) high = ~high; - if ((high & 0x800) != 0) - high += 0x1000; - if (flags & PATCH_ABS52) { SLJIT_ASSERT(addr <= S52_MAX); inst[0] = LUI | RD(TMP_REG3) | (sljit_ins)(high << 12); } else { + if ((high & 0x800) != 0) + high += 0x1000; inst[0] = LUI | RD(TMP_REG3) | (sljit_ins)(high & ~0xfff); inst[1] = ADDI | RD(TMP_REG3) | RS1(TMP_REG3) | IMM_I(high); inst++; @@ -940,7 +939,7 @@ static sljit_s32 getput_arg(struct sljit_compiler *compiler, sljit_s32 flags, sl /* Since tmp can be the same as base or offset registers, * these might be unavailable after modifying tmp. */ - if ((flags & MEM_MASK) <= GPR_REG && (flags & LOAD_DATA)) + if ((flags & MEM_MASK) <= GPR_REG && (flags & LOAD_DATA) && reg == TMP_REG2) tmp_r = reg; if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) { @@ -1639,9 +1638,10 @@ static sljit_s32 emit_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s3 compiler->cache_argw = 0; } - if (dst == TMP_REG2) { + if (dst == 0) { SLJIT_ASSERT(HAS_FLAGS(op)); flags |= UNUSED_DEST; + dst = TMP_REG2; } else if (FAST_IS_REG(dst)) { dst_r = dst; @@ -1938,7 +1938,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compil CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w)); SLJIT_SKIP_CHECKS(compiler); - return sljit_emit_op2(compiler, op, TMP_REG2, 0, src1, src1w, src2, src2w); + return sljit_emit_op2(compiler, op, 0, 0, src1, src1w, src2, src2w); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op, diff --git a/src/sljit/sljitNativeS390X.c b/src/sljit/sljitNativeS390X.c index 97521b50..67516f9b 100644 --- a/src/sljit/sljitNativeS390X.c +++ b/src/sljit/sljitNativeS390X.c @@ -47,8 +47,8 @@ static const sljit_ins sljit_ins_const = (sljit_ins)1 << 48; #define TMP_REG1 (SLJIT_NUMBER_OF_REGISTERS + 2) #define TMP_REG2 (SLJIT_NUMBER_OF_REGISTERS + 3) -static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = { - 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0, 1 +static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = { + 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0, 1, 14 }; /* there are also a[2-15] available, but they are slower to access and @@ -83,7 +83,7 @@ static const sljit_gpr r10 = 10; /* reg_map[9] */ static const sljit_gpr r11 = 11; /* reg_map[10] */ static const sljit_gpr r12 = 12; /* reg_map[11]: GOT */ static const sljit_gpr r13 = 13; /* reg_map[12]: Literal Pool pointer */ -static const sljit_gpr r14 = 14; /* reg_map[0]: return address and flag register */ +static const sljit_gpr r14 = 14; /* reg_map[0]: return address */ static const sljit_gpr r15 = 15; /* reg_map[SLJIT_NUMBER_OF_REGISTERS + 1]: stack pointer */ /* WARNING: r12 and r13 shouldn't be used as per ABI recommendation */ @@ -96,20 +96,16 @@ static const sljit_gpr r15 = 15; /* reg_map[SLJIT_NUMBER_OF_REGISTERS + 1]: stac #define tmp0 r0 #define tmp1 r1 -/* TODO(carenas): flags should move to a different register so that - * link register doesn't need to change - */ - /* When reg cannot be unused. */ #define IS_GPR_REG(reg) ((reg > 0) && (reg) <= SLJIT_SP) /* Link register. */ static const sljit_gpr link_r = 14; /* r14 */ -#define TMP_FREG1 (0) +#define TMP_FREG1 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1) -static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = { - 1, 0, 2, 4, 6, 3, 5, 7, 15, 14, 13, 12, 11, 10, 9, 8, +static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = { + 0, 0, 2, 4, 6, 3, 5, 7, 15, 14, 13, 12, 11, 10, 9, 8, 1 }; #define R0A(r) (r) @@ -128,6 +124,8 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = { #define F4(r) (R4A((sljit_ins)freg_map[r])) #define F12(r) (R12A((sljit_ins)freg_map[r])) #define F20(r) (R20A((sljit_ins)freg_map[r])) +#define F28(r) (R28A((sljit_ins)freg_map[r])) +#define F32(r) (R32A((sljit_ins)freg_map[r])) #define F36(r) (R36A((sljit_ins)freg_map[r])) struct sljit_s390x_const { @@ -898,23 +896,17 @@ static sljit_s32 push_load_imm_inst(struct sljit_compiler *compiler, sljit_gpr t if (((sljit_uw)v & ~(sljit_uw)0xffff000000000000) == 0) return push_inst(compiler, llihh(target, (sljit_u16)(v >> 48))); - /* 6 byte instructions (requires extended immediate facility) */ - if (have_eimm()) { - if (is_s32(v)) - return push_inst(compiler, lgfi(target, (sljit_s32)v)); + if (is_s32(v)) + return push_inst(compiler, lgfi(target, (sljit_s32)v)); - if (((sljit_uw)v >> 32) == 0) - return push_inst(compiler, llilf(target, (sljit_u32)v)); + if (((sljit_uw)v >> 32) == 0) + return push_inst(compiler, llilf(target, (sljit_u32)v)); - if (((sljit_uw)v << 32) == 0) - return push_inst(compiler, llihf(target, (sljit_u32)((sljit_uw)v >> 32))); - - FAIL_IF(push_inst(compiler, llilf(target, (sljit_u32)v))); - return push_inst(compiler, iihf(target, (sljit_u32)(v >> 32))); - } + if (((sljit_uw)v << 32) == 0) + return push_inst(compiler, llihf(target, (sljit_u32)((sljit_uw)v >> 32))); - /* TODO(mundaym): instruction sequences that don't use extended immediates */ - abort(); + FAIL_IF(push_inst(compiler, llilf(target, (sljit_u32)v))); + return push_inst(compiler, iihf(target, (sljit_u32)(v >> 32))); } struct addr { @@ -1677,6 +1669,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type) case SLJIT_HAS_PREFETCH: case SLJIT_HAS_COPY_F32: case SLJIT_HAS_COPY_F64: + case SLJIT_HAS_SIMD: case SLJIT_HAS_ATOMIC: return 1; @@ -3955,6 +3948,446 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile return push_inst(compiler, ins | R36A(reg2) | disp_s20((sljit_s32)memw + SSIZE_OF(sw))); } +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 freg, + sljit_s32 srcdst, sljit_sw srcdstw) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_s32 alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type); + struct addr addr; + sljit_ins ins; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_mov(compiler, type, freg, srcdst, srcdstw)); + + ADJUST_LOCAL_OFFSET(srcdst, srcdstw); + + if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3)) + return SLJIT_ERR_UNSUPPORTED; + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + if (!(srcdst & SLJIT_MEM)) { + if (type & SLJIT_SIMD_STORE) + ins = F36(srcdst) | F32(freg); + else + ins = F36(freg) | F32(srcdst); + + return push_inst(compiler, 0xe70000000056 /* vlr */ | ins); + } + + FAIL_IF(make_addr_bx(compiler, &addr, srcdst, srcdstw, tmp1)); + ins = F36(freg) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset); + + if (alignment >= 4) + ins |= 4 << 12; + else if (alignment == 3) + ins |= 3 << 12; + + return push_inst(compiler, ((type & SLJIT_SIMD_STORE) ? 0xe7000000000e /* vst */ : 0xe70000000006 /* vl */) | ins); +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 freg, + sljit_s32 src, sljit_sw srcw) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + struct addr addr; + sljit_gpr reg; + sljit_sw sign_ext; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_replicate(compiler, type, freg, src, srcw)); + + ADJUST_LOCAL_OFFSET(src, srcw); + + if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + if ((type & SLJIT_SIMD_FLOAT) && elem_size < 2) + return SLJIT_ERR_UNSUPPORTED; + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + if (src & SLJIT_MEM) { + FAIL_IF(make_addr_bx(compiler, &addr, src, srcw, tmp1)); + return push_inst(compiler, 0xe70000000005 /* vlrep */ | F36(freg) + | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset) | ((sljit_ins)elem_size << 12)); + } + + if (type & SLJIT_SIMD_FLOAT) { + if (src == SLJIT_IMM) + return push_inst(compiler, 0xe70000000044 /* vgbm */ | F36(freg)); + + return push_inst(compiler, 0xe7000000004d /* vrep */ | F36(freg) | F32(src) | ((sljit_ins)elem_size << 12)); + } + + if (src == SLJIT_IMM) { + sign_ext = 0x10000; + + switch (elem_size) { + case 0: + srcw &= 0xff; + sign_ext = (sljit_s8)srcw; + break; + case 1: + srcw &= 0xffff; + sign_ext = (sljit_s16)srcw; + break; + case 2: + if ((sljit_s32)srcw == (sljit_s16)srcw) { + srcw &= 0xffff; + sign_ext = (sljit_s16)srcw; + } else + srcw &= 0xffffffff; + break; + default: + if (srcw == (sljit_s16)srcw) { + srcw &= 0xffff; + sign_ext = (sljit_s16)srcw; + } + break; + } + + if (sign_ext != 0x10000) { + if (sign_ext == 0 || sign_ext == -1) + return push_inst(compiler, 0xe70000000044 /* vgbm */ | F36(freg) + | (sign_ext == 0 ? 0 : ((sljit_ins)0xffff << 16))); + + return push_inst(compiler, 0xe70000000045 /* vrepi */ | F36(freg) + | ((sljit_ins)srcw << 16) | ((sljit_ins)elem_size << 12)); + } + + push_load_imm_inst(compiler, tmp0, srcw); + reg = tmp0; + } else + reg = gpr(src); + + FAIL_IF(push_inst(compiler, 0xe70000000022 /* vlvg */ | F36(freg) | R32A(reg) | ((sljit_ins)elem_size << 12))); + return push_inst(compiler, 0xe7000000004d /* vrep */ | F36(freg) | F32(freg) | ((sljit_ins)elem_size << 12)); +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 freg, sljit_s32 lane_index, + sljit_s32 srcdst, sljit_sw srcdstw) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + struct addr addr; + sljit_gpr reg; + sljit_ins ins = 0; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_lane_mov(compiler, type, freg, lane_index, srcdst, srcdstw)); + + ADJUST_LOCAL_OFFSET(srcdst, srcdstw); + + if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + if ((type & SLJIT_SIMD_FLOAT) && elem_size < 2) + return SLJIT_ERR_UNSUPPORTED; + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + if (srcdst & SLJIT_MEM) { + FAIL_IF(make_addr_bx(compiler, &addr, srcdst, srcdstw, tmp1)); + ins = F36(freg) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset); + } + + if (type & SLJIT_SIMD_LANE_ZERO) { + if ((srcdst & SLJIT_MEM) && lane_index == ((1 << (3 - elem_size)) - 1)) + return push_inst(compiler, 0xe70000000004 /* vllez */ | ins | ((sljit_ins)elem_size << 12)); + + if ((type & SLJIT_SIMD_FLOAT) && freg == srcdst) { + FAIL_IF(push_inst(compiler, 0xe70000000056 /* vlr */ | F36(TMP_FREG1) | F32(freg))); + srcdst = TMP_FREG1; + srcdstw = 0; + } + + FAIL_IF(push_inst(compiler, 0xe70000000044 /* vgbm */ | F36(freg))); + } + + if (srcdst & SLJIT_MEM) { + switch (elem_size) { + case 0: + ins |= 0xe70000000000 /* vleb */; + break; + case 1: + ins |= 0xe70000000001 /* vleh */; + break; + case 2: + ins |= 0xe70000000003 /* vlef */; + break; + default: + ins |= 0xe70000000002 /* vleg */; + break; + } + + /* Convert to vsteb - vsteg */ + if (type & SLJIT_SIMD_STORE) + ins |= 0x8; + + return push_inst(compiler, ins | ((sljit_ins)lane_index << 12)); + } + + if (type & SLJIT_SIMD_FLOAT) { + if (type & SLJIT_SIMD_STORE) + return push_inst(compiler, 0xe7000000004d /* vrep */ | F36(srcdst) | F32(freg) | ((sljit_ins)lane_index << 16) | ((sljit_ins)elem_size << 12)); + + if (elem_size == 3) { + if (lane_index == 0) + ins = F32(srcdst) | F28(freg) | (1 << 12); + else + ins = F32(freg) | F28(srcdst); + + return push_inst(compiler, 0xe70000000084 /* vpdi */ | F36(freg) | ins); + } + + FAIL_IF(push_inst(compiler, 0xe70000000021 /* vlgv */ | R36A(tmp0) | F32(srcdst) | ((sljit_ins)2 << 12))); + return push_inst(compiler, 0xe70000000022 /* vlvg */ | F36(freg) | R32A(tmp0) | ((sljit_ins)lane_index << 16) | ((sljit_ins)2 << 12)); + } + + if (srcdst == SLJIT_IMM) { + switch (elem_size) { + case 0: + ins = 0xe70000000040 /* vleib */; + srcdstw &= 0xff; + break; + case 1: + ins = 0xe70000000041 /* vleih */; + srcdstw &= 0xffff; + break; + case 2: + if ((sljit_s32)srcdstw == (sljit_s16)srcdstw) { + srcdstw &= 0xffff; + ins = 0xe70000000043 /* vleif */; + } else + srcdstw &= 0xffffffff; + break; + default: + if (srcdstw == (sljit_s16)srcdstw) { + srcdstw &= 0xffff; + ins = 0xe70000000042 /* vleig */; + } + break; + } + + if (ins != 0) + return push_inst(compiler, ins | F36(freg) | ((sljit_ins)srcdstw << 16) | ((sljit_ins)lane_index << 12)); + + push_load_imm_inst(compiler, tmp0, srcdstw); + reg = tmp0; + } else + reg = gpr(srcdst); + + ins = ((sljit_ins)lane_index << 16) | ((sljit_ins)elem_size << 12); + + if (!(type & SLJIT_SIMD_STORE)) + return push_inst(compiler, 0xe70000000022 /* vlvg */ | F36(freg) | R32A(reg) | ins); + + FAIL_IF(push_inst(compiler, 0xe70000000021 /* vlgv */ | R36A(reg) | F32(freg) | ins)); + + if (!(type & SLJIT_SIMD_LANE_SIGNED) || elem_size >= 3) + return SLJIT_SUCCESS; + + switch (elem_size) { + case 0: + ins = 0xb9060000 /* lgbr */; + break; + case 1: + ins = 0xb9070000 /* lghr */; + break; + default: + ins = 0xb9140000 /* lgfr */; + break; + } + + return push_inst(compiler, ins | R4A(reg) | R0A(reg)); +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 freg, + sljit_s32 src, sljit_s32 src_lane_index) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, freg, src, src_lane_index)); + + if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + if ((type & SLJIT_SIMD_FLOAT) && elem_size < 2) + return SLJIT_ERR_UNSUPPORTED; + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + return push_inst(compiler, 0xe7000000004d /* vrep */ | F36(freg) | F32(src) + | ((sljit_ins)src_lane_index << 16) | ((sljit_ins)elem_size << 12)); +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 freg, + sljit_s32 src, sljit_sw srcw) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_s32 elem2_size = SLJIT_SIMD_GET_ELEM2_SIZE(type); + struct addr addr; + sljit_ins ins; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw)); + + ADJUST_LOCAL_OFFSET(src, srcw); + + if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + if ((type & SLJIT_SIMD_FLOAT) && elem_size < 2) + return SLJIT_ERR_UNSUPPORTED; + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + if (src & SLJIT_MEM) { + FAIL_IF(make_addr_bx(compiler, &addr, src, srcw, tmp1)); + ins = F36(freg) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset); + + switch (elem2_size - elem_size) { + case 1: + ins |= 0xe70000000002 /* vleg */; + break; + case 2: + ins |= 0xe70000000003 /* vlef */; + break; + default: + ins |= 0xe70000000001 /* vleh */; + break; + } + + FAIL_IF(push_inst(compiler, ins)); + src = freg; + } + + if (type & SLJIT_SIMD_FLOAT) { + FAIL_IF(push_inst(compiler, 0xe700000000d5 /* vuplh */ | F36(freg) | F32(src) | (2 << 12))); + FAIL_IF(push_inst(compiler, 0xe70000000030 /* vesl */ | F36(freg) | F32(freg) | (32 << 16) | (3 << 12))); + return push_inst(compiler, 0xe700000000c4 /* vfll */ | F36(freg) | F32(freg) | (2 << 12)); + } + + ins = ((type & SLJIT_SIMD_EXTEND_SIGNED) ? 0xe700000000d7 /* vuph */ : 0xe700000000d5 /* vuplh */) | F36(freg); + + do { + FAIL_IF(push_inst(compiler, ins | F32(src) | ((sljit_ins)elem_size << 12))); + src = freg; + } while (++elem_size < elem2_size); + + return SLJIT_SUCCESS; +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 freg, + sljit_s32 dst, sljit_sw dstw) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_gpr dst_r; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw)); + + ADJUST_LOCAL_OFFSET(dst, dstw); + + if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + if ((type & SLJIT_SIMD_FLOAT) && elem_size < 2) + return SLJIT_ERR_UNSUPPORTED; + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + switch (elem_size) { + case 0: + push_load_imm_inst(compiler, tmp0, (sljit_sw)0x4048505860687078); + push_load_imm_inst(compiler, tmp1, (sljit_sw)0x0008101820283038); + FAIL_IF(push_inst(compiler, 0xe70000000062 /* vlvgp */ | F36(TMP_FREG1) | R32A(tmp1) | R28A(tmp0))); + break; + case 1: + push_load_imm_inst(compiler, tmp0, (sljit_sw)0x0010203040506070); + break; + case 2: + push_load_imm_inst(compiler, tmp0, (sljit_sw)0x8080808000204060); + break; + default: + push_load_imm_inst(compiler, tmp0, (sljit_sw)0x8080808080800040); + break; + } + + if (elem_size != 0) + FAIL_IF(push_inst(compiler, 0xe70000000022 /* vlvg */ | F36(TMP_FREG1) | R32A(tmp0) | (1 << 16) | (3 << 12))); + + FAIL_IF(push_inst(compiler, 0xe70000000085 /* vbperm */ | F36(TMP_FREG1) | F32(freg) | F28(TMP_FREG1))); + + dst_r = FAST_IS_REG(dst) ? gpr(dst) : tmp0; + FAIL_IF(push_inst(compiler, 0xe70000000021 /* vlgv */ | R36A(dst_r) | F32(TMP_FREG1) + | (elem_size == 0 ? ((3 << 16) | (1 << 12)) : (7 << 16)))); + + if (dst_r == tmp0) + return store_word(compiler, tmp0, dst, dstw, type & SLJIT_32); + + return SLJIT_SUCCESS; +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_ins ins = 0; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + + if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3)) + return SLJIT_ERR_UNSUPPORTED; + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + switch (SLJIT_SIMD_GET_OPCODE(type)) { + case SLJIT_SIMD_OP2_AND: + ins = 0xe70000000068 /* vn */; + break; + case SLJIT_SIMD_OP2_OR: + ins = 0xe7000000006a /* vo */; + break; + case SLJIT_SIMD_OP2_XOR: + ins = 0xe7000000006d /* vx */; + break; + } + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + return push_inst(compiler, ins | F36(dst_freg) | F32(src1_freg) | F28(src2_freg)); +} + SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst_reg, sljit_s32 mem_reg) diff --git a/src/sljit/sljitNativeX86_32.c b/src/sljit/sljitNativeX86_32.c index 1bba883c..ba4a1ebb 100644 --- a/src/sljit/sljitNativeX86_32.c +++ b/src/sljit/sljitNativeX86_32.c @@ -148,7 +148,7 @@ static sljit_u8* emit_x86_instruction(struct sljit_compiler *compiler, sljit_uw else if (!(flags & EX86_SSE2_OP1)) *buf_ptr = U8(reg_map[a] << 3); else - *buf_ptr = U8(a << 3); + *buf_ptr = U8(freg_map[a] << 3); } else { if (a == SLJIT_IMM) { if (imma == 1) @@ -161,7 +161,7 @@ static sljit_u8* emit_x86_instruction(struct sljit_compiler *compiler, sljit_uw } if (!(b & SLJIT_MEM)) { - *buf_ptr = U8(*buf_ptr | MOD_REG | (!(flags & EX86_SSE2_OP2) ? reg_map[b] : b)); + *buf_ptr = U8(*buf_ptr | MOD_REG | (!(flags & EX86_SSE2_OP2) ? reg_map[b] : freg_map[b])); buf_ptr++; } else if (b & REG_MASK) { reg_map_b = reg_map[b & REG_MASK]; @@ -257,7 +257,7 @@ static sljit_s32 emit_vex_instruction(struct sljit_compiler *compiler, sljit_uw if (op & VEX_256) vex |= 0x4; - vex = U8(vex | ((((op & VEX_SSE2_OPV) ? v : reg_map[v]) ^ 0xf) << 3)); + vex = U8(vex | ((((op & VEX_SSE2_OPV) ? freg_map[v] : reg_map[v]) ^ 0xf) << 3)); size = op & ~(sljit_uw)0xff; size |= (vex_m == 0) ? 3 : 4; @@ -1351,7 +1351,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_uw(struct sljit_comp FAIL_IF(!inst); inst[1] |= SHR; - FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_PREF_F2 | EX86_SSE2_OP1, dst_r, TMP_REG1, 0)); + FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_PREF_F2 | EX86_SSE2_OP1, dst_r, TMP_REG1, 0)); inst = (sljit_u8*)ensure_buf(compiler, 1 + 2); FAIL_IF(!inst); @@ -1359,7 +1359,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_uw(struct sljit_comp inst[0] = U8(get_jump_code(SLJIT_NOT_CARRY) - 0x10); size1 = compiler->size; - FAIL_IF(emit_groupf(compiler, ADDSD_x_xm, EX86_PREF_F2 | EX86_SSE2, dst_r, SLJIT_MEM0(), (sljit_sw)&f64_high_bit)); + FAIL_IF(emit_groupf(compiler, ADDSD_x_xm | EX86_PREF_F2 | EX86_SSE2, dst_r, SLJIT_MEM0(), (sljit_sw)&f64_high_bit)); inst[1] = U8(compiler->size - size1); @@ -1383,7 +1383,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_uw(struct sljit_comp size1 = compiler->size; - FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, 0)); + FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, 0)); inst = (sljit_u8*)ensure_buf(compiler, 1 + 2); FAIL_IF(!inst); @@ -1413,8 +1413,8 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_uw(struct sljit_comp BINARY_IMM32(OR, 1, TMP_REG1, 0); jump_inst1[1] = U8(compiler->size - size1); - FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, TMP_REG1, 0)); - FAIL_IF(emit_groupf(compiler, ADDSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, dst_r, 0)); + FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, TMP_REG1, 0)); + FAIL_IF(emit_groupf(compiler, ADDSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, dst_r, 0)); jump_inst2[1] = U8(compiler->size - size2); @@ -1475,13 +1475,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fset64(struct sljit_compiler *comp if (u.imm[0] == 0) { if (u.imm[1] == 0) - return emit_groupf(compiler, PXOR_x_xm, EX86_PREF_66 | EX86_SSE2, freg, freg, 0); + return emit_groupf(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2, freg, freg, 0); EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, u.imm[1]); } else EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, u.imm[0]); - FAIL_IF(emit_groupf(compiler, MOVD_x_rm, EX86_PREF_66 | EX86_SSE2_OP1, freg, TMP_REG1, 0)); + FAIL_IF(emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, freg, TMP_REG1, 0)); if (u.imm[1] == 0) return SLJIT_SUCCESS; @@ -1504,11 +1504,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fset64(struct sljit_compiler *comp EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, u.imm[1]); if (cpu_feature_list & CPU_FEATURE_SSE41) { - FAIL_IF(emit_groupf_ext(compiler, PINSRD_x_rm_i8, EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2_OP1, freg, TMP_REG1, 0)); + FAIL_IF(emit_groupf_ext(compiler, PINSRD_x_rm_i8 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2_OP1, freg, TMP_REG1, 0)); return emit_byte(compiler, 1); } - FAIL_IF(emit_groupf(compiler, MOVD_x_rm, EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, TMP_REG1, 0)); + FAIL_IF(emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, TMP_REG1, 0)); tmp_freg = TMP_FREG; } @@ -1545,15 +1545,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compi CHECK_EXTRA_REGS(reg, regw, (void)0); - FAIL_IF(emit_groupf(compiler, GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x, - EX86_PREF_66 | EX86_SSE2_OP1, freg, reg, regw)); + FAIL_IF(emit_groupf(compiler, (GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x) + | EX86_PREF_66 | EX86_SSE2_OP1, freg, reg, regw)); } else reg2 = reg; CHECK_EXTRA_REGS(reg2, reg2w, (void)0); - FAIL_IF(emit_groupf_ext(compiler, GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? PINSRD_x_rm_i8 : PEXTRD_rm_x_i8, - EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2_OP1, freg, reg2, reg2w)); + FAIL_IF(emit_groupf_ext(compiler, (GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? PINSRD_x_rm_i8 : PEXTRD_rm_x_i8) + | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2_OP1, freg, reg2, reg2w)); return emit_byte(compiler, 1); } @@ -1570,8 +1570,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compi CHECK_EXTRA_REGS(reg, regw, (void)0); if (op & SLJIT_32) - return emit_groupf(compiler, GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x, - EX86_PREF_66 | EX86_SSE2_OP1, freg, reg, regw); + return emit_groupf(compiler, (GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x) + | EX86_PREF_66 | EX86_SSE2_OP1, freg, reg, regw); if (op == SLJIT_COPY_FROM_F64) { inst = (sljit_u8*)ensure_buf(compiler, 1 + 5); @@ -1584,11 +1584,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compi inst[3] = U8(MOD_REG | (TMP_FREG << 3) | freg); inst[4] = 1; } else if (reg != 0) - FAIL_IF(emit_groupf(compiler, MOVD_x_rm, EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, reg, regw)); + FAIL_IF(emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, reg, regw)); if (reg2 != 0) - FAIL_IF(emit_groupf(compiler, GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x, - EX86_PREF_66 | EX86_SSE2_OP1, freg, reg2, reg2w)); + FAIL_IF(emit_groupf(compiler, (GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x) + | EX86_PREF_66 | EX86_SSE2_OP1, freg, reg2, reg2w)); if (GET_OPCODE(op) == SLJIT_COPY_TO_F64) { inst = (sljit_u8*)ensure_buf(compiler, 1 + 3); @@ -1599,7 +1599,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compi inst[1] = UNPCKLPS_x_xm; inst[2] = U8(MOD_REG | (freg << 3) | (reg == 0 ? freg : TMP_FREG)); } else - FAIL_IF(emit_groupf(compiler, MOVD_rm_x, EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, reg, regw)); + FAIL_IF(emit_groupf(compiler, MOVD_rm_x | EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, reg, regw)); return SLJIT_SUCCESS; } diff --git a/src/sljit/sljitNativeX86_64.c b/src/sljit/sljitNativeX86_64.c index 39114c22..f313f3f0 100644 --- a/src/sljit/sljitNativeX86_64.c +++ b/src/sljit/sljitNativeX86_64.c @@ -611,12 +611,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi tmp = SLJIT_FS0 - fsaveds; for (i = SLJIT_FS0; i > tmp; i--) { - FAIL_IF(emit_groupf(compiler, MOVAPS_xm_x, EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset)); + FAIL_IF(emit_groupf(compiler, MOVAPS_xm_x | EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset)); saved_float_regs_offset += 16; } for (i = fscratches; i >= SLJIT_FIRST_SAVED_FLOAT_REG; i--) { - FAIL_IF(emit_groupf(compiler, MOVAPS_xm_x, EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset)); + FAIL_IF(emit_groupf(compiler, MOVAPS_xm_x | EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset)); saved_float_regs_offset += 16; } } @@ -674,12 +674,12 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit tmp = SLJIT_FS0 - fsaveds; for (i = SLJIT_FS0; i > tmp; i--) { - FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm, EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset)); + FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm | EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset)); saved_float_regs_offset += 16; } for (i = fscratches; i >= SLJIT_FIRST_SAVED_FLOAT_REG; i--) { - FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm, EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset)); + FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm | EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset)); saved_float_regs_offset += 16; } @@ -1155,7 +1155,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_uw(struct sljit_comp } else FAIL_IF(emit_do_imm32(compiler, reg_map[TMP_REG1] <= 7 ? 0 : REX_B, U8(MOV_r_i32 | reg_lmap[TMP_REG1]), srcw)); - FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, TMP_REG1, 0)); + FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, TMP_REG1, 0)); compiler->mode32 = 1; @@ -1180,7 +1180,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_uw(struct sljit_comp size1 = compiler->size; compiler->mode32 = 0; - FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, 0)); + FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, 0)); inst = (sljit_u8*)ensure_buf(compiler, 1 + 2); FAIL_IF(!inst); @@ -1209,9 +1209,9 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_uw(struct sljit_comp FAIL_IF(!inst); inst[0] = OR_r_rm; - FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, TMP_REG1, 0)); + FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, TMP_REG1, 0)); compiler->mode32 = 1; - FAIL_IF(emit_groupf(compiler, ADDSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, dst_r, 0)); + FAIL_IF(emit_groupf(compiler, ADDSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, dst_r, 0)); jump_inst2[1] = U8(compiler->size - size2); diff --git a/src/sljit/sljitNativeX86_common.c b/src/sljit/sljitNativeX86_common.c index cc330d47..369d8285 100644 --- a/src/sljit/sljitNativeX86_common.c +++ b/src/sljit/sljitNativeX86_common.c @@ -61,17 +61,20 @@ SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void) 15 - R15 */ -#define TMP_FREG (0) +#define TMP_REG1 (SLJIT_NUMBER_OF_REGISTERS + 2) +#define TMP_FREG (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1) #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) -/* Last register + 1. */ -#define TMP_REG1 (SLJIT_NUMBER_OF_REGISTERS + 2) static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = { 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 5, 7, 6, 4, 3 }; +static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = { + 0, 1, 2, 3, 4, 5, 6, 7, 0 +}; + #define CHECK_EXTRA_REGS(p, w, do) \ if (p >= SLJIT_R3 && p <= SLJIT_S3) { \ w = (2 * SSIZE_OF(sw)) + ((p) - SLJIT_R3) * SSIZE_OF(sw); \ @@ -81,8 +84,6 @@ static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = { #else /* SLJIT_CONFIG_X86_32 */ -/* Last register + 1. */ -#define TMP_REG1 (SLJIT_NUMBER_OF_REGISTERS + 2) #define TMP_REG2 (SLJIT_NUMBER_OF_REGISTERS + 3) /* Note: r12 & 0x7 == 0b100, which decoded as SIB byte present @@ -95,7 +96,7 @@ static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = { }; /* low-map. reg_map & 0x7. */ static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = { - 0, 0, 6, 7, 1, 0, 3, 2, 4, 5, 5, 6, 7, 3, 4, 2, 1 + 0, 0, 6, 7, 1, 0, 3, 2, 4, 5, 5, 6, 7, 3, 4, 2, 1 }; #else /* Args: rcx(=1), rdx(=2), r8, r9. Scratches: rax(=0), r10, r11 */ @@ -109,12 +110,12 @@ static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = { #endif /* Args: xmm0-xmm3 */ -static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = { - 4, 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = { + 0, 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4 }; /* low-map. freg_map & 0x7. */ -static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = { - 4, 0, 1, 2, 3, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 +static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = { + 0, 0, 1, 2, 3, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 4 }; #define REX_W 0x48 @@ -266,8 +267,10 @@ static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = { #define OR_EAX_i32 0x0d #define OR_rm_r 0x09 #define OR_rm8_r8 0x08 +#define ORPD_x_xm 0x56 #define PACKSSWB_x_xm (/* GROUP_0F */ 0x63) -#define PCMPEQB_x_xm 0x74 +#define PAND_x_xm 0xdb +#define PCMPEQD_x_xm 0x76 #define PINSRB_x_rm_i8 0x20 #define PINSRW_x_rm_i8 0xc4 #define PINSRD_x_rm_i8 0x22 @@ -290,11 +293,14 @@ static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = { #define POP_r 0x58 #define POP_rm 0x8f #define POPF 0x9d +#define POR_x_xm 0xeb #define PREFETCH 0x18 #define PSHUFB_x_xm 0x00 #define PSHUFD_x_xm 0x70 #define PSHUFLW_x_xm 0x70 #define PSRLDQ_x 0x73 +#define PSLLD_x_i8 0x72 +#define PSLLQ_x_i8 0x73 #define PUSH_i32 0x68 #define PUSH_r 0x50 #define PUSH_rm (/* GROUP_FF */ 6 << 3) @@ -426,9 +432,11 @@ static void execute_cpu_id(sljit_u32 info[4]) "movl %0, %%esi\n" "movl (%%esi), %%eax\n" "movl 8(%%esi), %%ecx\n" + "pushl %%ebx\n" "cpuid\n" "movl %%eax, (%%esi)\n" "movl %%ebx, 4(%%esi)\n" + "popl %%ebx\n" "movl %%ecx, 8(%%esi)\n" "movl %%edx, 12(%%esi)\n" #else /* !SLJIT_CONFIG_X86_32 */ @@ -444,7 +452,7 @@ static void execute_cpu_id(sljit_u32 info[4]) : : "r" (info) #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) - : "memory", "eax", "ebx", "ecx", "edx", "esi" + : "memory", "eax", "ecx", "edx", "esi" #else /* !SLJIT_CONFIG_X86_32 */ : "memory", "rax", "rbx", "rcx", "rdx", "rsi" #endif /* SLJIT_CONFIG_X86_32 */ @@ -937,11 +945,11 @@ static sljit_s32 emit_mov(struct sljit_compiler *compiler, FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw)); static sljit_s32 emit_groupf(struct sljit_compiler *compiler, - sljit_u8 opcode, sljit_uw pref, + sljit_uw op, sljit_s32 dst, sljit_s32 src, sljit_sw srcw); static sljit_s32 emit_groupf_ext(struct sljit_compiler *compiler, - sljit_u8 opcode, sljit_uw pref, + sljit_uw op, sljit_s32 dst, sljit_s32 src, sljit_sw srcw); static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler, @@ -1375,7 +1383,7 @@ static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign, #endif /* !SLJIT_CONFIG_X86_32 */ /* src can be memory addr or reg_map[src] < 4 on x86_32 architectures. */ - FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm8 : MOVZX_r_rm8, 0, dst_r, src, srcw)); + FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm8 : MOVZX_r_rm8, dst_r, src, srcw)); } if (dst & SLJIT_MEM) { @@ -1444,7 +1452,7 @@ static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign, if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) dst_r = src; else - FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm16 : MOVZX_r_rm16, 0, dst_r, src, srcw)); + FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm16 : MOVZX_r_rm16, dst_r, src, srcw)); if (dst & SLJIT_MEM) { inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw); @@ -1506,14 +1514,14 @@ static sljit_s32 emit_clz_ctz(struct sljit_compiler *compiler, sljit_s32 is_clz, dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1; if (is_clz ? (cpu_feature_list & CPU_FEATURE_LZCNT) : (cpu_feature_list & CPU_FEATURE_TZCNT)) { - FAIL_IF(emit_groupf(compiler, is_clz ? LZCNT_r_rm : TZCNT_r_rm, EX86_PREF_F3, dst_r, src, srcw)); + FAIL_IF(emit_groupf(compiler, (is_clz ? LZCNT_r_rm : TZCNT_r_rm) | EX86_PREF_F3, dst_r, src, srcw)); if (dst & SLJIT_MEM) EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0); return SLJIT_SUCCESS; } - FAIL_IF(emit_groupf(compiler, is_clz ? BSR_r_rm : BSF_r_rm, 0, dst_r, src, srcw)); + FAIL_IF(emit_groupf(compiler, is_clz ? BSR_r_rm : BSF_r_rm, dst_r, src, srcw)); #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) max = is_clz ? (32 + 31) : 32; @@ -1546,7 +1554,7 @@ static sljit_s32 emit_clz_ctz(struct sljit_compiler *compiler, sljit_s32 is_clz, if (cpu_feature_list & CPU_FEATURE_CMOV) { EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, max); - FAIL_IF(emit_groupf(compiler, CMOVE_r_rm, 0, dst_r, TMP_REG2, 0)); + FAIL_IF(emit_groupf(compiler, CMOVE_r_rm, dst_r, TMP_REG2, 0)); } else FAIL_IF(emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, max)); @@ -1991,9 +1999,9 @@ static sljit_s32 emit_mul(struct sljit_compiler *compiler, /* Register destination. */ if (dst_r == src1 && src2 != SLJIT_IMM) { - FAIL_IF(emit_groupf(compiler, IMUL_r_rm, 0, dst_r, src2, src2w)); + FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src2, src2w)); } else if (dst_r == src2 && src1 != SLJIT_IMM) { - FAIL_IF(emit_groupf(compiler, IMUL_r_rm, 0, dst_r, src1, src1w)); + FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src1, src1w)); } else if (src1 == SLJIT_IMM) { if (src2 == SLJIT_IMM) { EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w); @@ -2032,7 +2040,7 @@ static sljit_s32 emit_mul(struct sljit_compiler *compiler, if (dst_r != src2) EMIT_MOV(compiler, dst_r, 0, src2, src2w); FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w)); - FAIL_IF(emit_groupf(compiler, IMUL_r_rm, 0, dst_r, TMP_REG2, 0)); + FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, TMP_REG2, 0)); } #endif } @@ -2071,7 +2079,7 @@ static sljit_s32 emit_mul(struct sljit_compiler *compiler, if (dst_r != src1) EMIT_MOV(compiler, dst_r, 0, src1, src1w); FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w)); - FAIL_IF(emit_groupf(compiler, IMUL_r_rm, 0, dst_r, TMP_REG2, 0)); + FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, TMP_REG2, 0)); } #endif } else { @@ -2079,7 +2087,7 @@ static sljit_s32 emit_mul(struct sljit_compiler *compiler, if (ADDRESSING_DEPENDS_ON(src2, dst_r)) dst_r = TMP_REG1; EMIT_MOV(compiler, dst_r, 0, src1, src1w); - FAIL_IF(emit_groupf(compiler, IMUL_r_rm, 0, dst_r, src2, src2w)); + FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src2, src2w)); } if (dst & SLJIT_MEM) @@ -2820,11 +2828,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 type, slji if (type != SLJIT_FLOAT_REGISTER && type != SLJIT_SIMD_REG_128 && type != SLJIT_SIMD_REG_256 && type != SLJIT_SIMD_REG_512) return -1; -#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) - return reg; -#else /* !SLJIT_CONFIG_X86_32 */ return freg_map[reg]; -#endif /* SLJIT_CONFIG_X86_32 */ } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler, @@ -2868,42 +2872,42 @@ static void init_compiler(void) } static sljit_s32 emit_groupf(struct sljit_compiler *compiler, - sljit_u8 opcode, sljit_uw pref, + sljit_uw op, sljit_s32 dst, sljit_s32 src, sljit_sw srcw) { - sljit_u8 *inst = emit_x86_instruction(compiler, 2 | pref, dst, 0, src, srcw); + sljit_u8 *inst = emit_x86_instruction(compiler, 2 | (op & ~(sljit_uw)0xff), dst, 0, src, srcw); FAIL_IF(!inst); inst[0] = GROUP_0F; - inst[1] = opcode; + inst[1] = op & 0xff; return SLJIT_SUCCESS; } static sljit_s32 emit_groupf_ext(struct sljit_compiler *compiler, - sljit_u8 opcode, sljit_uw pref, + sljit_uw op, sljit_s32 dst, sljit_s32 src, sljit_sw srcw) { sljit_u8 *inst; - SLJIT_ASSERT((pref & EX86_SSE2) && ((pref & VEX_OP_0F38) || (pref & VEX_OP_0F3A))); + SLJIT_ASSERT((op & EX86_SSE2) && ((op & VEX_OP_0F38) || (op & VEX_OP_0F3A))); - inst = emit_x86_instruction(compiler, 3 | (pref & ~(VEX_OP_0F38 | VEX_OP_0F3A)), dst, 0, src, srcw); + inst = emit_x86_instruction(compiler, 3 | (op & ~((sljit_uw)0xff | VEX_OP_0F38 | VEX_OP_0F3A)), dst, 0, src, srcw); FAIL_IF(!inst); inst[0] = GROUP_0F; - inst[1] = U8((pref & VEX_OP_0F38) ? 0x38 : 0x3A); - inst[2] = opcode; + inst[1] = U8((op & VEX_OP_0F38) ? 0x38 : 0x3A); + inst[2] = op & 0xff; return SLJIT_SUCCESS; } static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler, sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw) { - return emit_groupf(compiler, MOVSD_x_xm, (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, dst, src, srcw); + return emit_groupf(compiler, MOVSD_x_xm | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, dst, src, srcw); } static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler, sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src) { - return emit_groupf(compiler, MOVSD_xm_x, (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, src, dst, dstw); + return emit_groupf(compiler, MOVSD_xm_x | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, src, dst, dstw); } static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op, @@ -2920,7 +2924,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_comp compiler->mode32 = 0; #endif - FAIL_IF(emit_groupf(compiler, CVTTSD2SI_r_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP2, dst_r, src, srcw)); + FAIL_IF(emit_groupf(compiler, CVTTSD2SI_r_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP2, dst_r, src, srcw)); if (dst & SLJIT_MEM) return emit_mov(compiler, dst, dstw, TMP_REG1, 0); @@ -2950,7 +2954,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_comp srcw = 0; } - FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, srcw)); + FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, srcw)); #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) compiler->mode32 = 1; @@ -2968,7 +2972,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compile case SLJIT_ORDERED_EQUAL: /* Also: SLJIT_UNORDERED_OR_NOT_EQUAL */ FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w)); - FAIL_IF(emit_groupf(compiler, CMPS_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, TMP_FREG, src2, src2w)); + FAIL_IF(emit_groupf(compiler, CMPS_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, TMP_FREG, src2, src2w)); /* EQ */ FAIL_IF(emit_byte(compiler, 0)); @@ -2986,7 +2990,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compile src2 = TMP_FREG; } - return emit_groupf(compiler, UCOMISD_x_xm, EX86_SELECT_66(op) | EX86_SSE2, src2, src1, src1w); + return emit_groupf(compiler, UCOMISD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, src2, src1, src1w); } if (!FAST_IS_REG(src1)) { @@ -2994,7 +2998,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compile src1 = TMP_FREG; } - return emit_groupf(compiler, UCOMISD_x_xm, EX86_SELECT_66(op) | EX86_SSE2, src1, src2, src2w); + return emit_groupf(compiler, UCOMISD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, src1, src2, src2w); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op, @@ -3002,6 +3006,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compil sljit_s32 src, sljit_sw srcw) { sljit_s32 dst_r; + sljit_u8 *inst; #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) compiler->mode32 = 1; @@ -3025,41 +3030,57 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compil /* We overwrite the high bits of source. From SLJIT point of view, this is not an issue. Note: In SSE3, we could also use MOVDDUP and MOVSLDUP. */ - FAIL_IF(emit_groupf(compiler, UNPCKLPD_x_xm, ((op & SLJIT_32) ? EX86_PREF_66 : 0) | EX86_SSE2, src, src, 0)); + FAIL_IF(emit_groupf(compiler, UNPCKLPD_x_xm | ((op & SLJIT_32) ? EX86_PREF_66 : 0) | EX86_SSE2, src, src, 0)); } else { FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_32), TMP_FREG, src, srcw)); src = TMP_FREG; } - FAIL_IF(emit_groupf(compiler, CVTPD2PS_x_xm, ((op & SLJIT_32) ? EX86_PREF_66 : 0) | EX86_SSE2, dst_r, src, 0)); + FAIL_IF(emit_groupf(compiler, CVTPD2PS_x_xm | ((op & SLJIT_32) ? EX86_PREF_66 : 0) | EX86_SSE2, dst_r, src, 0)); if (dst_r == TMP_FREG) return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG); return SLJIT_SUCCESS; } if (FAST_IS_REG(dst)) { - dst_r = dst; - if (dst != src) - FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, dst_r, src, srcw)); - } - else { - dst_r = TMP_FREG; - FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, dst_r, src, srcw)); + dst_r = (dst == src) ? TMP_FREG : dst; + + if (src & SLJIT_MEM) + FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src, srcw)); + + FAIL_IF(emit_groupf(compiler, PCMPEQD_x_xm | EX86_PREF_66 | EX86_SSE2, dst_r, dst_r, 0)); + + inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP2, 0, 0, dst_r, 0); + inst[0] = GROUP_0F; + /* Same as PSRLD_x / PSRLQ_x */ + inst[1] = (op & SLJIT_32) ? PSLLD_x_i8 : PSLLQ_x_i8; + + if (GET_OPCODE(op) == SLJIT_ABS_F64) { + inst[2] |= 2 << 3; + FAIL_IF(emit_byte(compiler, 1)); + } else { + inst[2] |= 6 << 3; + FAIL_IF(emit_byte(compiler, ((op & SLJIT_32) ? 31 : 63))); + } + + if (dst_r != TMP_FREG) + dst_r = (src & SLJIT_MEM) ? TMP_FREG : src; + return emit_groupf(compiler, (GET_OPCODE(op) == SLJIT_NEG_F64 ? XORPD_x_xm : ANDPD_x_xm) | EX86_SSE2, dst, dst_r, 0); } + FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src, srcw)); + switch (GET_OPCODE(op)) { case SLJIT_NEG_F64: - FAIL_IF(emit_groupf(compiler, XORPD_x_xm, EX86_SELECT_66(op) | EX86_SSE2, dst_r, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8))); + FAIL_IF(emit_groupf(compiler, XORPD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8))); break; case SLJIT_ABS_F64: - FAIL_IF(emit_groupf(compiler, ANDPD_x_xm, EX86_SELECT_66(op) | EX86_SSE2, dst_r, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer + 4 : sse2_buffer + 12))); + FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer + 4 : sse2_buffer + 12))); break; } - if (dst_r == TMP_FREG) - return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG); - return SLJIT_SUCCESS; + return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op, @@ -3102,19 +3123,19 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compil switch (GET_OPCODE(op)) { case SLJIT_ADD_F64: - FAIL_IF(emit_groupf(compiler, ADDSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w)); + FAIL_IF(emit_groupf(compiler, ADDSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w)); break; case SLJIT_SUB_F64: - FAIL_IF(emit_groupf(compiler, SUBSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w)); + FAIL_IF(emit_groupf(compiler, SUBSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w)); break; case SLJIT_MUL_F64: - FAIL_IF(emit_groupf(compiler, MULSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w)); + FAIL_IF(emit_groupf(compiler, MULSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w)); break; case SLJIT_DIV_F64: - FAIL_IF(emit_groupf(compiler, DIVSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w)); + FAIL_IF(emit_groupf(compiler, DIVSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w)); break; } @@ -3142,9 +3163,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2r(struct sljit_compiler *compi if (dst_freg == src1) { FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src2, src2w)); pref = EX86_SELECT_66(op) | EX86_SSE2; - FAIL_IF(emit_groupf(compiler, XORPD_x_xm, pref, TMP_FREG, src1, src1w)); - FAIL_IF(emit_groupf(compiler, ANDPD_x_xm, pref, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8))); - return emit_groupf(compiler, XORPD_x_xm, pref, dst_freg, TMP_FREG, 0); + FAIL_IF(emit_groupf(compiler, XORPD_x_xm | pref, TMP_FREG, src1, src1w)); + FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | pref, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8))); + return emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, TMP_FREG, 0); } if (src1 & SLJIT_MEM) { @@ -3157,9 +3178,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2r(struct sljit_compiler *compi FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, dst_freg, src2, src2w)); pref = EX86_SELECT_66(op) | EX86_SSE2; - FAIL_IF(emit_groupf(compiler, XORPD_x_xm, pref, dst_freg, src1, src1w)); - FAIL_IF(emit_groupf(compiler, ANDPD_x_xm, pref, dst_freg, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8))); - return emit_groupf(compiler, XORPD_x_xm, pref, dst_freg, src1, src1w); + FAIL_IF(emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, src1, src1w)); + FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | pref, dst_freg, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8))); + return emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, src1, src1w); } /* --------------------------------------------------------------------- */ @@ -3445,7 +3466,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_select(struct sljit_compiler *comp #endif /* SLJIT_CONFIG_X86_32 */ if (sljit_has_cpu_feature(SLJIT_HAS_CMOV)) - FAIL_IF(emit_groupf(compiler, U8(get_jump_code((sljit_uw)type) - 0x40), 0, dst_reg, src1, src1w)); + FAIL_IF(emit_groupf(compiler, U8(get_jump_code((sljit_uw)type) - 0x40), dst_reg, src1, src1w)); else FAIL_IF(emit_cmov_generic(compiler, type, dst_reg, src1, src1w)); @@ -3500,9 +3521,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co { sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); - sljit_s32 alignment = SLJIT_SIMD_GET_ALIGNMENT(type); - sljit_u8 opcode = 0; - sljit_uw pref; + sljit_s32 alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type); + sljit_uw op; CHECK_ERROR(); CHECK(check_sljit_emit_simd_mov(compiler, type, freg, srcdst, srcdstw)); @@ -3515,12 +3535,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co switch (reg_size) { case 4: - pref = 2 | EX86_SSE2; + op = EX86_SSE2; break; case 5: if (!(cpu_feature_list & CPU_FEATURE_AVX2)) return SLJIT_ERR_UNSUPPORTED; - pref = EX86_SSE2 | VEX_256; + op = EX86_SSE2 | VEX_256; break; default: return SLJIT_ERR_UNSUPPORTED; @@ -3531,29 +3551,27 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co if (type & SLJIT_SIMD_FLOAT) { if (elem_size == 2 || elem_size == 3) { - opcode = alignment >= reg_size ? MOVAPS_x_xm : MOVUPS_x_xm; + op |= alignment >= reg_size ? MOVAPS_x_xm : MOVUPS_x_xm; if (elem_size == 3) - pref |= EX86_PREF_66; + op |= EX86_PREF_66; if (type & SLJIT_SIMD_STORE) - opcode = U8(opcode + 1); - } + op += 1; + } else + return SLJIT_ERR_UNSUPPORTED; } else { - opcode = (type & SLJIT_SIMD_STORE) ? MOVDQA_xm_x : MOVDQA_x_xm; - pref |= alignment >= reg_size ? EX86_PREF_66 : EX86_PREF_F3; + op |= ((type & SLJIT_SIMD_STORE) ? MOVDQA_xm_x : MOVDQA_x_xm) + | (alignment >= reg_size ? EX86_PREF_66 : EX86_PREF_F3); } - if (opcode == 0) - return SLJIT_ERR_UNSUPPORTED; - if (type & SLJIT_SIMD_TEST) return SLJIT_SUCCESS; - if (pref & VEX_256) - return emit_vex_instruction(compiler, opcode | pref, freg, 0, srcdst, srcdstw); + if (op & VEX_256) + return emit_vex_instruction(compiler, op, freg, 0, srcdst, srcdstw); - return emit_groupf(compiler, opcode, pref, freg, srcdst, srcdstw); + return emit_groupf(compiler, op, freg, srcdst, srcdstw); } SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type, @@ -3598,7 +3616,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (elem_size >= 3) compiler->mode32 = 0; #endif /* SLJIT_CONFIG_X86_64 */ - FAIL_IF(emit_groupf(compiler, MOVD_x_rm, EX86_PREF_66 | EX86_SSE2_OP1, freg, src, srcw)); + FAIL_IF(emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, freg, src, srcw)); #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) compiler->mode32 = 1; #endif /* SLJIT_CONFIG_X86_64 */ @@ -3641,7 +3659,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (reg_size == 5) return emit_vex_instruction(compiler, XORPD_x_xm | VEX_256 | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0); - return emit_groupf(compiler, XORPD_x_xm, (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2, freg, freg, 0); + return emit_groupf(compiler, XORPD_x_xm | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2, freg, freg, 0); } if (elem_size == 2 && freg != src) { @@ -3650,7 +3668,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil srcw = 0; } - FAIL_IF(emit_groupf(compiler, elem_size == 2 ? SHUFPS_x_xm : MOVDDUP_x_xm, (elem_size == 2 ? 0 : EX86_PREF_F2) | EX86_SSE2, freg, src, srcw)); + FAIL_IF(emit_groupf(compiler, (elem_size == 2 ? SHUFPS_x_xm : MOVDDUP_x_xm) | (elem_size == 2 ? 0 : EX86_PREF_F2) | EX86_SSE2, freg, src, srcw)); if (elem_size == 2) return emit_byte(compiler, 0); @@ -3676,9 +3694,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil if (srcw == 0 || srcw == -1) { if (reg_size == 5) - return emit_vex_instruction(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQB_x_xm) | VEX_256 | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0); + return emit_vex_instruction(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQD_x_xm) | VEX_256 | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0); - return emit_groupf(compiler, srcw == 0 ? PXOR_x_xm : PCMPEQB_x_xm, EX86_PREF_66 | EX86_SSE2, freg, freg, 0); + return emit_groupf(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQD_x_xm) | EX86_PREF_66 | EX86_SSE2, freg, freg, 0); } #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) @@ -3741,19 +3759,19 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil switch (elem_size) { case 0: - FAIL_IF(emit_groupf(compiler, PXOR_x_xm, EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0)); - return emit_groupf_ext(compiler, PSHUFB_x_xm, EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0); + FAIL_IF(emit_groupf(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0)); + return emit_groupf_ext(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0); case 1: - FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm, EX86_PREF_F2 | EX86_SSE2, freg, freg, 0)); + FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, freg, 0)); FAIL_IF(emit_byte(compiler, 0)); /* fallthrough */ default: - FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm, EX86_PREF_66 | EX86_SSE2, freg, freg, 0)); + FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, freg, 0)); return emit_byte(compiler, 0); #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) case 3: compiler->mode32 = 1; - FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm, EX86_PREF_66 | EX86_SSE2, freg, freg, 0)); + FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, freg, 0)); return emit_byte(compiler, 0x44); #endif /* SLJIT_CONFIG_X86_64 */ } @@ -3835,18 +3853,18 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile if (elem_size == 2) { if (reg_size == 4) - return emit_groupf(compiler, MOVD_x_rm, EX86_PREF_66 | EX86_SSE2_OP1, freg, srcdst, srcdstw); + return emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, freg, srcdst, srcdstw); return emit_vex_instruction(compiler, MOVD_x_rm | VEX_AUTO_W | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, srcdst, srcdstw); } } else if (srcdst & SLJIT_MEM) { SLJIT_ASSERT(elem_size == 2 || elem_size == 3); if (reg_size == 4) - return emit_groupf(compiler, MOVSD_x_xm, (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, srcdst, srcdstw); + return emit_groupf(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, srcdst, srcdstw); return emit_vex_instruction(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, 0, srcdst, srcdstw); } else if (elem_size == 3) { if (reg_size == 4) - return emit_groupf(compiler, MOVQ_x_xm, EX86_PREF_F3 | EX86_SSE2, freg, srcdst, 0); + return emit_groupf(compiler, MOVQ_x_xm | EX86_PREF_F3 | EX86_SSE2, freg, srcdst, 0); return emit_vex_instruction(compiler, MOVQ_x_xm | EX86_PREF_F3 | EX86_SSE2, freg, 0, srcdst, 0); } } @@ -3860,12 +3878,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile srcdstw = 0; } - size = ((!(type & SLJIT_SIMD_FLOAT) || elem_size != 2) ? EX86_PREF_66 : 0); + size = ((!(type & SLJIT_SIMD_FLOAT) || elem_size != 2) ? EX86_PREF_66 : 0) + | ((type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm) | EX86_SSE2; if (reg_size == 5) - FAIL_IF(emit_vex_instruction(compiler, ((type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm) | VEX_256 | size | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0)); + FAIL_IF(emit_vex_instruction(compiler, size | VEX_256 | VEX_SSE2_OPV, freg, freg, freg, 0)); else - FAIL_IF(emit_groupf(compiler, (type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm, size | EX86_SSE2, freg, freg, 0)); + FAIL_IF(emit_groupf(compiler, size, freg, freg, 0)); } else if (reg_size == 5 && lane_index >= (1 << (4 - elem_size))) { FAIL_IF(emit_vex_instruction(compiler, ((type & SLJIT_SIMD_FLOAT) ? VEXTRACTF128_x_ym : VEXTRACTI128_x_ym) | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, TMP_FREG, 0)); FAIL_IF(emit_byte(compiler, 1)); @@ -3878,20 +3897,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile if (elem_size == 3) { if (srcdst & SLJIT_MEM) { if (type & SLJIT_SIMD_STORE) - opcode = lane_index == 0 ? MOVLPD_m_x : MOVHPD_m_x; + size = lane_index == 0 ? MOVLPD_m_x : MOVHPD_m_x; else - opcode = lane_index == 0 ? MOVLPD_x_m : MOVHPD_x_m; + size = lane_index == 0 ? MOVLPD_x_m : MOVHPD_x_m; - FAIL_IF(emit_groupf(compiler, opcode, EX86_PREF_66 | EX86_SSE2, freg, srcdst, srcdstw)); + FAIL_IF(emit_groupf(compiler, size | EX86_PREF_66 | EX86_SSE2, freg, srcdst, srcdstw)); /* In case of store, freg is not TMP_FREG. */ } else if (type & SLJIT_SIMD_STORE) { if (lane_index == 1) - return emit_groupf(compiler, MOVHLPS_x_x, EX86_SSE2, srcdst, freg, 0); + return emit_groupf(compiler, MOVHLPS_x_x | EX86_SSE2, srcdst, freg, 0); return emit_sse2_load(compiler, 0, srcdst, freg, 0); } else { if (lane_index == 1) - FAIL_IF(emit_groupf(compiler, MOVLHPS_x_x, EX86_SSE2, freg, srcdst, 0)); + FAIL_IF(emit_groupf(compiler, MOVLHPS_x_x | EX86_SSE2, freg, srcdst, 0)); else FAIL_IF(emit_sse2_store(compiler, 0, freg, 0, srcdst)); } @@ -3900,13 +3919,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile return emit_sse2_store(compiler, 1, srcdst, srcdstw, freg); if (srcdst & SLJIT_MEM) { - FAIL_IF(emit_groupf_ext(compiler, EXTRACTPS_x_xm, EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw)); + FAIL_IF(emit_groupf_ext(compiler, EXTRACTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw)); return emit_byte(compiler, U8(lane_index)); } - size = EX86_SSE2; if (srcdst == freg) - opcode = SHUFPS_x_xm; + size = SHUFPS_x_xm | EX86_SSE2; else { if (cpu_feature_list & CPU_FEATURE_AVX) { FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | EX86_SSE2 | VEX_SSE2_OPV, srcdst, freg, freg, 0)); @@ -3915,29 +3933,28 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile switch (lane_index) { case 1: - opcode = MOVSHDUP_x_xm; - size = EX86_PREF_F3 | EX86_SSE2; + size = MOVSHDUP_x_xm | EX86_PREF_F3 | EX86_SSE2; break; case 2: - opcode = MOVHLPS_x_x; + size = MOVHLPS_x_x | EX86_SSE2; break; default: SLJIT_ASSERT(lane_index == 3); - opcode = PSHUFD_x_xm; - size = EX86_PREF_66 | EX86_SSE2; + size = PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2; break; } } - FAIL_IF(emit_groupf(compiler, opcode, size, srcdst, freg, 0)); + FAIL_IF(emit_groupf(compiler, size, srcdst, freg, 0)); - if (opcode == SHUFPS_x_xm || opcode == PSHUFD_x_xm) + size &= 0xff; + if (size == SHUFPS_x_xm || size == PSHUFD_x_xm) return emit_byte(compiler, U8(lane_index)); return SLJIT_SUCCESS; } else { if (lane_index != 0 || (srcdst & SLJIT_MEM)) { - FAIL_IF(emit_groupf_ext(compiler, INSERTPS_x_xm, EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw)); + FAIL_IF(emit_groupf_ext(compiler, INSERTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw)); FAIL_IF(emit_byte(compiler, U8(lane_index << 4))); } else FAIL_IF(emit_sse2_store(compiler, 1, freg, 0, srcdst)); @@ -4051,7 +4068,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile if (elem_size >= 2) return SLJIT_SUCCESS; - FAIL_IF(emit_groupf(compiler, (elem_size == 0) ? MOVSX_r_rm8 : MOVSX_r_rm16, 0, + FAIL_IF(emit_groupf(compiler, (elem_size == 0) ? MOVSX_r_rm8 : MOVSX_r_rm16, (srcdst_orig != 0 && FAST_IS_REG(srcdst_orig)) ? srcdst_orig : srcdst, srcdst, 0)); if (srcdst_orig & SLJIT_MEM) @@ -4107,7 +4124,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c } if (src_lane_index == 0) - return emit_groupf(compiler, MOVDDUP_x_xm, EX86_PREF_F2 | EX86_SSE2, freg, src, 0); + return emit_groupf(compiler, MOVDDUP_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, src, 0); /* Changes it to SHUFPD_x_xm. */ pref = EX86_PREF_66; @@ -4137,9 +4154,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | pref | EX86_SSE2 | VEX_SSE2_OPV, freg, src, src, 0)); } else { if (freg != src) - FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm, pref | EX86_SSE2, freg, src, 0)); + FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm | pref | EX86_SSE2, freg, src, 0)); - FAIL_IF(emit_groupf(compiler, SHUFPS_x_xm, pref | EX86_SSE2, freg, freg, 0)); + FAIL_IF(emit_groupf(compiler, SHUFPS_x_xm | pref | EX86_SSE2, freg, freg, 0)); } if (elem_size == 2) { @@ -4174,9 +4191,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c } else { if (freg == src || !(cpu_feature_list & CPU_FEATURE_AVX2)) { if (freg != src) - FAIL_IF(emit_groupf(compiler, MOVDQA_x_xm, EX86_PREF_66 | EX86_SSE2, freg, src, 0)); + FAIL_IF(emit_groupf(compiler, MOVDQA_x_xm | EX86_PREF_66 | EX86_SSE2, freg, src, 0)); - FAIL_IF(emit_groupf(compiler, PSRLDQ_x, 2 | EX86_PREF_66 | EX86_SSE2_OP2, opcode3, freg, 0)); + FAIL_IF(emit_groupf(compiler, PSRLDQ_x | EX86_PREF_66 | EX86_SSE2_OP2, opcode3, freg, 0)); } else FAIL_IF(emit_vex_instruction(compiler, PSRLDQ_x | EX86_PREF_66 | EX86_SSE2_OP2 | VEX_SSE2_OPV, opcode3, freg, src, 0)); @@ -4184,7 +4201,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c } if (pref != 0) { - FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm, pref | EX86_SSE2, freg, src, 0)); + FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, src, 0)); FAIL_IF(emit_byte(compiler, byte)); } @@ -4195,8 +4212,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c return emit_vex_instruction(compiler, VPBROADCASTB_x_xm | (reg_size == 5 ? VEX_256 : 0) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0); SLJIT_ASSERT(reg_size == 4); - FAIL_IF(emit_groupf(compiler, PXOR_x_xm, EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0)); - return emit_groupf_ext(compiler, PSHUFB_x_xm, EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0); + FAIL_IF(emit_groupf(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0)); + return emit_groupf_ext(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0); } if ((cpu_feature_list & CPU_FEATURE_AVX2) && src_lane_index == 0 && elem_size <= 3) { @@ -4261,7 +4278,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c src_lane_index >>= 1; pref = (src_lane_index & 2) == 0 ? EX86_PREF_F2 : EX86_PREF_F3; - FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm, pref | EX86_SSE2, freg, src, 0)); + FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, src, 0)); byte = U8(byte | (byte << 2)); FAIL_IF(emit_byte(compiler, U8(byte | (byte << 4)))); @@ -4280,7 +4297,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c break; } - FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm, EX86_PREF_66 | EX86_SSE2, freg, src, 0)); + FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, src, 0)); return emit_byte(compiler, U8(byte | (byte << 4))); } @@ -4316,7 +4333,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler return SLJIT_SUCCESS; if (reg_size == 4) - return emit_groupf(compiler, CVTPS2PD_x_xm, EX86_SSE2, freg, src, srcw); + return emit_groupf(compiler, CVTPS2PD_x_xm | EX86_SSE2, freg, src, srcw); return emit_vex_instruction(compiler, CVTPS2PD_x_xm | VEX_256 | EX86_SSE2, freg, 0, src, srcw); } @@ -4353,7 +4370,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler return SLJIT_SUCCESS; if (reg_size == 4) - return emit_groupf_ext(compiler, opcode, EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, src, srcw); + return emit_groupf_ext(compiler, opcode | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, src, srcw); return emit_vex_instruction(compiler, opcode | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, srcw); } @@ -4388,7 +4405,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c switch (elem_size) { case 1: - FAIL_IF(emit_groupf(compiler, PACKSSWB_x_xm, EX86_PREF_66 | EX86_SSE2, TMP_FREG, freg, 0)); + FAIL_IF(emit_groupf(compiler, PACKSSWB_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, freg, 0)); freg = TMP_FREG; break; case 2: @@ -4397,7 +4414,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c } dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1; - FAIL_IF(emit_groupf(compiler, elem_size < 2 ? PMOVMSKB_r_x : MOVMSKPS_r_x, pref, dst_r, freg, 0)); + FAIL_IF(emit_groupf(compiler, (elem_size < 2 ? PMOVMSKB_r_x : MOVMSKPS_r_x) | pref, dst_r, freg, 0)); #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) compiler->mode32 = type & SLJIT_32; @@ -4427,7 +4444,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c FAIL_IF(emit_vex_instruction(compiler, VEXTRACTI128_x_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, TMP_FREG, 0)); FAIL_IF(emit_byte(compiler, 1)); FAIL_IF(emit_vex_instruction(compiler, PACKSSWB_x_xm | VEX_256 | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, freg, TMP_FREG, 0)); - FAIL_IF(emit_groupf(compiler, PMOVMSKB_r_x, EX86_PREF_66 | EX86_SSE2_OP2, dst_r, TMP_FREG, 0)); + FAIL_IF(emit_groupf(compiler, PMOVMSKB_r_x | EX86_PREF_66 | EX86_SSE2_OP2, dst_r, TMP_FREG, 0)); } else { pref = MOVMSKPS_r_x | VEX_256 | EX86_SSE2_OP2; @@ -4449,6 +4466,87 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c return SLJIT_SUCCESS; } +static sljit_s32 emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 dst_freg, sljit_s32 src_freg) +{ + sljit_uw op = ((type & SLJIT_SIMD_FLOAT) ? MOVAPS_x_xm : MOVDQA_x_xm) | EX86_SSE2; + + SLJIT_ASSERT(SLJIT_SIMD_GET_REG_SIZE(type) == 4); + + if (!(type & SLJIT_SIMD_FLOAT) || SLJIT_SIMD_GET_ELEM_SIZE(type) == 3) + op |= EX86_PREF_66; + + return emit_groupf(compiler, op, dst_freg, src_freg, 0); +} + +SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, + sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg) +{ + sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type); + sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type); + sljit_s32 needs_move = 0; + sljit_uw op = 0; + + CHECK_ERROR(); + CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg)); + +#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) + compiler->mode32 = 1; +#endif /* SLJIT_CONFIG_X86_64 */ + + if (reg_size == 5) { + if (!(cpu_feature_list & CPU_FEATURE_AVX2)) + return SLJIT_ERR_UNSUPPORTED; + } else if (reg_size != 4) + return SLJIT_ERR_UNSUPPORTED; + + if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3)) + return SLJIT_ERR_UNSUPPORTED; + + switch (SLJIT_SIMD_GET_OPCODE(type)) { + case SLJIT_SIMD_OP2_AND: + op = (type & SLJIT_SIMD_FLOAT) ? ANDPD_x_xm : PAND_x_xm; + + if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3) + op |= EX86_PREF_66; + break; + case SLJIT_SIMD_OP2_OR: + op = (type & SLJIT_SIMD_FLOAT) ? ORPD_x_xm : POR_x_xm; + + if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3) + op |= EX86_PREF_66; + break; + case SLJIT_SIMD_OP2_XOR: + op = (type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm; + + if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3) + op |= EX86_PREF_66; + break; + } + + if (type & SLJIT_SIMD_TEST) + return SLJIT_SUCCESS; + + needs_move = dst_freg != src1_freg && dst_freg != src2_freg; + + if (reg_size == 5 || (needs_move && (cpu_feature_list & CPU_FEATURE_AVX2))) { + if (reg_size == 5) + op |= VEX_256; + + return emit_vex_instruction(compiler, op | EX86_SSE2 | VEX_SSE2_OPV, dst_freg, src1_freg, src2_freg, 0); + } + + if (needs_move) { + FAIL_IF(emit_simd_mov(compiler, type, dst_freg, src1_freg)); + } else if (dst_freg != src1_freg) { + SLJIT_ASSERT(dst_freg == src2_freg); + src2_freg = src1_freg; + } + + FAIL_IF(emit_groupf(compiler, op | EX86_SSE2, dst_freg, src2_freg, 0)); + return SLJIT_SUCCESS; +} + SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op, sljit_s32 dst_reg, sljit_s32 mem_reg) @@ -4541,7 +4639,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler pref = EX86_REX; #endif /* SLJIT_CONFIG_X86_64 */ - FAIL_IF(emit_groupf(compiler, U8(op == SLJIT_MOV_U8 ? CMPXCHG_rm8_r : CMPXCHG_rm_r), pref, src_reg, SLJIT_MEM1(mem_reg), 0)); + FAIL_IF(emit_groupf(compiler, (op == SLJIT_MOV_U8 ? CMPXCHG_rm8_r : CMPXCHG_rm_r) | pref, src_reg, SLJIT_MEM1(mem_reg), 0)); if (temp_reg != SLJIT_R0) { #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) |