aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Zoltan Herczeg <hzmester@freemail.hu> 2023-12-06 12:17:20 +0000
committer: Zoltan Herczeg <hzmester@freemail.hu> 2023-12-06 12:17:20 +0000
commit: 4e8fdb3e06d015c998f69ad3ea4f826b836aea0f (patch)
tree: 839f0a0e9004bcdadf7c04b06dd06ae7535d716e
parent: c3529d0227edcb614eec195eff309d4eb914dce5 (diff)
download: pcre-4e8fdb3e06d015c998f69ad3ea4f826b836aea0f.tar.gz
JIT compiler update
-rw-r--r-- src/pcre2_jit_simd_inc.h | 31
-rw-r--r-- src/sljit/sljitConfigInternal.h | 112
-rw-r--r-- src/sljit/sljitLir.c | 124
-rw-r--r-- src/sljit/sljitLir.h | 28
-rw-r--r-- src/sljit/sljitNativeARM_32.c | 104
-rw-r--r-- src/sljit/sljitNativeARM_64.c | 269
-rw-r--r-- src/sljit/sljitNativeARM_T2_32.c | 102
-rw-r--r-- src/sljit/sljitNativeLOONGARCH_64.c | 21
-rw-r--r-- src/sljit/sljitNativeMIPS_common.c | 33
-rw-r--r-- src/sljit/sljitNativeRISCV_32.c | 1
-rw-r--r-- src/sljit/sljitNativeRISCV_64.c | 4
-rw-r--r-- src/sljit/sljitNativeRISCV_common.c | 12
-rw-r--r-- src/sljit/sljitNativeS390X.c | 481
-rw-r--r-- src/sljit/sljitNativeX86_32.c | 44
-rw-r--r-- src/sljit/sljitNativeX86_64.c | 16
-rw-r--r-- src/sljit/sljitNativeX86_common.c | 366
16 files changed, 1293 insertions, 455 deletions
diff --git a/src/pcre2_jit_simd_inc.h b/src/pcre2_jit_simd_inc.h
index c178d320..26d5e2e0 100644
--- a/src/pcre2_jit_simd_inc.h
+++ b/src/pcre2_jit_simd_inc.h
@@ -483,11 +483,7 @@ sljit_s32 cmp2a_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR3);
sljit_s32 cmp1b_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR4);
sljit_s32 cmp2b_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR5);
sljit_s32 tmp1_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_FR6);
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-sljit_s32 tmp2_ind = 0;
-#else /* !SLJIT_CONFIG_X86_32 */
-sljit_s32 tmp2_ind = 4;
-#endif /* SLJIT_CONFIG_X86_32 */
+sljit_s32 tmp2_ind = sljit_get_register_index(SLJIT_FLOAT_REGISTER, SLJIT_TMP_FR0);
struct sljit_label *start;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_label *restart;
@@ -660,19 +656,7 @@ for (i = 0; i < 4; i++)
fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
}
-/* PAND xmm1, xmm2/m128 */
-if (reg_type == SLJIT_SIMD_REG_256)
- {
- instruction[0] = 0xc5;
- instruction[1] = (sljit_u8)(0xfd ^ (data1_ind << 3));
- }
-
-/* instruction[0] = 0x66 / 0xc5; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0xdb;
-instruction[3] = 0xc0 | (data1_ind << 3) | data2_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
-
+sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR1);
sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
/* Ignore matches before the first STR_PTR. */
@@ -700,16 +684,7 @@ for (i = 0; i < 4; i++)
fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);
}
-/* PAND xmm1, xmm2/m128 */
-if (reg_type == SLJIT_SIMD_REG_256)
- instruction[1] = (sljit_u8)(0xfd ^ (data1_ind << 3));
-
-/* instruction[0] = 0x66 / 0xc5; */
-/* instruction[1] = 0x0f; */
-instruction[2] = 0xdb;
-instruction[3] = 0xc0 | (data1_ind << 3) | data2_ind;
-sljit_emit_op_custom(compiler, instruction, 4);
-
+sljit_emit_simd_op2(compiler, SLJIT_SIMD_OP2_AND | reg_type, SLJIT_FR0, SLJIT_FR0, SLJIT_FR1);
sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
diff --git a/src/sljit/sljitConfigInternal.h b/src/sljit/sljitConfigInternal.h
index 3d0e3da4..d224248c 100644
--- a/src/sljit/sljitConfigInternal.h
+++ b/src/sljit/sljitConfigInternal.h
@@ -72,6 +72,8 @@ extern "C" {
SLJIT_NUMBER_OF_FLOAT_REGISTERS : number of available floating point registers
SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS : number of available floating point scratch registers
SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS : number of available floating point saved registers
+ SLJIT_NUMBER_OF_TEMPORARY_REGISTERS : number of available temporary registers
+ SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS : number of available temporary floating point registers
SLJIT_WORD_SHIFT : the shift required to apply when accessing a sljit_sw/sljit_uw array by index
SLJIT_F32_SHIFT : the shift required to apply when accessing
a single precision floating point array by index
@@ -81,8 +83,21 @@ extern "C" {
the scratch register index of ecx is stored in this variable
SLJIT_LOCALS_OFFSET : local space starting offset (SLJIT_SP + SLJIT_LOCALS_OFFSET)
SLJIT_RETURN_ADDRESS_OFFSET : a return instruction always adds this offset to the return address
+ SLJIT_CONV_MAX_FLOAT : result when a floating point value is converted to integer
+ and the floating point value is higher than the maximum integer value
+ (possible values: SLJIT_CONV_RESULT_MAX_INT or SLJIT_CONV_RESULT_MIN_INT)
+ SLJIT_CONV_MIN_FLOAT : result when a floating point value is converted to integer
+ and the floating point value is lower than the minimum integer value
+ (possible values: SLJIT_CONV_RESULT_MAX_INT or SLJIT_CONV_RESULT_MIN_INT)
+ SLJIT_CONV_NAN_FLOAT : result when a NaN floating point value is converted to integer
+ (possible values: SLJIT_CONV_RESULT_MAX_INT, SLJIT_CONV_RESULT_MIN_INT,
+ or SLJIT_CONV_RESULT_ZERO)
Other macros:
+ SLJIT_TMP_R0 .. R9 : accessing temporary registers
+ SLJIT_TMP_R(i) : accessing temporary registers
+ SLJIT_TMP_FR0 .. FR9 : accessing temporary floating point registers
+ SLJIT_TMP_FR(i) : accessing temporary floating point registers
SLJIT_FUNC : calling convention attribute for both calling JIT from C and C calling back from JIT
SLJIT_W(number) : defining 64 bit constants on 64 bit architectures (platform independent helper)
SLJIT_F64_SECOND(reg) : provides the register index of the second 32 bit part of a 64 bit
@@ -356,6 +371,38 @@ typedef double sljit_f64;
#define SLJIT_F32_SHIFT 2
#define SLJIT_F64_SHIFT 3
+#define SLJIT_CONV_RESULT_MAX_INT 0
+#define SLJIT_CONV_RESULT_MIN_INT 1
+#define SLJIT_CONV_RESULT_ZERO 2
+
+#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
+#define SLJIT_CONV_MAX_FLOAT SLJIT_CONV_RESULT_MIN_INT
+#define SLJIT_CONV_MIN_FLOAT SLJIT_CONV_RESULT_MIN_INT
+#define SLJIT_CONV_NAN_FLOAT SLJIT_CONV_RESULT_MIN_INT
+#elif (defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM)
+#define SLJIT_CONV_MAX_FLOAT SLJIT_CONV_RESULT_MAX_INT
+#define SLJIT_CONV_MIN_FLOAT SLJIT_CONV_RESULT_MIN_INT
+#define SLJIT_CONV_NAN_FLOAT SLJIT_CONV_RESULT_ZERO
+#elif (defined SLJIT_CONFIG_MIPS && SLJIT_CONFIG_MIPS)
+#define SLJIT_CONV_MAX_FLOAT SLJIT_CONV_RESULT_MAX_INT
+#define SLJIT_CONV_MIN_FLOAT SLJIT_CONV_RESULT_MAX_INT
+#define SLJIT_CONV_NAN_FLOAT SLJIT_CONV_RESULT_MAX_INT
+#elif (defined SLJIT_CONFIG_PPC && SLJIT_CONFIG_PPC)
+#define SLJIT_CONV_MAX_FLOAT SLJIT_CONV_RESULT_MAX_INT
+#define SLJIT_CONV_MIN_FLOAT SLJIT_CONV_RESULT_MIN_INT
+#define SLJIT_CONV_NAN_FLOAT SLJIT_CONV_RESULT_MIN_INT
+#elif (defined SLJIT_CONFIG_RISCV && SLJIT_CONFIG_RISCV)
+#define SLJIT_CONV_MAX_FLOAT SLJIT_CONV_RESULT_MAX_INT
+#define SLJIT_CONV_MIN_FLOAT SLJIT_CONV_RESULT_MIN_INT
+#define SLJIT_CONV_NAN_FLOAT SLJIT_CONV_RESULT_MAX_INT
+#elif (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
+#define SLJIT_CONV_MAX_FLOAT SLJIT_CONV_RESULT_MAX_INT
+#define SLJIT_CONV_MIN_FLOAT SLJIT_CONV_RESULT_MIN_INT
+#define SLJIT_CONV_NAN_FLOAT SLJIT_CONV_RESULT_MIN_INT
+#else
+#error "Result for float to integer conversion is not defined"
+#endif
+
#ifndef SLJIT_W
/* Defining long constants. */
@@ -528,8 +575,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
#define SLJIT_NUMBER_OF_REGISTERS 12
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 7
+#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 1
#define SLJIT_NUMBER_OF_FLOAT_REGISTERS 7
#define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 0
+#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 1
#define SLJIT_LOCALS_OFFSET_BASE (8 * SSIZE_OF(sw))
#define SLJIT_PREF_SHIFT_REG SLJIT_R2
#define SLJIT_MASKED_SHIFT 1
@@ -538,7 +587,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
#elif (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
#define SLJIT_NUMBER_OF_REGISTERS 13
+#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 2
#define SLJIT_NUMBER_OF_FLOAT_REGISTERS 15
+#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 1
#ifndef _WIN64
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 6
#define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 0
@@ -556,16 +607,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
#define SLJIT_NUMBER_OF_REGISTERS 12
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 8
+#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 2
#define SLJIT_NUMBER_OF_FLOAT_REGISTERS 14
#define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 8
+#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 2
#define SLJIT_LOCALS_OFFSET_BASE 0
#elif (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64)
#define SLJIT_NUMBER_OF_REGISTERS 26
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 10
+#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 3
#define SLJIT_NUMBER_OF_FLOAT_REGISTERS 30
#define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 8
+#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 2
#define SLJIT_LOCALS_OFFSET_BASE (2 * (sljit_s32)sizeof(sljit_sw))
#define SLJIT_MASKED_SHIFT 1
#define SLJIT_MASKED_SHIFT32 1
@@ -574,8 +629,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
#define SLJIT_NUMBER_OF_REGISTERS 23
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 17
+#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 3
#define SLJIT_NUMBER_OF_FLOAT_REGISTERS 30
#define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 18
+#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 2
#if (defined SLJIT_CONFIG_PPC_64 && SLJIT_CONFIG_PPC_64) || (defined _AIX)
#define SLJIT_LOCALS_OFFSET_BASE ((6 + 8) * (sljit_s32)sizeof(sljit_sw))
#elif (defined SLJIT_CONFIG_PPC_32 && SLJIT_CONFIG_PPC_32)
@@ -598,6 +655,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
#define SLJIT_NUMBER_OF_FLOAT_REGISTERS 29
#define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 8
#endif
+#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 5
+#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 3
#define SLJIT_MASKED_SHIFT 1
#define SLJIT_MASKED_SHIFT32 1
@@ -605,9 +664,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
#define SLJIT_NUMBER_OF_REGISTERS 23
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 12
-#define SLJIT_LOCALS_OFFSET_BASE 0
+#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 5
#define SLJIT_NUMBER_OF_FLOAT_REGISTERS 30
#define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 12
+#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 2
+#define SLJIT_LOCALS_OFFSET_BASE 0
#define SLJIT_MASKED_SHIFT 1
#define SLJIT_MASKED_SHIFT32 1
@@ -636,8 +697,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
#define SLJIT_NUMBER_OF_REGISTERS 12
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 8
+#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 3
#define SLJIT_NUMBER_OF_FLOAT_REGISTERS 15
#define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 8
+#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 1
#define SLJIT_LOCALS_OFFSET_BASE SLJIT_S390X_DEFAULT_STACK_FRAME_SIZE
#define SLJIT_MASKED_SHIFT 1
@@ -645,9 +708,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
#define SLJIT_NUMBER_OF_REGISTERS 23
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 10
-#define SLJIT_LOCALS_OFFSET_BASE 0
+#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 5
#define SLJIT_NUMBER_OF_FLOAT_REGISTERS 30
#define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 12
+#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 2
+#define SLJIT_LOCALS_OFFSET_BASE 0
#define SLJIT_MASKED_SHIFT 1
#define SLJIT_MASKED_SHIFT32 1
@@ -656,8 +721,10 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
/* Just to have something. */
#define SLJIT_NUMBER_OF_REGISTERS 0
#define SLJIT_NUMBER_OF_SAVED_REGISTERS 0
+#define SLJIT_NUMBER_OF_TEMPORARY_REGISTERS 0
#define SLJIT_NUMBER_OF_FLOAT_REGISTERS 0
#define SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS 0
+#define SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS 0
#define SLJIT_LOCALS_OFFSET_BASE 0
#endif
@@ -670,6 +737,45 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
#define SLJIT_NUMBER_OF_SCRATCH_FLOAT_REGISTERS \
(SLJIT_NUMBER_OF_FLOAT_REGISTERS - SLJIT_NUMBER_OF_SAVED_FLOAT_REGISTERS)
+/**********************************/
+/* Temporary register management. */
+/**********************************/
+
+#define SLJIT_TMP_REGISTER_BASE (SLJIT_NUMBER_OF_REGISTERS + 2)
+#define SLJIT_TMP_FREGISTER_BASE (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
+
+/* WARNING: Accessing temporary registers is not recommended, because they
+ are also used by the JIT compiler for various computations. Using them
+ might have any side effects including incorrect operations and crashes,
+ so use them at your own risk. The machine registers themselves might have
+ limitations, e.g. the r0 register on s390x / ppc cannot be used as
+ base address for memory operations. */
+
+/* Temporary registers */
+#define SLJIT_TMP_R0 (SLJIT_TMP_REGISTER_BASE + 0)
+#define SLJIT_TMP_R1 (SLJIT_TMP_REGISTER_BASE + 1)
+#define SLJIT_TMP_R2 (SLJIT_TMP_REGISTER_BASE + 2)
+#define SLJIT_TMP_R3 (SLJIT_TMP_REGISTER_BASE + 3)
+#define SLJIT_TMP_R4 (SLJIT_TMP_REGISTER_BASE + 4)
+#define SLJIT_TMP_R5 (SLJIT_TMP_REGISTER_BASE + 5)
+#define SLJIT_TMP_R6 (SLJIT_TMP_REGISTER_BASE + 6)
+#define SLJIT_TMP_R7 (SLJIT_TMP_REGISTER_BASE + 7)
+#define SLJIT_TMP_R8 (SLJIT_TMP_REGISTER_BASE + 8)
+#define SLJIT_TMP_R9 (SLJIT_TMP_REGISTER_BASE + 9)
+#define SLJIT_TMP_R(i) (SLJIT_TMP_REGISTER_BASE + (i))
+
+#define SLJIT_TMP_FR0 (SLJIT_TMP_FREGISTER_BASE + 0)
+#define SLJIT_TMP_FR1 (SLJIT_TMP_FREGISTER_BASE + 1)
+#define SLJIT_TMP_FR2 (SLJIT_TMP_FREGISTER_BASE + 2)
+#define SLJIT_TMP_FR3 (SLJIT_TMP_FREGISTER_BASE + 3)
+#define SLJIT_TMP_FR4 (SLJIT_TMP_FREGISTER_BASE + 4)
+#define SLJIT_TMP_FR5 (SLJIT_TMP_FREGISTER_BASE + 5)
+#define SLJIT_TMP_FR6 (SLJIT_TMP_FREGISTER_BASE + 6)
+#define SLJIT_TMP_FR7 (SLJIT_TMP_FREGISTER_BASE + 7)
+#define SLJIT_TMP_FR8 (SLJIT_TMP_FREGISTER_BASE + 8)
+#define SLJIT_TMP_FR9 (SLJIT_TMP_FREGISTER_BASE + 9)
+#define SLJIT_TMP_FR(i) (SLJIT_TMP_FREGISTER_BASE + (i))
+
/********************************/
/* CPU status flags management. */
/********************************/
@@ -690,7 +796,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_sw sljit_exec_offset(void* ptr);
#if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) \
|| (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
#define SLJIT_F64_SECOND(reg) \
- ((reg) + SLJIT_FS0)
+ ((reg) + SLJIT_FS0 + SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS)
#else /* !SLJIT_CONFIG_ARM_32 && !SLJIT_CONFIG_MIPS_32 */
#define SLJIT_F64_SECOND(reg) \
(reg)
diff --git a/src/sljit/sljitLir.c b/src/sljit/sljitLir.c
index 564eb736..6f193000 100644
--- a/src/sljit/sljitLir.c
+++ b/src/sljit/sljitLir.c
@@ -142,11 +142,15 @@
#define SLJIT_KEPT_SAVEDS_COUNT(options) ((options) & 0x3)
/* Getters for simd operations, which returns with log2(size). */
+#define SLJIT_SIMD_GET_OPCODE(type) ((type) & 0xff)
#define SLJIT_SIMD_GET_REG_SIZE(type) (((type) >> 12) & 0x3f)
#define SLJIT_SIMD_GET_ELEM_SIZE(type) (((type) >> 18) & 0x3f)
-#define SLJIT_SIMD_GET_ALIGNMENT(type) (((type) >> 24) & 0x3f)
#define SLJIT_SIMD_GET_ELEM2_SIZE(type) (((type) >> 24) & 0x3f)
+#define SLJIT_SIMD_CHECK_REG(type) (((type) & 0x3f000) >= SLJIT_SIMD_REG_64 && ((type) & 0x3f000) <= SLJIT_SIMD_REG_512)
+#define SLJIT_SIMD_TYPE_MASK(m) ((sljit_s32)0xff000fff & ~(SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST | (m)))
+#define SLJIT_SIMD_TYPE_MASK2(m) ((sljit_s32)0xc0000fff & ~(SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST | (m)))
+
/* Jump flags. */
#define JUMP_LABEL 0x1
#define JUMP_ADDR 0x2
@@ -844,7 +848,8 @@ static sljit_s32 function_check_arguments(sljit_s32 arg_types, sljit_s32 scratch
#define FUNCTION_CHECK_IS_REG(r) \
(((r) >= SLJIT_R0 && (r) < (SLJIT_R0 + compiler->scratches)) \
- || ((r) > (SLJIT_S0 - compiler->saveds) && (r) <= SLJIT_S0))
+ || ((r) > (SLJIT_S0 - compiler->saveds) && (r) <= SLJIT_S0) \
+ || ((r) >= SLJIT_TMP_REGISTER_BASE && (r) < (SLJIT_TMP_REGISTER_BASE + SLJIT_NUMBER_OF_TEMPORARY_REGISTERS)))
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
#define CHECK_IF_VIRTUAL_REGISTER(p) ((p) <= SLJIT_S3 && (p) >= SLJIT_S8)
@@ -952,7 +957,8 @@ static sljit_s32 function_check_is_freg(struct sljit_compiler *compiler, sljit_s
return 0;
return (fr >= SLJIT_FR0 && fr < (SLJIT_FR0 + compiler->fscratches))
- || (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0);
+ || (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0)
+ || (fr >= SLJIT_TMP_FREGISTER_BASE && fr < (SLJIT_TMP_FREGISTER_BASE + SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS));
}
#define FUNCTION_FCHECK(p, i, is_32) \
@@ -964,7 +970,8 @@ static sljit_s32 function_fcheck(struct sljit_compiler *compiler, sljit_s32 p, s
return 0;
if ((p >= SLJIT_FR0 && p < (SLJIT_FR0 + compiler->fscratches))
- || (p > (SLJIT_FS0 - compiler->fsaveds) && p <= SLJIT_FS0))
+ || (p > (SLJIT_FS0 - compiler->fsaveds) && p <= SLJIT_FS0)
+ || (p >= SLJIT_TMP_FREGISTER_BASE && p < (SLJIT_TMP_FREGISTER_BASE + SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS)))
return (i == 0);
return function_check_src_mem(compiler, p, i);
@@ -999,26 +1006,30 @@ static void sljit_verbose_reg(struct sljit_compiler *compiler, sljit_s32 r)
{
if (r < (SLJIT_R0 + compiler->scratches))
fprintf(compiler->verbose, "r%d", r - SLJIT_R0);
- else if (r != SLJIT_SP)
+ else if (r < SLJIT_SP)
fprintf(compiler->verbose, "s%d", SLJIT_NUMBER_OF_REGISTERS - r);
- else
+ else if (r == SLJIT_SP)
fprintf(compiler->verbose, "sp");
+ else
+ fprintf(compiler->verbose, "t%d", r - SLJIT_TMP_REGISTER_BASE);
}
static void sljit_verbose_freg(struct sljit_compiler *compiler, sljit_s32 r)
{
#if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32) \
|| (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
- if (r >= (SLJIT_FS0 + SLJIT_FR0) && r <= (SLJIT_FS0 + SLJIT_FS0)) {
+ if (r >= SLJIT_F64_SECOND(SLJIT_FR0)) {
fprintf(compiler->verbose, "^");
- r -= SLJIT_FS0;
+ r -= SLJIT_F64_SECOND(0);
}
#endif /* SLJIT_CONFIG_ARM_32 || SLJIT_CONFIG_MIPS_32 */
if (r < (SLJIT_FR0 + compiler->fscratches))
fprintf(compiler->verbose, "fr%d", r - SLJIT_FR0);
- else
+ else if (r < SLJIT_TMP_FREGISTER_BASE)
fprintf(compiler->verbose, "fs%d", SLJIT_NUMBER_OF_FLOAT_REGISTERS - r);
+ else
+ fprintf(compiler->verbose, "ft%d", r - SLJIT_TMP_FREGISTER_BASE);
}
static void sljit_verbose_param(struct sljit_compiler *compiler, sljit_s32 p, sljit_sw i)
@@ -1121,6 +1132,10 @@ static const char* fop2r_names[] = {
"copysign"
};
+static const char* simd_op2_names[] = {
+ "and", "or", "xor" /* indexed by (SLJIT_SIMD_OP2_* opcode - 1) */
+};
+
static const char* jump_names[] = {
"equal", "not_equal",
"less", "greater_equal",
@@ -1705,10 +1720,12 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_get_register_index(sljit_s32 t
SLJIT_UNUSED_ARG(reg);
#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
if (type == SLJIT_GP_REGISTER) {
- CHECK_ARGUMENT(reg > 0 && reg <= SLJIT_NUMBER_OF_REGISTERS);
+ /* Temporary registers occupy [SLJIT_TMP_REGISTER_BASE, base + count), so the upper bound is exclusive. */
+ CHECK_ARGUMENT((reg > 0 && reg <= SLJIT_NUMBER_OF_REGISTERS)
+ || (reg >= SLJIT_TMP_REGISTER_BASE && reg < (SLJIT_TMP_REGISTER_BASE + SLJIT_NUMBER_OF_TEMPORARY_REGISTERS)));
} else {
CHECK_ARGUMENT(type == SLJIT_FLOAT_REGISTER || ((type >> 12) == 0 || ((type >> 12) >= 3 && (type >> 12) <= 6)));
- CHECK_ARGUMENT(reg > 0 && reg <= SLJIT_NUMBER_OF_FLOAT_REGISTERS);
+ /* Same half-open range for the temporary floating point registers. */
+ CHECK_ARGUMENT((reg > 0 && reg <= SLJIT_NUMBER_OF_FLOAT_REGISTERS)
+ || (reg >= SLJIT_TMP_FREGISTER_BASE && reg < (SLJIT_TMP_FREGISTER_BASE + SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS)));
}
#endif
CHECK_RETURN_OK;
@@ -1980,7 +1997,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_fset64(struct sljit_compi
if (SLJIT_UNLIKELY(!!compiler->verbose)) {
fprintf(compiler->verbose, " fset64 ");
sljit_verbose_freg(compiler, freg);
- fprintf(compiler->verbose, ", %lf\n", value);
+ fprintf(compiler->verbose, ", %f\n", value);
}
#endif
CHECK_RETURN_OK;
@@ -2590,10 +2607,10 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_mov(struct sljit_com
{
#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD));
- CHECK_ARGUMENT((type & (sljit_s32)(0xc0000fff - (SLJIT_SIMD_STORE | SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST))) == 0);
- CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512);
+ CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK2(SLJIT_SIMD_STORE)) == 0);
+ CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type));
CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) <= SLJIT_SIMD_GET_REG_SIZE(type));
- CHECK_ARGUMENT(SLJIT_SIMD_GET_ALIGNMENT(type) <= (srcdst & SLJIT_MEM) ? SLJIT_SIMD_GET_REG_SIZE(type) : 0);
+ CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM2_SIZE(type) <= (srcdst & SLJIT_MEM) ? SLJIT_SIMD_GET_REG_SIZE(type) : 0);
CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0));
FUNCTION_FCHECK(srcdst, srcdstw, 0);
#endif
@@ -2615,7 +2632,7 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_mov(struct sljit_com
if ((type & 0x3f000000) == SLJIT_SIMD_MEM_UNALIGNED)
fprintf(compiler->verbose, ".unal ");
else
- fprintf(compiler->verbose, ".al%d ", (8 << SLJIT_SIMD_GET_ALIGNMENT(type)));
+ fprintf(compiler->verbose, ".al%d ", (8 << SLJIT_SIMD_GET_ELEM2_SIZE(type)));
sljit_verbose_freg(compiler, freg);
fprintf(compiler->verbose, ", ");
@@ -2632,8 +2649,8 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_replicate(struct slj
{
#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD));
- CHECK_ARGUMENT((type & (sljit_s32)(0xff000fff - (SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST))) == 0);
- CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512);
+ CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK(0)) == 0);
+ CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type));
CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type));
CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0));
@@ -2679,11 +2696,11 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_lane_mov(struct slji
{
#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD));
- CHECK_ARGUMENT((type & (sljit_s32)(0xff000fff - (SLJIT_SIMD_STORE | SLJIT_SIMD_LANE_ZERO | SLJIT_SIMD_LANE_SIGNED | SLJIT_32 | SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST))) == 0);
+ CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK(SLJIT_SIMD_STORE | SLJIT_SIMD_LANE_ZERO | SLJIT_SIMD_LANE_SIGNED | SLJIT_32)) == 0);
CHECK_ARGUMENT((type & (SLJIT_SIMD_STORE | SLJIT_SIMD_LANE_ZERO)) != (SLJIT_SIMD_STORE | SLJIT_SIMD_LANE_ZERO));
CHECK_ARGUMENT((type & (SLJIT_SIMD_STORE | SLJIT_SIMD_LANE_SIGNED)) != SLJIT_SIMD_LANE_SIGNED);
CHECK_ARGUMENT(!(type & SLJIT_SIMD_FLOAT) || !(type & (SLJIT_SIMD_LANE_SIGNED | SLJIT_32)));
- CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512);
+ CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type));
CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type));
CHECK_ARGUMENT(!(type & SLJIT_32) || SLJIT_SIMD_GET_ELEM_SIZE(type) <= 2);
CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0));
@@ -2731,8 +2748,8 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_lane_replicate(struc
{
#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD));
- CHECK_ARGUMENT((type & (sljit_s32)(0xff000fff - (SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST))) == 0);
- CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512);
+ CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK(0)) == 0);
+ CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type));
CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type));
CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0));
CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src, 0));
@@ -2767,9 +2784,9 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_extend(struct sljit_
{
#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD));
- CHECK_ARGUMENT((type & (sljit_s32)(0xc0000fff - (SLJIT_SIMD_EXTEND_SIGNED | SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST))) == 0);
+ CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK2(SLJIT_SIMD_EXTEND_SIGNED)) == 0);
CHECK_ARGUMENT((type & (SLJIT_SIMD_EXTEND_SIGNED | SLJIT_SIMD_FLOAT)) != (SLJIT_SIMD_EXTEND_SIGNED | SLJIT_SIMD_FLOAT));
- CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512);
+ CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type));
CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM2_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type));
CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_ELEM2_SIZE(type));
CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0));
@@ -2807,8 +2824,8 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_sign(struct sljit_co
{
#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD));
- CHECK_ARGUMENT((type & (sljit_s32)(0xff000fff - (SLJIT_SIMD_FLOAT | SLJIT_SIMD_TEST | SLJIT_32))) == SLJIT_SIMD_STORE);
- CHECK_ARGUMENT((type & 0x3f000) >= SLJIT_SIMD_REG_64 && (type & 0x3f000) <= SLJIT_SIMD_REG_512);
+ CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK(SLJIT_32)) == SLJIT_SIMD_STORE);
+ CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type));
CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) < SLJIT_SIMD_GET_REG_SIZE(type));
CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(freg, 0));
FUNCTION_CHECK_DST(dst, dstw);
@@ -2837,6 +2854,44 @@ static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_sign(struct sljit_co
CHECK_RETURN_OK;
}
+static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
+ sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg)
+{ /* Argument validation and verbose logging for sljit_emit_simd_op2(). */
+#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
+ CHECK_ARGUMENT(sljit_has_cpu_feature(SLJIT_HAS_SIMD));
+ CHECK_ARGUMENT((type & SLJIT_SIMD_TYPE_MASK(0)) >= SLJIT_SIMD_OP2_AND && (type & SLJIT_SIMD_TYPE_MASK(0)) <= SLJIT_SIMD_OP2_XOR);
+ CHECK_ARGUMENT(SLJIT_SIMD_CHECK_REG(type));
+ CHECK_ARGUMENT(SLJIT_SIMD_GET_ELEM_SIZE(type) <= SLJIT_SIMD_GET_REG_SIZE(type));
+ CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(dst_freg, 0));
+ CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src1_freg, 0));
+ CHECK_ARGUMENT(FUNCTION_CHECK_IS_FREG(src2_freg, 0));
+#endif
+#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
+ if (SLJIT_UNLIKELY(!!compiler->verbose)) {
+ if (type & SLJIT_SIMD_TEST)
+ CHECK_RETURN_OK; /* SLJIT_SIMD_TEST probes emit nothing, so nothing is logged */
+ if (sljit_emit_simd_op2(compiler, type | SLJIT_SIMD_TEST, dst_freg, src1_freg, src2_freg) == SLJIT_ERR_UNSUPPORTED) {
+ fprintf(compiler->verbose, " # simd_op2: unsupported form, no instructions are emitted\n");
+ CHECK_RETURN_OK;
+ }
+ /* Output format: simd_<op>.<register bits>.[f]<element bits> dst, src1, src2 */
+ fprintf(compiler->verbose, " simd_%s.%d.%s%d ",
+ simd_op2_names[SLJIT_SIMD_GET_OPCODE(type) - 1],
+ (8 << SLJIT_SIMD_GET_REG_SIZE(type)),
+ (type & SLJIT_SIMD_FLOAT) ? "f" : "",
+ (8 << SLJIT_SIMD_GET_ELEM_SIZE(type)));
+
+ sljit_verbose_freg(compiler, dst_freg);
+ fprintf(compiler->verbose, ", ");
+ sljit_verbose_freg(compiler, src1_freg);
+ fprintf(compiler->verbose, ", ");
+ sljit_verbose_freg(compiler, src2_freg);
+ fprintf(compiler->verbose, "\n");
+ }
+#endif
+ CHECK_RETURN_OK;
+}
+
static SLJIT_INLINE CHECK_RETURN_TYPE check_sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
{
/* Any offset is allowed. */
@@ -3232,7 +3287,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler
#endif /* !SLJIT_CONFIG_ARM_64 && !SLJIT_CONFIG_PPC */
#if !(defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
- && !(defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM)
+ && !(defined SLJIT_CONFIG_ARM && SLJIT_CONFIG_ARM) \
+ && !(defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type,
sljit_s32 freg,
@@ -3325,6 +3381,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c
return SLJIT_ERR_UNSUPPORTED;
}
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
+ sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg)
+{
+ CHECK_ERROR();
+ CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg));
+ SLJIT_UNUSED_ARG(compiler);
+ SLJIT_UNUSED_ARG(type);
+ SLJIT_UNUSED_ARG(dst_freg);
+ SLJIT_UNUSED_ARG(src1_freg);
+ SLJIT_UNUSED_ARG(src2_freg);
+ /* Generic stub: after the argument check it always reports SLJIT_ERR_UNSUPPORTED. */
+ return SLJIT_ERR_UNSUPPORTED;
+}
+
-#endif /* !SLJIT_CONFIG_X86 && !SLJIT_CONFIG_ARM */
+#endif /* !SLJIT_CONFIG_X86 && !SLJIT_CONFIG_ARM && !SLJIT_CONFIG_S390X */
#if !(defined(SLJIT_CONFIG_X86) && SLJIT_CONFIG_X86) \
diff --git a/src/sljit/sljitLir.h b/src/sljit/sljitLir.h
index 95d59e48..2ba6683c 100644
--- a/src/sljit/sljitLir.h
+++ b/src/sljit/sljitLir.h
@@ -1869,6 +1869,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fmem_update(struct sljit_compiler
#define SLJIT_SIMD_ELEM_64 (3 << 18)
/* Element size is 128 bit long */
#define SLJIT_SIMD_ELEM_128 (4 << 18)
+/* Element size is 256 bit long */
+#define SLJIT_SIMD_ELEM_256 (5 << 18)
/* The following options are used by sljit_emit_simd_mov(). */
@@ -2039,6 +2041,32 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c
sljit_s32 freg,
sljit_s32 dst, sljit_sw dstw);
+/* The following options are used by sljit_emit_simd_op2(). */
+
+/* Binary 'and' operation */
+#define SLJIT_SIMD_OP2_AND 0x000001
+/* Binary 'or' operation */
+#define SLJIT_SIMD_OP2_OR 0x000002
+/* Binary 'xor' operation */
+#define SLJIT_SIMD_OP2_XOR 0x000003
+
+/* Perform simd operations using simd registers.
+
+ If the operation is not supported, it returns with
+ SLJIT_ERR_UNSUPPORTED. If SLJIT_SIMD_TEST is passed,
+ it does not emit any instructions.
+
+ type must be a combination of SLJIT_SIMD_* and SLJIT_SIMD_OP2_*
+ options except SLJIT_SIMD_LOAD and SLJIT_SIMD_STORE
+ dst_freg is the destination register of the operation
+ src1_freg is the first source register of the operation
+ src2_freg is the second source register of the operation
+
+ Flags: - (does not modify flags) */
+
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
+ sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg);
+
/* The sljit_emit_atomic_load and sljit_emit_atomic_store operation pair
can perform an atomic read-modify-write operation. First, an unsigned
value must be loaded from memory using sljit_emit_atomic_load. Then,
diff --git a/src/sljit/sljitNativeARM_32.c b/src/sljit/sljitNativeARM_32.c
index 55c62b79..d44616d8 100644
--- a/src/sljit/sljitNativeARM_32.c
+++ b/src/sljit/sljitNativeARM_32.c
@@ -49,8 +49,8 @@ typedef sljit_u32 sljit_ins;
#define TMP_REG2 (SLJIT_NUMBER_OF_REGISTERS + 3)
#define TMP_PC (SLJIT_NUMBER_OF_REGISTERS + 4)
-#define TMP_FREG1 ((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 1)
-#define TMP_FREG2 ((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 2)
+#define TMP_FREG1 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
+#define TMP_FREG2 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)
/* In ARM instruction words.
Cache lines are usually 32 byte aligned. */
@@ -67,18 +67,20 @@ static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
0, 0, 1, 2, 3, 11, 10, 9, 8, 7, 6, 5, 4, 13, 12, 14, 15
};
-static const sljit_u8 freg_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3] = {
+static const sljit_u8 freg_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) << 1) + 1] = {
0,
0, 1, 2, 3, 4, 5, 15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6,
0, 1, 2, 3, 4, 5, 15, 14, 13, 12, 11, 10, 9, 8,
- 6, 7
+ 7, 6
};
-static const sljit_u8 freg_ebit_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3] = {
+static const sljit_u8 freg_ebit_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) << 1) + 1] = {
0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 0, 0
+ 1, 1
};
#define RM(rm) ((sljit_ins)reg_map[rm])
@@ -144,6 +146,7 @@ static const sljit_u8 freg_ebit_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3]
#define UXTH 0xe6ff0070
#define VABS_F32 0xeeb00ac0
#define VADD_F32 0xee300a00
+#define VAND 0xf2000110
#define VCMP_F32 0xeeb40a40
#define VCVT_F32_S32 0xeeb80ac0
#define VCVT_F32_U32 0xeeb80a40
@@ -152,6 +155,7 @@ static const sljit_u8 freg_ebit_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3]
#define VDIV_F32 0xee800a00
#define VDUP 0xee800b10
#define VDUP_s 0xf3b00c00
+#define VEOR 0xf3000110
#define VLD1 0xf4200000
#define VLD1_r 0xf4a00c00
#define VLD1_s 0xf4a00000
@@ -190,11 +194,12 @@ static sljit_s32 function_check_is_freg(struct sljit_compiler *compiler, sljit_s
if (compiler->scratches == -1)
return 0;
- if (is_32 && fr >= (SLJIT_FS0 + SLJIT_FR0) && fr <= (SLJIT_FS0 + SLJIT_FS0))
- fr -= SLJIT_FS0;
+ if (is_32 && fr >= SLJIT_F64_SECOND(SLJIT_FR0))
+ fr -= SLJIT_F64_SECOND(0);
return (fr >= SLJIT_FR0 && fr < (SLJIT_FR0 + compiler->fscratches))
- || (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0);
+ || (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0)
+ || (fr >= SLJIT_TMP_FREGISTER_BASE && fr < (SLJIT_TMP_FREGISTER_BASE + SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS));
}
#endif /* SLJIT_ARGUMENT_CHECKS */
@@ -3776,7 +3781,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co
{
sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
- sljit_s32 alignment = SLJIT_SIMD_GET_ALIGNMENT(type);
+ sljit_s32 alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type);
sljit_ins ins;
CHECK_ERROR();
@@ -3806,7 +3811,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co
ins = VD(freg) | VN(srcdst) | VM(srcdst);
if (reg_size == 4)
- ins |= (1 << 6);
+ ins |= (sljit_ins)1 << 6;
return push_inst(compiler, VORR | ins);
}
@@ -3973,7 +3978,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil
ins = (sljit_ins)(elem_size << 6);
if (reg_size == 4)
- ins |= 1 << 5;
+ ins |= (sljit_ins)1 << 5;
return push_inst(compiler, VLD1_r | ins | VD(freg) | RN(src) | 0xf);
}
@@ -3983,7 +3988,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil
ins = ((sljit_ins)freg_ebit_map[src] << (16 + 2 + 1)) | ((sljit_ins)1 << (16 + 2));
if (reg_size == 4)
- ins |= 1 << 6;
+ ins |= (sljit_ins)1 << 6;
return push_inst(compiler, VDUP_s | ins | VD(freg) | (sljit_ins)freg_map[src]);
}
@@ -4018,7 +4023,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil
}
if (reg_size == 4)
- ins |= 1 << 21;
+ ins |= (sljit_ins)1 << 21;
return push_inst(compiler, VDUP | ins | VN(freg) | RD(src));
}
@@ -4064,8 +4069,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile
}
if (srcdst == freg || (elem_size == 3 && srcdst == (freg + SLJIT_QUAD_OTHER_HALF(freg)))) {
- FAIL_IF(push_inst(compiler, VORR | ins | VD(TMP_FREG1) | VN(freg) | VM(freg)));
- srcdst = TMP_FREG1;
+ FAIL_IF(push_inst(compiler, VORR | ins | VD(TMP_FREG2) | VN(freg) | VM(freg)));
+ srcdst = TMP_FREG2;
srcdstw = 0;
}
}
@@ -4184,7 +4189,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c
ins = ((((sljit_ins)src_lane_index << 1) | 1) << (16 + elem_size));
if (reg_size == 4)
- ins |= 1 << 6;
+ ins |= (sljit_ins)1 << 6;
return push_inst(compiler, VDUP_s | ins | VD(freg) | VM(src));
}
@@ -4226,7 +4231,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler
src = simd_get_quad_reg_index(src);
if (!(type & SLJIT_SIMD_FLOAT)) {
- dst_reg = (reg_size == 4) ? freg : TMP_FREG1;
+ dst_reg = (reg_size == 4) ? freg : TMP_FREG2;
do {
FAIL_IF(push_inst(compiler, VSHLL | ((type & SLJIT_SIMD_EXTEND_SIGNED) ? 0 : (1 << 24))
@@ -4234,8 +4239,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler
src = dst_reg;
} while (++elem_size < elem2_size);
- if (dst_reg == TMP_FREG1)
- return push_inst(compiler, VORR | VD(freg) | VN(TMP_FREG1) | VM(TMP_FREG1));
+ if (dst_reg == TMP_FREG2)
+ return push_inst(compiler, VORR | VD(freg) | VN(TMP_FREG2) | VM(TMP_FREG2));
return SLJIT_SUCCESS;
}
@@ -4298,30 +4303,30 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c
if (reg_size == 4) {
freg = simd_get_quad_reg_index(freg);
- ins |= (1 << 6);
+ ins |= (sljit_ins)1 << 6;
}
- SLJIT_ASSERT((freg_map[TMP_FREG1] & 0x1) == 0);
- FAIL_IF(push_inst(compiler, ins | VD(TMP_FREG1) | VM(freg)));
+ SLJIT_ASSERT((freg_map[TMP_FREG2] & 0x1) == 0);
+ FAIL_IF(push_inst(compiler, ins | VD(TMP_FREG2) | VM(freg)));
if (reg_size == 4 && elem_size > 0)
- FAIL_IF(push_inst(compiler, VMOVN | ((sljit_ins)(elem_size - 1) << 18) | VD(TMP_FREG1) | VM(TMP_FREG1)));
+ FAIL_IF(push_inst(compiler, VMOVN | ((sljit_ins)(elem_size - 1) << 18) | VD(TMP_FREG2) | VM(TMP_FREG2)));
ins = (reg_size == 4 && elem_size == 0) ? (1 << 6) : 0;
while (imms >= 0x100) {
- FAIL_IF(push_inst(compiler, VSRA | (1 << 24) | ins | ((imms & 0xff) << 16) | VD(TMP_FREG1) | VM(TMP_FREG1)));
+ FAIL_IF(push_inst(compiler, VSRA | (1 << 24) | ins | ((imms & 0xff) << 16) | VD(TMP_FREG2) | VM(TMP_FREG2)));
imms >>= 8;
}
- FAIL_IF(push_inst(compiler, VSRA | (1 << 24) | ins | (1 << 7) | (imms << 16) | VD(TMP_FREG1) | VM(TMP_FREG1)));
+ FAIL_IF(push_inst(compiler, VSRA | (1 << 24) | ins | (1 << 7) | (imms << 16) | VD(TMP_FREG2) | VM(TMP_FREG2)));
dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
- FAIL_IF(push_inst(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RD(dst_r) | VN(TMP_FREG1)));
+ FAIL_IF(push_inst(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RD(dst_r) | VN(TMP_FREG2)));
if (reg_size == 4 && elem_size == 0) {
- SLJIT_ASSERT(freg_map[TMP_FREG1] + 1 == freg_map[TMP_FREG2]);
- FAIL_IF(push_inst(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RD(TMP_REG2) | VN(TMP_FREG2)));
+ SLJIT_ASSERT(freg_map[TMP_FREG2] + 1 == freg_map[TMP_FREG1]);
+ FAIL_IF(push_inst(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RD(TMP_REG2) | VN(TMP_FREG1)));
FAIL_IF(push_inst(compiler, ORR | RD(dst_r) | RN(dst_r) | RM(TMP_REG2) | (0x8 << 7)));
}
@@ -4331,6 +4336,47 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c
return SLJIT_SUCCESS;
}
+/* Emits a single bitwise SIMD instruction (VAND, VORR or VEOR, chosen by
+   the opcode encoded in type) with dst_freg as the destination and
+   src1_freg / src2_freg as the two source operands.
+
+   Returns SLJIT_ERR_UNSUPPORTED when reg_size is neither 3 nor 4, when a
+   float type is combined with an elem_size other than 2 or 3, or when the
+   opcode is not one of AND / OR / XOR. If SLJIT_SIMD_TEST is set in type,
+   SLJIT_SUCCESS is returned after these checks and nothing is emitted.
+   Otherwise the result of push_inst is returned. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	sljit_ins ins = 0;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg));
+
+	if (reg_size != 3 && reg_size != 4)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	switch (SLJIT_SIMD_GET_OPCODE(type)) {
+	case SLJIT_SIMD_OP2_AND:
+		ins = VAND;
+		break;
+	case SLJIT_SIMD_OP2_OR:
+		ins = VORR;
+		break;
+	case SLJIT_SIMD_OP2_XOR:
+		ins = VEOR;
+		break;
+	default:
+		/* Without this, an unknown opcode would leave ins at 0 and
+		   either pass the SLJIT_SIMD_TEST query or emit an instruction
+		   word that contains only register fields. */
+		return SLJIT_ERR_UNSUPPORTED;
+	}
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	if (reg_size == 4) {
+		/* reg_size 4 operands are translated to quad register indexes,
+		   and bit 6 selects the quad form of the instruction. */
+		dst_freg = simd_get_quad_reg_index(dst_freg);
+		src1_freg = simd_get_quad_reg_index(src1_freg);
+		src2_freg = simd_get_quad_reg_index(src2_freg);
+		ins |= (sljit_ins)1 << 6;
+	}
+
+	return push_inst(compiler, ins | VD(dst_freg) | VN(src1_freg) | VM(src2_freg));
+}
+
#undef FPU_LOAD
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op,
diff --git a/src/sljit/sljitNativeARM_64.c b/src/sljit/sljitNativeARM_64.c
index 8194670f..b268582f 100644
--- a/src/sljit/sljitNativeARM_64.c
+++ b/src/sljit/sljitNativeARM_64.c
@@ -67,121 +67,123 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
/* Instruction forms */
/* --------------------------------------------------------------------- */
-#define ADC 0x9a000000
-#define ADD 0x8b000000
-#define ADDE 0x8b200000
-#define ADDI 0x91000000
-#define AND 0x8a000000
-#define ANDI 0x92000000
-#define ASRV 0x9ac02800
-#define B 0x14000000
-#define B_CC 0x54000000
-#define BL 0x94000000
-#define BLR 0xd63f0000
-#define BR 0xd61f0000
-#define BRK 0xd4200000
-#define CAS 0xc8a07c00
-#define CASB 0x08a07c00
-#define CASH 0x48a07c00
-#define CBZ 0xb4000000
-#define CCMPI 0xfa400800
-#define CLZ 0xdac01000
-#define CSEL 0x9a800000
-#define CSINC 0x9a800400
-#define DUP_e 0x0e000400
-#define DUP_g 0x0e000c00
-#define EOR 0xca000000
-#define EORI 0xd2000000
-#define EXTR 0x93c00000
-#define FABS 0x1e60c000
-#define FADD 0x1e602800
-#define FCMP 0x1e602000
-#define FCSEL 0x1e600c00
-#define FCVT 0x1e224000
-#define FCVTL 0x0e217800
-#define FCVTZS 0x9e780000
-#define FDIV 0x1e601800
-#define FMOV 0x1e604000
-#define FMOV_R 0x9e660000
-#define FMOV_I 0x1e601000
-#define FMUL 0x1e600800
-#define FNEG 0x1e614000
-#define FSUB 0x1e603800
-#define INS 0x4e001c00
-#define INS_e 0x6e000400
-#define LD1 0x0c407000
-#define LD1_s 0x0d400000
-#define LD1R 0x0d40c000
-#define LDRI 0xf9400000
-#define LDRI_F64 0xfd400000
-#define LDRI_POST 0xf8400400
-#define LDP 0xa9400000
-#define LDP_F64 0x6d400000
-#define LDP_POST 0xa8c00000
-#define LDR_PRE 0xf8400c00
-#define LDXR 0xc85f7c00
-#define LDXRB 0x085f7c00
-#define LDXRH 0x485f7c00
-#define LSLV 0x9ac02000
-#define LSRV 0x9ac02400
-#define MADD 0x9b000000
-#define MOVI 0x0f000400
-#define MOVK 0xf2800000
-#define MOVN 0x92800000
-#define MOVZ 0xd2800000
-#define NOP 0xd503201f
-#define ORN 0xaa200000
-#define ORR 0xaa000000
-#define ORR_v 0x0ea01c00
-#define ORRI 0xb2000000
-#define RBIT 0xdac00000
-#define RET 0xd65f0000
-#define REV 0xdac00c00
-#define REV16 0xdac00400
-#define RORV 0x9ac02c00
-#define SBC 0xda000000
-#define SBFM 0x93400000
-#define SCVTF 0x9e620000
-#define SDIV 0x9ac00c00
-#define SMADDL 0x9b200000
-#define SMOV 0x0e002c00
-#define SMULH 0x9b403c00
-#define SSHLL 0x0f00a400
-#define ST1 0x0c007000
-#define ST1_s 0x0d000000
-#define STP 0xa9000000
-#define STP_F64 0x6d000000
-#define STP_PRE 0xa9800000
-#define STRB 0x38206800
-#define STRBI 0x39000000
-#define STRI 0xf9000000
-#define STRI_F64 0xfd000000
-#define STR_FI 0x3d000000
-#define STR_FR 0x3c206800
-#define STUR_FI 0x3c000000
-#define STURBI 0x38000000
-#define STXR 0xc8007c00
-#define STXRB 0x8007c00
-#define STXRH 0x48007c00
-#define SUB 0xcb000000
-#define SUBI 0xd1000000
-#define SUBS 0xeb000000
-#define TBZ 0x36000000
-#define UBFM 0xd3400000
-#define UCVTF 0x9e630000
-#define UDIV 0x9ac00800
-#define UMOV 0x0e003c00
-#define UMULH 0x9bc03c00
-#define USHLL 0x2f00a400
-#define USHR 0x2f000400
-#define USRA 0x2f001400
-#define XTN 0x0e212800
-
-#define CSET (CSINC | RM(TMP_ZERO) | RN(TMP_ZERO))
-#define LDR (STRI | (1 << 22))
-#define LDRB (STRBI | (1 << 22))
-#define LDRH (LDRB | (1 << 30))
-#define MOV (ORR | RN(TMP_ZERO))
+#define ADC 0x9a000000
+#define ADD 0x8b000000
+#define ADDE 0x8b200000
+#define ADDI 0x91000000
+#define AND 0x8a000000
+#define ANDI 0x92000000
+#define AND_v 0x0e201c00
+#define ASRV 0x9ac02800
+#define B 0x14000000
+#define B_CC 0x54000000
+#define BL 0x94000000
+#define BLR 0xd63f0000
+#define BR 0xd61f0000
+#define BRK 0xd4200000
+#define CAS 0xc8a07c00
+#define CASB 0x08a07c00
+#define CASH 0x48a07c00
+#define CBZ 0xb4000000
+#define CCMPI 0xfa400800
+#define CLZ 0xdac01000
+#define CSEL 0x9a800000
+#define CSINC 0x9a800400
+#define DUP_e 0x0e000400
+#define DUP_g 0x0e000c00
+#define EOR 0xca000000
+#define EOR_v 0x2e201c00
+#define EORI 0xd2000000
+#define EXTR 0x93c00000
+#define FABS 0x1e60c000
+#define FADD 0x1e602800
+#define FCMP 0x1e602000
+#define FCSEL 0x1e600c00
+#define FCVT 0x1e224000
+#define FCVTL 0x0e217800
+#define FCVTZS 0x9e780000
+#define FDIV 0x1e601800
+#define FMOV 0x1e604000
+#define FMOV_R 0x9e660000
+#define FMOV_I 0x1e601000
+#define FMUL 0x1e600800
+#define FNEG 0x1e614000
+#define FSUB 0x1e603800
+#define INS 0x4e001c00
+#define INS_e 0x6e000400
+#define LD1 0x0c407000
+#define LD1_s 0x0d400000
+#define LD1R 0x0d40c000
+#define LDRI 0xf9400000
+#define LDRI_F64 0xfd400000
+#define LDRI_POST 0xf8400400
+#define LDP 0xa9400000
+#define LDP_F64 0x6d400000
+#define LDP_POST 0xa8c00000
+#define LDR_PRE 0xf8400c00
+#define LDXR 0xc85f7c00
+#define LDXRB 0x085f7c00
+#define LDXRH 0x485f7c00
+#define LSLV 0x9ac02000
+#define LSRV 0x9ac02400
+#define MADD 0x9b000000
+#define MOVI 0x0f000400
+#define MOVK 0xf2800000
+#define MOVN 0x92800000
+#define MOVZ 0xd2800000
+#define NOP 0xd503201f
+#define ORN 0xaa200000
+#define ORR 0xaa000000
+#define ORR_v 0x0ea01c00
+#define ORRI 0xb2000000
+#define RBIT 0xdac00000
+#define RET 0xd65f0000
+#define REV 0xdac00c00
+#define REV16 0xdac00400
+#define RORV 0x9ac02c00
+#define SBC 0xda000000
+#define SBFM 0x93400000
+#define SCVTF 0x9e620000
+#define SDIV 0x9ac00c00
+#define SMADDL 0x9b200000
+#define SMOV 0x0e002c00
+#define SMULH 0x9b403c00
+#define SSHLL 0x0f00a400
+#define ST1 0x0c007000
+#define ST1_s 0x0d000000
+#define STP 0xa9000000
+#define STP_F64 0x6d000000
+#define STP_PRE 0xa9800000
+#define STRB 0x38206800
+#define STRBI 0x39000000
+#define STRI 0xf9000000
+#define STRI_F64 0xfd000000
+#define STR_FI 0x3d000000
+#define STR_FR 0x3c206800
+#define STUR_FI 0x3c000000
+#define STURBI 0x38000000
+#define STXR 0xc8007c00
+#define STXRB 0x8007c00
+#define STXRH 0x48007c00
+#define SUB 0xcb000000
+#define SUBI 0xd1000000
+#define SUBS 0xeb000000
+#define TBZ 0x36000000
+#define UBFM 0xd3400000
+#define UCVTF 0x9e630000
+#define UDIV 0x9ac00800
+#define UMOV 0x0e003c00
+#define UMULH 0x9bc03c00
+#define USHLL 0x2f00a400
+#define USHR 0x2f000400
+#define USRA 0x2f001400
+#define XTN 0x0e212800
+
+#define CSET (CSINC | RM(TMP_ZERO) | RN(TMP_ZERO))
+#define LDR (STRI | (1 << 22))
+#define LDRB (STRBI | (1 << 22))
+#define LDRH (LDRB | (1 << 30))
+#define MOV (ORR | RN(TMP_ZERO))
static sljit_s32 push_inst(struct sljit_compiler *compiler, sljit_ins ins)
{
@@ -3044,6 +3046,43 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c
return SLJIT_SUCCESS;
}
+/* Emits a single bitwise SIMD instruction (AND_v, ORR_v or EOR_v, chosen
+   by the opcode encoded in type) with dst_freg as the destination and
+   src1_freg / src2_freg as the two source operands.
+
+   Returns SLJIT_ERR_UNSUPPORTED when reg_size is neither 3 nor 4, when a
+   float type is combined with an elem_size other than 2 or 3, or when the
+   opcode is not one of AND / OR / XOR. If SLJIT_SIMD_TEST is set in type,
+   SLJIT_SUCCESS is returned after these checks and nothing is emitted.
+   Otherwise the result of push_inst is returned. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	sljit_ins ins = 0;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg));
+
+	if (reg_size != 3 && reg_size != 4)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	switch (SLJIT_SIMD_GET_OPCODE(type)) {
+	case SLJIT_SIMD_OP2_AND:
+		ins = AND_v;
+		break;
+	case SLJIT_SIMD_OP2_OR:
+		ins = ORR_v;
+		break;
+	case SLJIT_SIMD_OP2_XOR:
+		ins = EOR_v;
+		break;
+	default:
+		/* Without this, an unknown opcode would leave ins at 0 and
+		   either pass the SLJIT_SIMD_TEST query or emit an instruction
+		   word that contains only register fields. */
+		return SLJIT_ERR_UNSUPPORTED;
+	}
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	/* Bit 30 selects the full-width form used for reg_size 4. */
+	if (reg_size == 4)
+		ins |= (sljit_ins)1 << 30;
+
+	return push_inst(compiler, ins | VD(dst_freg) | VN(src1_freg) | VM(src2_freg));
+}
+
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op,
sljit_s32 dst_reg,
sljit_s32 mem_reg)
diff --git a/src/sljit/sljitNativeARM_T2_32.c b/src/sljit/sljitNativeARM_T2_32.c
index f914eb70..c27c50dd 100644
--- a/src/sljit/sljitNativeARM_T2_32.c
+++ b/src/sljit/sljitNativeARM_T2_32.c
@@ -41,26 +41,28 @@ typedef sljit_u32 sljit_ins;
#define TMP_REG2 (SLJIT_NUMBER_OF_REGISTERS + 3)
#define TMP_PC (SLJIT_NUMBER_OF_REGISTERS + 4)
-#define TMP_FREG1 ((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 1)
-#define TMP_FREG2 ((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 2)
+#define TMP_FREG1 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
+#define TMP_FREG2 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)
/* See sljit_emit_enter and sljit_emit_op0 if you want to change them. */
static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
0, 0, 1, 2, 3, 11, 10, 9, 8, 7, 6, 5, 4, 13, 12, 14, 15
};
-static const sljit_u8 freg_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3] = {
+static const sljit_u8 freg_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) << 1) + 1] = {
0,
0, 1, 2, 3, 4, 5, 15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6,
0, 1, 2, 3, 4, 5, 15, 14, 13, 12, 11, 10, 9, 8,
- 6, 7
+ 7, 6
};
-static const sljit_u8 freg_ebit_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3] = {
+static const sljit_u8 freg_ebit_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2) << 1) + 1] = {
0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 0, 0
+ 1, 1
};
#define COPY_BITS(src, from, to, bits) \
@@ -217,6 +219,7 @@ static const sljit_u8 freg_ebit_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3]
#define UXTH_W 0xfa1ff080
#define VABS_F32 0xeeb00ac0
#define VADD_F32 0xee300a00
+#define VAND 0xef000110
#define VCMP_F32 0xeeb40a40
#define VCVT_F32_S32 0xeeb80ac0
#define VCVT_F32_U32 0xeeb80a40
@@ -225,6 +228,7 @@ static const sljit_u8 freg_ebit_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3]
#define VDIV_F32 0xee800a00
#define VDUP 0xee800b10
#define VDUP_s 0xffb00c00
+#define VEOR 0xff000110
#define VLD1 0xf9200000
#define VLD1_r 0xf9a00c00
#define VLD1_s 0xf9a00000
@@ -256,11 +260,12 @@ static sljit_s32 function_check_is_freg(struct sljit_compiler *compiler, sljit_s
if (compiler->scratches == -1)
return 0;
- if (is_32 && fr >= (SLJIT_FS0 + SLJIT_FR0) && fr <= (SLJIT_FS0 + SLJIT_FS0))
- fr -= SLJIT_FS0;
+ if (is_32 && fr >= SLJIT_F64_SECOND(SLJIT_FR0))
+ fr -= SLJIT_F64_SECOND(0);
return (fr >= SLJIT_FR0 && fr < (SLJIT_FR0 + compiler->fscratches))
- || (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0);
+ || (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0)
+ || (fr >= SLJIT_TMP_FREGISTER_BASE && fr < (SLJIT_TMP_FREGISTER_BASE + SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS));
}
#endif /* SLJIT_ARGUMENT_CHECKS */
@@ -3426,7 +3431,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co
{
sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
- sljit_s32 alignment = SLJIT_SIMD_GET_ALIGNMENT(type);
+ sljit_s32 alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type);
sljit_ins ins;
CHECK_ERROR();
@@ -3456,7 +3461,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co
ins = VD4(freg) | VN4(srcdst) | VM4(srcdst);
if (reg_size == 4)
- ins |= (1 << 6);
+ ins |= (sljit_ins)1 << 6;
return push_inst32(compiler, VORR | ins);
}
@@ -3633,7 +3638,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil
ins = ((sljit_ins)freg_ebit_map[src] << (16 + 2 + 1)) | ((sljit_ins)1 << (16 + 2));
if (reg_size == 4)
- ins |= 1 << 6;
+ ins |= (sljit_ins)1 << 6;
return push_inst32(compiler, VDUP_s | ins | VD4(freg) | (sljit_ins)freg_map[src]);
}
@@ -3668,7 +3673,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil
}
if (reg_size == 4)
- ins |= 1 << 21;
+ ins |= (sljit_ins)1 << 21;
return push_inst32(compiler, VDUP | ins | VN4(freg) | RT4(src));
}
@@ -3714,8 +3719,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile
}
if (srcdst == freg || (elem_size == 3 && srcdst == (freg + SLJIT_QUAD_OTHER_HALF(freg)))) {
- FAIL_IF(push_inst32(compiler, VORR | ins | VD4(TMP_FREG1) | VN4(freg) | VM4(freg)));
- srcdst = TMP_FREG1;
+ FAIL_IF(push_inst32(compiler, VORR | ins | VD4(TMP_FREG2) | VN4(freg) | VM4(freg)));
+ srcdst = TMP_FREG2;
srcdstw = 0;
}
}
@@ -3834,7 +3839,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c
ins = ((((sljit_ins)src_lane_index << 1) | 1) << (16 + elem_size));
if (reg_size == 4)
- ins |= 1 << 6;
+ ins |= (sljit_ins)1 << 6;
return push_inst32(compiler, VDUP_s | ins | VD4(freg) | VM4(src));
}
@@ -3876,7 +3881,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler
src = simd_get_quad_reg_index(src);
if (!(type & SLJIT_SIMD_FLOAT)) {
- dst_reg = (reg_size == 4) ? freg : TMP_FREG1;
+ dst_reg = (reg_size == 4) ? freg : TMP_FREG2;
do {
FAIL_IF(push_inst32(compiler, VSHLL | ((type & SLJIT_SIMD_EXTEND_SIGNED) ? 0 : (1 << 28))
@@ -3884,8 +3889,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler
src = dst_reg;
} while (++elem_size < elem2_size);
- if (dst_reg == TMP_FREG1)
- return push_inst32(compiler, VORR | VD4(freg) | VN4(TMP_FREG1) | VM4(TMP_FREG1));
+ if (dst_reg == TMP_FREG2)
+ return push_inst32(compiler, VORR | VD4(freg) | VN4(TMP_FREG2) | VM4(TMP_FREG2));
return SLJIT_SUCCESS;
}
@@ -3948,30 +3953,30 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c
if (reg_size == 4) {
freg = simd_get_quad_reg_index(freg);
- ins |= (1 << 6);
+ ins |= (sljit_ins)1 << 6;
}
- SLJIT_ASSERT((freg_map[TMP_FREG1] & 0x1) == 0);
- FAIL_IF(push_inst32(compiler, ins | VD4(TMP_FREG1) | VM4(freg)));
+ SLJIT_ASSERT((freg_map[TMP_FREG2] & 0x1) == 0);
+ FAIL_IF(push_inst32(compiler, ins | VD4(TMP_FREG2) | VM4(freg)));
if (reg_size == 4 && elem_size > 0)
- FAIL_IF(push_inst32(compiler, VMOVN | ((sljit_ins)(elem_size - 1) << 18) | VD4(TMP_FREG1) | VM4(TMP_FREG1)));
+ FAIL_IF(push_inst32(compiler, VMOVN | ((sljit_ins)(elem_size - 1) << 18) | VD4(TMP_FREG2) | VM4(TMP_FREG2)));
ins = (reg_size == 4 && elem_size == 0) ? (1 << 6) : 0;
while (imms >= 0x100) {
- FAIL_IF(push_inst32(compiler, VSRA | (1 << 28) | ins | ((imms & 0xff) << 16) | VD4(TMP_FREG1) | VM4(TMP_FREG1)));
+ FAIL_IF(push_inst32(compiler, VSRA | (1 << 28) | ins | ((imms & 0xff) << 16) | VD4(TMP_FREG2) | VM4(TMP_FREG2)));
imms >>= 8;
}
- FAIL_IF(push_inst32(compiler, VSRA | (1 << 28) | ins | (1 << 7) | (imms << 16) | VD4(TMP_FREG1) | VM4(TMP_FREG1)));
+ FAIL_IF(push_inst32(compiler, VSRA | (1 << 28) | ins | (1 << 7) | (imms << 16) | VD4(TMP_FREG2) | VM4(TMP_FREG2)));
dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
- FAIL_IF(push_inst32(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RT4(dst_r) | VN4(TMP_FREG1)));
+ FAIL_IF(push_inst32(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RT4(dst_r) | VN4(TMP_FREG2)));
if (reg_size == 4 && elem_size == 0) {
- SLJIT_ASSERT(freg_map[TMP_FREG1] + 1 == freg_map[TMP_FREG2]);
- FAIL_IF(push_inst32(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RT4(TMP_REG2) | VN4(TMP_FREG2)));
+ SLJIT_ASSERT(freg_map[TMP_FREG2] + 1 == freg_map[TMP_FREG1]);
+ FAIL_IF(push_inst32(compiler, VMOV_s | (1 << 20) | (1 << 23) | (0x2 << 21) | RT4(TMP_REG2)| VN4(TMP_FREG1)));
FAIL_IF(push_inst32(compiler, ORR_W | RD4(dst_r) | RN4(dst_r) | RM4(TMP_REG2) | (0x2 << 12)));
}
@@ -3981,6 +3986,47 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c
return SLJIT_SUCCESS;
}
+/* Emits a single 32 bit wide bitwise SIMD instruction (VAND, VORR or
+   VEOR, chosen by the opcode encoded in type) with dst_freg as the
+   destination and src1_freg / src2_freg as the two source operands.
+
+   Returns SLJIT_ERR_UNSUPPORTED when reg_size is neither 3 nor 4, when a
+   float type is combined with an elem_size other than 2 or 3, or when the
+   opcode is not one of AND / OR / XOR. If SLJIT_SIMD_TEST is set in type,
+   SLJIT_SUCCESS is returned after these checks and nothing is emitted.
+   Otherwise the result of push_inst32 is returned. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	sljit_ins ins = 0;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg));
+
+	if (reg_size != 3 && reg_size != 4)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	switch (SLJIT_SIMD_GET_OPCODE(type)) {
+	case SLJIT_SIMD_OP2_AND:
+		ins = VAND;
+		break;
+	case SLJIT_SIMD_OP2_OR:
+		ins = VORR;
+		break;
+	case SLJIT_SIMD_OP2_XOR:
+		ins = VEOR;
+		break;
+	default:
+		/* Without this, an unknown opcode would leave ins at 0 and
+		   either pass the SLJIT_SIMD_TEST query or emit an instruction
+		   word that contains only register fields. */
+		return SLJIT_ERR_UNSUPPORTED;
+	}
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	if (reg_size == 4) {
+		/* reg_size 4 operands are translated to quad register indexes,
+		   and bit 6 selects the quad form of the instruction. */
+		dst_freg = simd_get_quad_reg_index(dst_freg);
+		src1_freg = simd_get_quad_reg_index(src1_freg);
+		src2_freg = simd_get_quad_reg_index(src2_freg);
+		ins |= (sljit_ins)1 << 6;
+	}
+
+	return push_inst32(compiler, ins | VD4(dst_freg) | VN4(src1_freg) | VM4(src2_freg));
+}
+
#undef FPU_LOAD
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op,
diff --git a/src/sljit/sljitNativeLOONGARCH_64.c b/src/sljit/sljitNativeLOONGARCH_64.c
index fe6fad47..dbd76054 100644
--- a/src/sljit/sljitNativeLOONGARCH_64.c
+++ b/src/sljit/sljitNativeLOONGARCH_64.c
@@ -26,9 +26,7 @@
SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
{
-#if (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64)
return "LOONGARCH" SLJIT_CPUINFO;
-#endif /* SLJIT_CONFIG_LOONGARCH_64 */
}
typedef sljit_u32 sljit_ins;
@@ -61,7 +59,7 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3] = {
/*
LoongArch instructions are 32 bits wide, belonging to 9 basic instruction formats (and variants of them):
-| Format name | Composition |
+| Format name | Composition |
| 2R | Opcode + Rj + Rd |
| 3R | Opcode + Rk + Rj + Rd |
| 4R | Opcode + Ra + Rk + Rj + Rd |
@@ -338,10 +336,10 @@ lower parts in the instruction word, denoted by the “L” and “H” suffixes
#define INST(inst, type) ((sljit_ins)((type & SLJIT_32) ? inst##_W : inst##_D))
/* LoongArch CPUCFG register for feature detection */
-#define LOONGARCH_CFG2 0x02
+#define LOONGARCH_CFG2 0x02
#define LOONGARCH_FEATURE_LAMCAS (1 << 28)
-sljit_u32 cpu_feature_list = 0;
+static sljit_u32 cpu_feature_list = 0;
static SLJIT_INLINE sljit_u32 get_cpu_features(void)
{
@@ -1610,9 +1608,10 @@ static sljit_s32 emit_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s3
compiler->cache_argw = 0;
}
- if (dst == TMP_REG2) {
+ if (dst == 0) {
SLJIT_ASSERT(HAS_FLAGS(op));
flags |= UNUSED_DEST;
+ dst = TMP_REG2;
}
else if (FAST_IS_REG(dst)) {
dst_r = dst;
@@ -1891,7 +1890,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compil
CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w));
SLJIT_SKIP_CHECKS(compiler);
- return sljit_emit_op2(compiler, op, TMP_REG2, 0, src1, src1w, src2, src2w);
+ return sljit_emit_op2(compiler, op, 0, 0, src1, src1w, src2, src2w);
}
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
@@ -2392,14 +2391,14 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compil
}
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2r(struct sljit_compiler *compiler, sljit_s32 op,
- sljit_s32 dst,
+ sljit_s32 dst_freg,
sljit_s32 src1, sljit_sw src1w,
sljit_s32 src2, sljit_sw src2w)
{
sljit_s32 reg;
CHECK_ERROR();
- CHECK(check_sljit_emit_fop2r(compiler, op, dst, src1, src1w, src2, src2w));
+ CHECK(check_sljit_emit_fop2r(compiler, op, dst_freg, src1, src1w, src2, src2w));
ADJUST_LOCAL_OFFSET(src1, src1w);
ADJUST_LOCAL_OFFSET(src2, src2w);
@@ -2409,12 +2408,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2r(struct sljit_compiler *compi
}
if (src1 & SLJIT_MEM) {
- reg = (dst == src2) ? TMP_FREG1 : dst;
+ reg = (dst_freg == src2) ? TMP_FREG1 : dst_freg;
FAIL_IF(emit_op_mem2(compiler, FLOAT_DATA(op) | LOAD_DATA, reg, src1, src1w, 0, 0));
src1 = reg;
}
- return push_inst(compiler, FINST(FCOPYSIGN, op) | FRD(dst) | FRJ(src1) | FRK(src2));
+ return push_inst(compiler, FINST(FCOPYSIGN, op) | FRD(dst_freg) | FRJ(src1) | FRK(src2));
}
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fset32(struct sljit_compiler *compiler,
diff --git a/src/sljit/sljitNativeMIPS_common.c b/src/sljit/sljitNativeMIPS_common.c
index d80a75c2..807b3474 100644
--- a/src/sljit/sljitNativeMIPS_common.c
+++ b/src/sljit/sljitNativeMIPS_common.c
@@ -94,29 +94,26 @@ typedef sljit_u32 sljit_ins;
#define EQUAL_FLAG 3
#define OTHER_FLAG 1
-static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
- 0, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 23, 22, 21, 20, 19, 18, 17, 16, 29, 4, 25, 31
+static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 7] = {
+ 0, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 23, 22, 21, 20, 19, 18, 17, 16, 29, 4, 25, 31, 3, 1
};
-#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
+#define TMP_FREG1 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
+#define TMP_FREG2 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)
+#define TMP_FREG3 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3)
-#define TMP_FREG1 ((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 1)
-#define TMP_FREG2 ((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 2)
-#define TMP_FREG3 ((SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 3)
+#if (defined SLJIT_CONFIG_MIPS_32 && SLJIT_CONFIG_MIPS_32)
-static const sljit_u8 freg_map[(SLJIT_NUMBER_OF_FLOAT_REGISTERS << 1) + 4] = {
+static const sljit_u8 freg_map[((SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3) << 1) + 1] = {
0,
0, 14, 2, 4, 6, 8, 18, 30, 28, 26, 24, 22, 20,
- 1, 15, 3, 5, 7, 9, 19, 31, 29, 27, 25, 23, 21,
12, 10, 16,
+ 1, 15, 3, 5, 7, 9, 19, 31, 29, 27, 25, 23, 21,
+ 13, 11, 17
};
#else /* !SLJIT_CONFIG_MIPS_32 */
-#define TMP_FREG1 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
-#define TMP_FREG2 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2)
-#define TMP_FREG3 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 3)
-
static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 4] = {
0, 0, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 1, 2, 3, 4, 5, 6, 7, 8, 9, 31, 30, 29, 28, 27, 26, 25, 24, 12, 11, 10
};
@@ -381,11 +378,12 @@ static sljit_s32 function_check_is_freg(struct sljit_compiler *compiler, sljit_s
if (compiler->scratches == -1)
return 0;
- if (is_32 && fr >= (SLJIT_FS0 + SLJIT_FR0) && fr <= (SLJIT_FS0 + SLJIT_FS0))
- fr -= SLJIT_FS0;
+ if (is_32 && fr >= SLJIT_F64_SECOND(SLJIT_FR0))
+ fr -= SLJIT_F64_SECOND(0);
return (fr >= SLJIT_FR0 && fr < (SLJIT_FR0 + compiler->fscratches))
- || (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0);
+ || (fr > (SLJIT_FS0 - compiler->fsaveds) && fr <= SLJIT_FS0)
+ || (fr >= SLJIT_TMP_FREGISTER_BASE && fr < (SLJIT_TMP_FREGISTER_BASE + SLJIT_NUMBER_OF_TEMPORARY_FLOAT_REGISTERS));
}
#endif /* SLJIT_CONFIG_MIPS_32 && SLJIT_ARGUMENT_CHECKS */
@@ -2292,9 +2290,10 @@ static sljit_s32 emit_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s3
compiler->cache_argw = 0;
}
- if (dst == TMP_REG2) {
+ if (dst == 0) {
SLJIT_ASSERT(HAS_FLAGS(op));
flags |= UNUSED_DEST;
+ dst = TMP_REG2;
}
else if (FAST_IS_REG(dst)) {
dst_r = dst;
@@ -2655,7 +2654,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compil
CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w));
SLJIT_SKIP_CHECKS(compiler);
- return sljit_emit_op2(compiler, op, TMP_REG2, 0, src1, src1w, src2, src2w);
+ return sljit_emit_op2(compiler, op, 0, 0, src1, src1w, src2, src2w);
}
#if (defined SLJIT_CONFIG_MIPS_64 && SLJIT_CONFIG_MIPS_64)
diff --git a/src/sljit/sljitNativeRISCV_32.c b/src/sljit/sljitNativeRISCV_32.c
index 68b2a349..396c956c 100644
--- a/src/sljit/sljitNativeRISCV_32.c
+++ b/src/sljit/sljitNativeRISCV_32.c
@@ -27,7 +27,6 @@
static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst_r, sljit_sw imm, sljit_s32 tmp_r)
{
SLJIT_UNUSED_ARG(tmp_r);
- SLJIT_ASSERT(dst_r != tmp_r);
if (imm <= SIMM_MAX && imm >= SIMM_MIN)
return push_inst(compiler, ADDI | RD(dst_r) | RS1(TMP_ZERO) | IMM_I(imm));
diff --git a/src/sljit/sljitNativeRISCV_64.c b/src/sljit/sljitNativeRISCV_64.c
index 18c2d59f..7fcf2c52 100644
--- a/src/sljit/sljitNativeRISCV_64.c
+++ b/src/sljit/sljitNativeRISCV_64.c
@@ -28,8 +28,6 @@ static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst_r
{
sljit_sw high;
- SLJIT_ASSERT(dst_r != tmp_r);
-
if (imm <= SIMM_MAX && imm >= SIMM_MIN)
return push_inst(compiler, ADDI | RD(dst_r) | RS1(TMP_ZERO) | IMM_I(imm));
@@ -81,6 +79,8 @@ static sljit_s32 load_immediate(struct sljit_compiler *compiler, sljit_s32 dst_r
return SLJIT_SUCCESS;
}
+ SLJIT_ASSERT(dst_r != tmp_r);
+
high = imm >> 32;
imm = (sljit_s32)imm;
diff --git a/src/sljit/sljitNativeRISCV_common.c b/src/sljit/sljitNativeRISCV_common.c
index 3b54ab92..64bd411d 100644
--- a/src/sljit/sljitNativeRISCV_common.c
+++ b/src/sljit/sljitNativeRISCV_common.c
@@ -348,13 +348,12 @@ static SLJIT_INLINE void load_addr_to_reg(void *dst, sljit_u32 reg)
if ((addr & 0x80000000l) != 0)
high = ~high;
- if ((high & 0x800) != 0)
- high += 0x1000;
-
if (flags & PATCH_ABS52) {
SLJIT_ASSERT(addr <= S52_MAX);
inst[0] = LUI | RD(TMP_REG3) | (sljit_ins)(high << 12);
} else {
+ if ((high & 0x800) != 0)
+ high += 0x1000;
inst[0] = LUI | RD(TMP_REG3) | (sljit_ins)(high & ~0xfff);
inst[1] = ADDI | RD(TMP_REG3) | RS1(TMP_REG3) | IMM_I(high);
inst++;
@@ -940,7 +939,7 @@ static sljit_s32 getput_arg(struct sljit_compiler *compiler, sljit_s32 flags, sl
/* Since tmp can be the same as base or offset registers,
* these might be unavailable after modifying tmp. */
- if ((flags & MEM_MASK) <= GPR_REG && (flags & LOAD_DATA))
+ if ((flags & MEM_MASK) <= GPR_REG && (flags & LOAD_DATA) && reg == TMP_REG2)
tmp_r = reg;
if (SLJIT_UNLIKELY(arg & OFFS_REG_MASK)) {
@@ -1639,9 +1638,10 @@ static sljit_s32 emit_op(struct sljit_compiler *compiler, sljit_s32 op, sljit_s3
compiler->cache_argw = 0;
}
- if (dst == TMP_REG2) {
+ if (dst == 0) {
SLJIT_ASSERT(HAS_FLAGS(op));
flags |= UNUSED_DEST;
+ dst = TMP_REG2;
}
else if (FAST_IS_REG(dst)) {
dst_r = dst;
@@ -1938,7 +1938,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2u(struct sljit_compiler *compil
CHECK(check_sljit_emit_op2(compiler, op, 1, 0, 0, src1, src1w, src2, src2w));
SLJIT_SKIP_CHECKS(compiler);
- return sljit_emit_op2(compiler, op, TMP_REG2, 0, src1, src1w, src2, src2w);
+ return sljit_emit_op2(compiler, op, 0, 0, src1, src1w, src2, src2w);
}
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_shift_into(struct sljit_compiler *compiler, sljit_s32 op,
diff --git a/src/sljit/sljitNativeS390X.c b/src/sljit/sljitNativeS390X.c
index 97521b50..67516f9b 100644
--- a/src/sljit/sljitNativeS390X.c
+++ b/src/sljit/sljitNativeS390X.c
@@ -47,8 +47,8 @@ static const sljit_ins sljit_ins_const = (sljit_ins)1 << 48;
#define TMP_REG1 (SLJIT_NUMBER_OF_REGISTERS + 2)
#define TMP_REG2 (SLJIT_NUMBER_OF_REGISTERS + 3)
-static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
- 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0, 1
+static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
+ 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0, 1, 14
};
/* there are also a[2-15] available, but they are slower to access and
@@ -83,7 +83,7 @@ static const sljit_gpr r10 = 10; /* reg_map[9] */
static const sljit_gpr r11 = 11; /* reg_map[10] */
static const sljit_gpr r12 = 12; /* reg_map[11]: GOT */
static const sljit_gpr r13 = 13; /* reg_map[12]: Literal Pool pointer */
-static const sljit_gpr r14 = 14; /* reg_map[0]: return address and flag register */
+static const sljit_gpr r14 = 14; /* reg_map[0]: return address */
static const sljit_gpr r15 = 15; /* reg_map[SLJIT_NUMBER_OF_REGISTERS + 1]: stack pointer */
/* WARNING: r12 and r13 shouldn't be used as per ABI recommendation */
@@ -96,20 +96,16 @@ static const sljit_gpr r15 = 15; /* reg_map[SLJIT_NUMBER_OF_REGISTERS + 1]: stac
#define tmp0 r0
#define tmp1 r1
-/* TODO(carenas): flags should move to a different register so that
- * link register doesn't need to change
- */
-
/* When reg cannot be unused. */
#define IS_GPR_REG(reg) ((reg > 0) && (reg) <= SLJIT_SP)
/* Link register. */
static const sljit_gpr link_r = 14; /* r14 */
-#define TMP_FREG1 (0)
+#define TMP_FREG1 (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
-static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
- 1, 0, 2, 4, 6, 3, 5, 7, 15, 14, 13, 12, 11, 10, 9, 8,
+static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = {
+ 0, 0, 2, 4, 6, 3, 5, 7, 15, 14, 13, 12, 11, 10, 9, 8, 1
};
#define R0A(r) (r)
@@ -128,6 +124,8 @@ static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
#define F4(r) (R4A((sljit_ins)freg_map[r]))
#define F12(r) (R12A((sljit_ins)freg_map[r]))
#define F20(r) (R20A((sljit_ins)freg_map[r]))
+#define F28(r) (R28A((sljit_ins)freg_map[r]))
+#define F32(r) (R32A((sljit_ins)freg_map[r]))
#define F36(r) (R36A((sljit_ins)freg_map[r]))
struct sljit_s390x_const {
@@ -898,23 +896,17 @@ static sljit_s32 push_load_imm_inst(struct sljit_compiler *compiler, sljit_gpr t
if (((sljit_uw)v & ~(sljit_uw)0xffff000000000000) == 0)
return push_inst(compiler, llihh(target, (sljit_u16)(v >> 48)));
- /* 6 byte instructions (requires extended immediate facility) */
- if (have_eimm()) {
- if (is_s32(v))
- return push_inst(compiler, lgfi(target, (sljit_s32)v));
+ if (is_s32(v))
+ return push_inst(compiler, lgfi(target, (sljit_s32)v));
- if (((sljit_uw)v >> 32) == 0)
- return push_inst(compiler, llilf(target, (sljit_u32)v));
+ if (((sljit_uw)v >> 32) == 0)
+ return push_inst(compiler, llilf(target, (sljit_u32)v));
- if (((sljit_uw)v << 32) == 0)
- return push_inst(compiler, llihf(target, (sljit_u32)((sljit_uw)v >> 32)));
-
- FAIL_IF(push_inst(compiler, llilf(target, (sljit_u32)v)));
- return push_inst(compiler, iihf(target, (sljit_u32)(v >> 32)));
- }
+ if (((sljit_uw)v << 32) == 0)
+ return push_inst(compiler, llihf(target, (sljit_u32)((sljit_uw)v >> 32)));
- /* TODO(mundaym): instruction sequences that don't use extended immediates */
- abort();
+ FAIL_IF(push_inst(compiler, llilf(target, (sljit_u32)v)));
+ return push_inst(compiler, iihf(target, (sljit_u32)(v >> 32)));
}
struct addr {
@@ -1677,6 +1669,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
case SLJIT_HAS_PREFETCH:
case SLJIT_HAS_COPY_F32:
case SLJIT_HAS_COPY_F64:
+ case SLJIT_HAS_SIMD:
case SLJIT_HAS_ATOMIC:
return 1;
@@ -3955,6 +3948,446 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_mem(struct sljit_compiler *compile
return push_inst(compiler, ins | R36A(reg2) | disp_s20((sljit_s32)memw + SSIZE_OF(sw)));
}
+/* Emits a whole-vector move between freg and srcdst (register or memory).
+ * SLJIT_SIMD_STORE selects the freg -> srcdst direction. Only register size
+ * code 4 is handled, and float types need element size code 2 or 3; anything
+ * else yields SLJIT_ERR_UNSUPPORTED. With SLJIT_SIMD_TEST set, nothing is
+ * emitted and SLJIT_SUCCESS is returned. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 srcdst, sljit_sw srcdstw)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	/* The second element-size field of type is reused as the alignment. */
+	sljit_s32 align_code = SLJIT_SIMD_GET_ELEM2_SIZE(type);
+	struct addr mem;
+	sljit_ins fields;
+	sljit_ins opcode;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_mov(compiler, type, freg, srcdst, srcdstw));
+
+	ADJUST_LOCAL_OFFSET(srcdst, srcdstw);
+
+	if (reg_size != 4 || ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3)))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	if (srcdst & SLJIT_MEM) {
+		FAIL_IF(make_addr_bx(compiler, &mem, srcdst, srcdstw, tmp1));
+		fields = F36(freg) | R32A(mem.index) | R28A(mem.base) | disp_s20(mem.offset);
+
+		/* Alignment codes of 3, or 4 and above, are encoded at bit 12;
+		 * smaller codes leave the field zero. */
+		if (align_code >= 4)
+			fields |= 4 << 12;
+		else if (align_code == 3)
+			fields |= 3 << 12;
+
+		opcode = (type & SLJIT_SIMD_STORE) ? 0xe7000000000e /* vst */ : 0xe70000000006 /* vl */;
+		return push_inst(compiler, opcode | fields);
+	}
+
+	/* Register to register: the store flag only swaps target and source. */
+	fields = (type & SLJIT_SIMD_STORE) ? (F36(srcdst) | F32(freg)) : (F36(freg) | F32(srcdst));
+
+	return push_inst(compiler, 0xe70000000056 /* vlr */ | fields);
+}
+
+/* Emits code that fills every lane of the vector register freg with a single
+ * value taken from memory, a register or an immediate (src / srcw).
+ * Returns SLJIT_ERR_UNSUPPORTED unless the register size code is 4 (float
+ * types additionally need an element size code of at least 2). With
+ * SLJIT_SIMD_TEST set, nothing is emitted and SLJIT_SUCCESS is returned.
+ * Any failure reported by an emit helper is propagated to the caller. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 src, sljit_sw srcw)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	struct addr addr;
+	sljit_gpr reg;
+	sljit_sw sign_ext;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_replicate(compiler, type, freg, src, srcw));
+
+	ADJUST_LOCAL_OFFSET(src, srcw);
+
+	if (reg_size != 4)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if ((type & SLJIT_SIMD_FLOAT) && elem_size < 2)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	/* Memory source: one load-and-replicate instruction is enough. */
+	if (src & SLJIT_MEM) {
+		FAIL_IF(make_addr_bx(compiler, &addr, src, srcw, tmp1));
+		return push_inst(compiler, 0xe70000000005 /* vlrep */ | F36(freg)
+			| R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset) | ((sljit_ins)elem_size << 12));
+	}
+
+	if (type & SLJIT_SIMD_FLOAT) {
+		/* Float immediate: vgbm is emitted with a zero mask; srcw is not consulted. */
+		if (src == SLJIT_IMM)
+			return push_inst(compiler, 0xe70000000044 /* vgbm */ | F36(freg));
+
+		return push_inst(compiler, 0xe7000000004d /* vrep */ | F36(freg) | F32(src) | ((sljit_ins)elem_size << 12));
+	}
+
+	if (src == SLJIT_IMM) {
+		/* 0x10000 is a sentinel: it is outside the sljit_s16 range, so it
+		 * means "the value does not fit a 16 bit signed immediate". */
+		sign_ext = 0x10000;
+
+		switch (elem_size) {
+		case 0:
+			srcw &= 0xff;
+			sign_ext = (sljit_s8)srcw;
+			break;
+		case 1:
+			srcw &= 0xffff;
+			sign_ext = (sljit_s16)srcw;
+			break;
+		case 2:
+			if ((sljit_s32)srcw == (sljit_s16)srcw) {
+				srcw &= 0xffff;
+				sign_ext = (sljit_s16)srcw;
+			} else
+				srcw &= 0xffffffff;
+			break;
+		default:
+			if (srcw == (sljit_s16)srcw) {
+				srcw &= 0xffff;
+				sign_ext = (sljit_s16)srcw;
+			}
+			break;
+		}
+
+		if (sign_ext != 0x10000) {
+			/* All-zero and all-one patterns are produced by vgbm. */
+			if (sign_ext == 0 || sign_ext == -1)
+				return push_inst(compiler, 0xe70000000044 /* vgbm */ | F36(freg)
+					| (sign_ext == 0 ? 0 : ((sljit_ins)0xffff << 16)));
+
+			return push_inst(compiler, 0xe70000000045 /* vrepi */ | F36(freg)
+				| ((sljit_ins)srcw << 16) | ((sljit_ins)elem_size << 12));
+		}
+
+		/* The immediate load can fail (it ends in push_inst), so check it
+		 * like every other emitted instruction. */
+		FAIL_IF(push_load_imm_inst(compiler, tmp0, srcw));
+		reg = tmp0;
+	} else
+		reg = gpr(src);
+
+	/* Insert the general register into the vector, then replicate it. */
+	FAIL_IF(push_inst(compiler, 0xe70000000022 /* vlvg */ | F36(freg) | R32A(reg) | ((sljit_ins)elem_size << 12)));
+	return push_inst(compiler, 0xe7000000004d /* vrep */ | F36(freg) | F32(freg) | ((sljit_ins)elem_size << 12));
+}
+
+/* Emits code that moves a single lane (lane_index) between the vector
+ * register freg and srcdst, which may be memory, a register or (for loads)
+ * an immediate. SLJIT_SIMD_STORE selects the freg -> srcdst direction,
+ * SLJIT_SIMD_LANE_ZERO clears freg before the lane is written, and
+ * SLJIT_SIMD_LANE_SIGNED sign-extends a lane stored into a general register.
+ * Returns SLJIT_ERR_UNSUPPORTED unless the register size code is 4 (float
+ * types additionally need an element size code of at least 2). With
+ * SLJIT_SIMD_TEST set, nothing is emitted and SLJIT_SUCCESS is returned.
+ * Any failure reported by an emit helper is propagated to the caller. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg, sljit_s32 lane_index,
+	sljit_s32 srcdst, sljit_sw srcdstw)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	struct addr addr;
+	sljit_gpr reg;
+	sljit_ins ins = 0;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_lane_mov(compiler, type, freg, lane_index, srcdst, srcdstw));
+
+	ADJUST_LOCAL_OFFSET(srcdst, srcdstw);
+
+	if (reg_size != 4)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if ((type & SLJIT_SIMD_FLOAT) && elem_size < 2)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	/* The memory operand is resolved first because the address computation
+	 * may itself emit instructions. */
+	if (srcdst & SLJIT_MEM) {
+		FAIL_IF(make_addr_bx(compiler, &addr, srcdst, srcdstw, tmp1));
+		ins = F36(freg) | R32A(addr.index) | R28A(addr.base) | disp_s20(addr.offset);
+	}
+
+	if (type & SLJIT_SIMD_LANE_ZERO) {
+		/* For this one lane index a single vllez both loads and zeroes. */
+		if ((srcdst & SLJIT_MEM) && lane_index == ((1 << (3 - elem_size)) - 1))
+			return push_inst(compiler, 0xe70000000004 /* vllez */ | ins | ((sljit_ins)elem_size << 12));
+
+		/* Source and target are the same register: save the source in
+		 * the temporary before the target is cleared. */
+		if ((type & SLJIT_SIMD_FLOAT) && freg == srcdst) {
+			FAIL_IF(push_inst(compiler, 0xe70000000056 /* vlr */ | F36(TMP_FREG1) | F32(freg)));
+			srcdst = TMP_FREG1;
+			srcdstw = 0;
+		}
+
+		FAIL_IF(push_inst(compiler, 0xe70000000044 /* vgbm */ | F36(freg)));
+	}
+
+	if (srcdst & SLJIT_MEM) {
+		switch (elem_size) {
+		case 0:
+			ins |= 0xe70000000000 /* vleb */;
+			break;
+		case 1:
+			ins |= 0xe70000000001 /* vleh */;
+			break;
+		case 2:
+			ins |= 0xe70000000003 /* vlef */;
+			break;
+		default:
+			ins |= 0xe70000000002 /* vleg */;
+			break;
+		}
+
+		/* Convert to vsteb - vsteg */
+		if (type & SLJIT_SIMD_STORE)
+			ins |= 0x8;
+
+		return push_inst(compiler, ins | ((sljit_ins)lane_index << 12));
+	}
+
+	if (type & SLJIT_SIMD_FLOAT) {
+		if (type & SLJIT_SIMD_STORE)
+			return push_inst(compiler, 0xe7000000004d /* vrep */ | F36(srcdst) | F32(freg) | ((sljit_ins)lane_index << 16) | ((sljit_ins)elem_size << 12));
+
+		if (elem_size == 3) {
+			if (lane_index == 0)
+				ins = F32(srcdst) | F28(freg) | (1 << 12);
+			else
+				ins = F32(freg) | F28(srcdst);
+
+			return push_inst(compiler, 0xe70000000084 /* vpdi */ | F36(freg) | ins);
+		}
+
+		/* Remaining float case: copy the value through tmp0. */
+		FAIL_IF(push_inst(compiler, 0xe70000000021 /* vlgv */ | R36A(tmp0) | F32(srcdst) | ((sljit_ins)2 << 12)));
+		return push_inst(compiler, 0xe70000000022 /* vlvg */ | F36(freg) | R32A(tmp0) | ((sljit_ins)lane_index << 16) | ((sljit_ins)2 << 12));
+	}
+
+	if (srcdst == SLJIT_IMM) {
+		/* ins stays 0 when the immediate does not fit the 16 bit
+		 * immediate form and has to go through tmp0 instead. */
+		switch (elem_size) {
+		case 0:
+			ins = 0xe70000000040 /* vleib */;
+			srcdstw &= 0xff;
+			break;
+		case 1:
+			ins = 0xe70000000041 /* vleih */;
+			srcdstw &= 0xffff;
+			break;
+		case 2:
+			if ((sljit_s32)srcdstw == (sljit_s16)srcdstw) {
+				srcdstw &= 0xffff;
+				ins = 0xe70000000043 /* vleif */;
+			} else
+				srcdstw &= 0xffffffff;
+			break;
+		default:
+			if (srcdstw == (sljit_s16)srcdstw) {
+				srcdstw &= 0xffff;
+				ins = 0xe70000000042 /* vleig */;
+			}
+			break;
+		}
+
+		if (ins != 0)
+			return push_inst(compiler, ins | F36(freg) | ((sljit_ins)srcdstw << 16) | ((sljit_ins)lane_index << 12));
+
+		/* The immediate load can fail (it ends in push_inst), so check it
+		 * like every other emitted instruction. */
+		FAIL_IF(push_load_imm_inst(compiler, tmp0, srcdstw));
+		reg = tmp0;
+	} else
+		reg = gpr(srcdst);
+
+	ins = ((sljit_ins)lane_index << 16) | ((sljit_ins)elem_size << 12);
+
+	if (!(type & SLJIT_SIMD_STORE))
+		return push_inst(compiler, 0xe70000000022 /* vlvg */ | F36(freg) | R32A(reg) | ins);
+
+	FAIL_IF(push_inst(compiler, 0xe70000000021 /* vlgv */ | R36A(reg) | F32(freg) | ins));
+
+	/* A sign extension is only emitted on request and for element size
+	 * codes below 3. */
+	if (!(type & SLJIT_SIMD_LANE_SIGNED) || elem_size >= 3)
+		return SLJIT_SUCCESS;
+
+	switch (elem_size) {
+	case 0:
+		ins = 0xb9060000 /* lgbr */;
+		break;
+	case 1:
+		ins = 0xb9070000 /* lghr */;
+		break;
+	default:
+		ins = 0xb9140000 /* lgfr */;
+		break;
+	}
+
+	return push_inst(compiler, ins | R4A(reg) | R0A(reg));
+}
+
+/* Emits a single vrep that copies lane src_lane_index of the vector register
+ * src into every lane of freg. Returns SLJIT_ERR_UNSUPPORTED unless the
+ * register size code is 4 (float types additionally need an element size
+ * code of at least 2). With SLJIT_SIMD_TEST set, nothing is emitted and
+ * SLJIT_SUCCESS is returned. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 src, sljit_s32 src_lane_index)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	sljit_ins vrep;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_lane_replicate(compiler, type, freg, src, src_lane_index));
+
+	if (reg_size != 4 || ((type & SLJIT_SIMD_FLOAT) && elem_size < 2))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	/* Target and source registers, then lane index and element size. */
+	vrep = 0xe7000000004d /* vrep */ | F36(freg) | F32(src);
+	vrep |= ((sljit_ins)src_lane_index << 16) | ((sljit_ins)elem_size << 12);
+
+	return push_inst(compiler, vrep);
+}
+
+/* Emits code that widens the elements of src (register or memory) from the
+ * element size code to the second element size code of type and leaves the
+ * result in freg. Returns SLJIT_ERR_UNSUPPORTED unless the register size code
+ * is 4 (float types additionally need an element size code of at least 2).
+ * With SLJIT_SIMD_TEST set, nothing is emitted and SLJIT_SUCCESS is
+ * returned. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 src, sljit_sw srcw)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	sljit_s32 elem2_size = SLJIT_SIMD_GET_ELEM2_SIZE(type);
+	sljit_s32 size_gap;
+	struct addr mem;
+	sljit_ins ins;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_extend(compiler, type, freg, src, srcw));
+
+	ADJUST_LOCAL_OFFSET(src, srcw);
+
+	if (reg_size != 4 || ((type & SLJIT_SIMD_FLOAT) && elem_size < 2))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	if (src & SLJIT_MEM) {
+		/* Load the narrow source data into freg first; the width of the
+		 * load depends on how many size steps the extension covers. */
+		FAIL_IF(make_addr_bx(compiler, &mem, src, srcw, tmp1));
+		ins = F36(freg) | R32A(mem.index) | R28A(mem.base) | disp_s20(mem.offset);
+
+		size_gap = elem2_size - elem_size;
+
+		if (size_gap == 1)
+			ins |= 0xe70000000002 /* vleg */;
+		else if (size_gap == 2)
+			ins |= 0xe70000000003 /* vlef */;
+		else
+			ins |= 0xe70000000001 /* vleh */;
+
+		FAIL_IF(push_inst(compiler, ins));
+		src = freg;
+	}
+
+	if (type & SLJIT_SIMD_FLOAT) {
+		FAIL_IF(push_inst(compiler, 0xe700000000d5 /* vuplh */ | F36(freg) | F32(src) | (2 << 12)));
+		FAIL_IF(push_inst(compiler, 0xe70000000030 /* vesl */ | F36(freg) | F32(freg) | (32 << 16) | (3 << 12)));
+		return push_inst(compiler, 0xe700000000c4 /* vfll */ | F36(freg) | F32(freg) | (2 << 12));
+	}
+
+	if (type & SLJIT_SIMD_EXTEND_SIGNED)
+		ins = 0xe700000000d7 /* vuph */;
+	else
+		ins = 0xe700000000d5 /* vuplh */;
+
+	ins |= F36(freg);
+
+	/* One unpack per size step; at least one is always emitted. After the
+	 * first step the data already lives in freg. */
+	for (;;) {
+		FAIL_IF(push_inst(compiler, ins | F32(src) | ((sljit_ins)elem_size << 12)));
+		src = freg;
+
+		if (++elem_size >= elem2_size)
+			break;
+	}
+
+	return SLJIT_SUCCESS;
+}
+
+/* Emits code that gathers bits of the vector register freg into an integer
+ * and writes it to dst (register or memory): a constant selected by the
+ * element size is loaded into TMP_FREG1, vbperm combines it with freg, and
+ * vlgv moves the result to a general register.
+ * Returns SLJIT_ERR_UNSUPPORTED unless the register size code is 4 (float
+ * types additionally need an element size code of at least 2). With
+ * SLJIT_SIMD_TEST set, nothing is emitted and SLJIT_SUCCESS is returned.
+ * Any failure reported by an emit helper is propagated to the caller. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 freg,
+	sljit_s32 dst, sljit_sw dstw)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	sljit_gpr dst_r;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_sign(compiler, type, freg, dst, dstw));
+
+	ADJUST_LOCAL_OFFSET(dst, dstw);
+
+	if (reg_size != 4)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if ((type & SLJIT_SIMD_FLOAT) && elem_size < 2)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	/* Every immediate load is checked: push_load_imm_inst ends in
+	 * push_inst and can fail like any other emitted instruction. */
+	switch (elem_size) {
+	case 0:
+		/* Byte elements need a full two-register constant (vlvgp). */
+		FAIL_IF(push_load_imm_inst(compiler, tmp0, (sljit_sw)0x4048505860687078));
+		FAIL_IF(push_load_imm_inst(compiler, tmp1, (sljit_sw)0x0008101820283038));
+		FAIL_IF(push_inst(compiler, 0xe70000000062 /* vlvgp */ | F36(TMP_FREG1) | R32A(tmp1) | R28A(tmp0)));
+		break;
+	case 1:
+		FAIL_IF(push_load_imm_inst(compiler, tmp0, (sljit_sw)0x0010203040506070));
+		break;
+	case 2:
+		FAIL_IF(push_load_imm_inst(compiler, tmp0, (sljit_sw)0x8080808000204060));
+		break;
+	default:
+		FAIL_IF(push_load_imm_inst(compiler, tmp0, (sljit_sw)0x8080808080800040));
+		break;
+	}
+
+	/* Wider elements: the single constant in tmp0 is inserted with vlvg. */
+	if (elem_size != 0)
+		FAIL_IF(push_inst(compiler, 0xe70000000022 /* vlvg */ | F36(TMP_FREG1) | R32A(tmp0) | (1 << 16) | (3 << 12)));
+
+	FAIL_IF(push_inst(compiler, 0xe70000000085 /* vbperm */ | F36(TMP_FREG1) | F32(freg) | F28(TMP_FREG1)));
+
+	/* Extract straight into dst when it is a register, otherwise via tmp0. */
+	dst_r = FAST_IS_REG(dst) ? gpr(dst) : tmp0;
+	FAIL_IF(push_inst(compiler, 0xe70000000021 /* vlgv */ | R36A(dst_r) | F32(TMP_FREG1)
+		| (elem_size == 0 ? ((3 << 16) | (1 << 12)) : (7 << 16))));
+
+	if (dst_r == tmp0)
+		return store_word(compiler, tmp0, dst, dstw, type & SLJIT_32);
+
+	return SLJIT_SUCCESS;
+}
+
+/* Emits a bitwise vector operation: dst_freg = src1_freg OP src2_freg, where
+ * OP is the AND, OR or XOR opcode encoded in type.
+ * Returns SLJIT_ERR_UNSUPPORTED unless the register size code is 4, for float
+ * types whose element size code is not 2 or 3, and for any other opcode.
+ * With SLJIT_SIMD_TEST set, nothing is emitted and SLJIT_SUCCESS is returned
+ * once the request is known to be supported. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
+	sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg)
+{
+	sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+	sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+	sljit_ins ins = 0;
+
+	CHECK_ERROR();
+	CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg));
+
+	if (reg_size != 4)
+		return SLJIT_ERR_UNSUPPORTED;
+
+	if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3))
+		return SLJIT_ERR_UNSUPPORTED;
+
+	switch (SLJIT_SIMD_GET_OPCODE(type)) {
+	case SLJIT_SIMD_OP2_AND:
+		ins = 0xe70000000068 /* vn */;
+		break;
+	case SLJIT_SIMD_OP2_OR:
+		ins = 0xe7000000006a /* vo */;
+		break;
+	case SLJIT_SIMD_OP2_XOR:
+		ins = 0xe7000000006d /* vx */;
+		break;
+	default:
+		/* Without this, ins would stay 0 and a bogus instruction
+		 * would be emitted for an unknown opcode. */
+		return SLJIT_ERR_UNSUPPORTED;
+	}
+
+	/* The test is done after the opcode is validated, so a test query
+	 * reports the same outcome as a real emission would. */
+	if (type & SLJIT_SIMD_TEST)
+		return SLJIT_SUCCESS;
+
+	return push_inst(compiler, ins | F36(dst_freg) | F32(src1_freg) | F28(src2_freg));
+}
+
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op,
sljit_s32 dst_reg,
sljit_s32 mem_reg)
diff --git a/src/sljit/sljitNativeX86_32.c b/src/sljit/sljitNativeX86_32.c
index 1bba883c..ba4a1ebb 100644
--- a/src/sljit/sljitNativeX86_32.c
+++ b/src/sljit/sljitNativeX86_32.c
@@ -148,7 +148,7 @@ static sljit_u8* emit_x86_instruction(struct sljit_compiler *compiler, sljit_uw
else if (!(flags & EX86_SSE2_OP1))
*buf_ptr = U8(reg_map[a] << 3);
else
- *buf_ptr = U8(a << 3);
+ *buf_ptr = U8(freg_map[a] << 3);
} else {
if (a == SLJIT_IMM) {
if (imma == 1)
@@ -161,7 +161,7 @@ static sljit_u8* emit_x86_instruction(struct sljit_compiler *compiler, sljit_uw
}
if (!(b & SLJIT_MEM)) {
- *buf_ptr = U8(*buf_ptr | MOD_REG | (!(flags & EX86_SSE2_OP2) ? reg_map[b] : b));
+ *buf_ptr = U8(*buf_ptr | MOD_REG | (!(flags & EX86_SSE2_OP2) ? reg_map[b] : freg_map[b]));
buf_ptr++;
} else if (b & REG_MASK) {
reg_map_b = reg_map[b & REG_MASK];
@@ -257,7 +257,7 @@ static sljit_s32 emit_vex_instruction(struct sljit_compiler *compiler, sljit_uw
if (op & VEX_256)
vex |= 0x4;
- vex = U8(vex | ((((op & VEX_SSE2_OPV) ? v : reg_map[v]) ^ 0xf) << 3));
+ vex = U8(vex | ((((op & VEX_SSE2_OPV) ? freg_map[v] : reg_map[v]) ^ 0xf) << 3));
size = op & ~(sljit_uw)0xff;
size |= (vex_m == 0) ? 3 : 4;
@@ -1351,7 +1351,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_uw(struct sljit_comp
FAIL_IF(!inst);
inst[1] |= SHR;
- FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_PREF_F2 | EX86_SSE2_OP1, dst_r, TMP_REG1, 0));
+ FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_PREF_F2 | EX86_SSE2_OP1, dst_r, TMP_REG1, 0));
inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
FAIL_IF(!inst);
@@ -1359,7 +1359,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_uw(struct sljit_comp
inst[0] = U8(get_jump_code(SLJIT_NOT_CARRY) - 0x10);
size1 = compiler->size;
- FAIL_IF(emit_groupf(compiler, ADDSD_x_xm, EX86_PREF_F2 | EX86_SSE2, dst_r, SLJIT_MEM0(), (sljit_sw)&f64_high_bit));
+ FAIL_IF(emit_groupf(compiler, ADDSD_x_xm | EX86_PREF_F2 | EX86_SSE2, dst_r, SLJIT_MEM0(), (sljit_sw)&f64_high_bit));
inst[1] = U8(compiler->size - size1);
@@ -1383,7 +1383,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_uw(struct sljit_comp
size1 = compiler->size;
- FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, 0));
+ FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, 0));
inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
FAIL_IF(!inst);
@@ -1413,8 +1413,8 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_uw(struct sljit_comp
BINARY_IMM32(OR, 1, TMP_REG1, 0);
jump_inst1[1] = U8(compiler->size - size1);
- FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, TMP_REG1, 0));
- FAIL_IF(emit_groupf(compiler, ADDSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, dst_r, 0));
+ FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, TMP_REG1, 0));
+ FAIL_IF(emit_groupf(compiler, ADDSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, dst_r, 0));
jump_inst2[1] = U8(compiler->size - size2);
@@ -1475,13 +1475,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fset64(struct sljit_compiler *comp
if (u.imm[0] == 0) {
if (u.imm[1] == 0)
- return emit_groupf(compiler, PXOR_x_xm, EX86_PREF_66 | EX86_SSE2, freg, freg, 0);
+ return emit_groupf(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2, freg, freg, 0);
EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, u.imm[1]);
} else
EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, u.imm[0]);
- FAIL_IF(emit_groupf(compiler, MOVD_x_rm, EX86_PREF_66 | EX86_SSE2_OP1, freg, TMP_REG1, 0));
+ FAIL_IF(emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, freg, TMP_REG1, 0));
if (u.imm[1] == 0)
return SLJIT_SUCCESS;
@@ -1504,11 +1504,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fset64(struct sljit_compiler *comp
EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, u.imm[1]);
if (cpu_feature_list & CPU_FEATURE_SSE41) {
- FAIL_IF(emit_groupf_ext(compiler, PINSRD_x_rm_i8, EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2_OP1, freg, TMP_REG1, 0));
+ FAIL_IF(emit_groupf_ext(compiler, PINSRD_x_rm_i8 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2_OP1, freg, TMP_REG1, 0));
return emit_byte(compiler, 1);
}
- FAIL_IF(emit_groupf(compiler, MOVD_x_rm, EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, TMP_REG1, 0));
+ FAIL_IF(emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, TMP_REG1, 0));
tmp_freg = TMP_FREG;
}
@@ -1545,15 +1545,15 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compi
CHECK_EXTRA_REGS(reg, regw, (void)0);
- FAIL_IF(emit_groupf(compiler, GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x,
- EX86_PREF_66 | EX86_SSE2_OP1, freg, reg, regw));
+ FAIL_IF(emit_groupf(compiler, (GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x)
+ | EX86_PREF_66 | EX86_SSE2_OP1, freg, reg, regw));
} else
reg2 = reg;
CHECK_EXTRA_REGS(reg2, reg2w, (void)0);
- FAIL_IF(emit_groupf_ext(compiler, GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? PINSRD_x_rm_i8 : PEXTRD_rm_x_i8,
- EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2_OP1, freg, reg2, reg2w));
+ FAIL_IF(emit_groupf_ext(compiler, (GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? PINSRD_x_rm_i8 : PEXTRD_rm_x_i8)
+ | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2_OP1, freg, reg2, reg2w));
return emit_byte(compiler, 1);
}
@@ -1570,8 +1570,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compi
CHECK_EXTRA_REGS(reg, regw, (void)0);
if (op & SLJIT_32)
- return emit_groupf(compiler, GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x,
- EX86_PREF_66 | EX86_SSE2_OP1, freg, reg, regw);
+ return emit_groupf(compiler, (GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x)
+ | EX86_PREF_66 | EX86_SSE2_OP1, freg, reg, regw);
if (op == SLJIT_COPY_FROM_F64) {
inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
@@ -1584,11 +1584,11 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compi
inst[3] = U8(MOD_REG | (TMP_FREG << 3) | freg);
inst[4] = 1;
} else if (reg != 0)
- FAIL_IF(emit_groupf(compiler, MOVD_x_rm, EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, reg, regw));
+ FAIL_IF(emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, reg, regw));
if (reg2 != 0)
- FAIL_IF(emit_groupf(compiler, GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x,
- EX86_PREF_66 | EX86_SSE2_OP1, freg, reg2, reg2w));
+ FAIL_IF(emit_groupf(compiler, (GET_OPCODE(op) == SLJIT_COPY_TO_F64 ? MOVD_x_rm : MOVD_rm_x)
+ | EX86_PREF_66 | EX86_SSE2_OP1, freg, reg2, reg2w));
if (GET_OPCODE(op) == SLJIT_COPY_TO_F64) {
inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
@@ -1599,7 +1599,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fcopy(struct sljit_compiler *compi
inst[1] = UNPCKLPS_x_xm;
inst[2] = U8(MOD_REG | (freg << 3) | (reg == 0 ? freg : TMP_FREG));
} else
- FAIL_IF(emit_groupf(compiler, MOVD_rm_x, EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, reg, regw));
+ FAIL_IF(emit_groupf(compiler, MOVD_rm_x | EX86_PREF_66 | EX86_SSE2_OP1, TMP_FREG, reg, regw));
return SLJIT_SUCCESS;
}
diff --git a/src/sljit/sljitNativeX86_64.c b/src/sljit/sljitNativeX86_64.c
index 39114c22..f313f3f0 100644
--- a/src/sljit/sljitNativeX86_64.c
+++ b/src/sljit/sljitNativeX86_64.c
@@ -611,12 +611,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_enter(struct sljit_compiler *compi
tmp = SLJIT_FS0 - fsaveds;
for (i = SLJIT_FS0; i > tmp; i--) {
- FAIL_IF(emit_groupf(compiler, MOVAPS_xm_x, EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset));
+ FAIL_IF(emit_groupf(compiler, MOVAPS_xm_x | EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset));
saved_float_regs_offset += 16;
}
for (i = fscratches; i >= SLJIT_FIRST_SAVED_FLOAT_REG; i--) {
- FAIL_IF(emit_groupf(compiler, MOVAPS_xm_x, EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset));
+ FAIL_IF(emit_groupf(compiler, MOVAPS_xm_x | EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset));
saved_float_regs_offset += 16;
}
}
@@ -674,12 +674,12 @@ static sljit_s32 emit_stack_frame_release(struct sljit_compiler *compiler, sljit
tmp = SLJIT_FS0 - fsaveds;
for (i = SLJIT_FS0; i > tmp; i--) {
- FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm, EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset));
+ FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm | EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset));
saved_float_regs_offset += 16;
}
for (i = fscratches; i >= SLJIT_FIRST_SAVED_FLOAT_REG; i--) {
- FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm, EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset));
+ FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm | EX86_SSE2, i, SLJIT_MEM1(SLJIT_SP), saved_float_regs_offset));
saved_float_regs_offset += 16;
}
@@ -1155,7 +1155,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_uw(struct sljit_comp
} else
FAIL_IF(emit_do_imm32(compiler, reg_map[TMP_REG1] <= 7 ? 0 : REX_B, U8(MOV_r_i32 | reg_lmap[TMP_REG1]), srcw));
- FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, TMP_REG1, 0));
+ FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, TMP_REG1, 0));
compiler->mode32 = 1;
@@ -1180,7 +1180,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_uw(struct sljit_comp
size1 = compiler->size;
compiler->mode32 = 0;
- FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, 0));
+ FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, 0));
inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
FAIL_IF(!inst);
@@ -1209,9 +1209,9 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_uw(struct sljit_comp
FAIL_IF(!inst);
inst[0] = OR_r_rm;
- FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, TMP_REG1, 0));
+ FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, TMP_REG1, 0));
compiler->mode32 = 1;
- FAIL_IF(emit_groupf(compiler, ADDSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, dst_r, 0));
+ FAIL_IF(emit_groupf(compiler, ADDSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, dst_r, 0));
jump_inst2[1] = U8(compiler->size - size2);
diff --git a/src/sljit/sljitNativeX86_common.c b/src/sljit/sljitNativeX86_common.c
index cc330d47..369d8285 100644
--- a/src/sljit/sljitNativeX86_common.c
+++ b/src/sljit/sljitNativeX86_common.c
@@ -61,17 +61,20 @@ SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
15 - R15
*/
-#define TMP_FREG (0)
+#define TMP_REG1 (SLJIT_NUMBER_OF_REGISTERS + 2)
+#define TMP_FREG (SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1)
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
-/* Last register + 1. */
-#define TMP_REG1 (SLJIT_NUMBER_OF_REGISTERS + 2)
static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 5, 7, 6, 4, 3
};
+static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 0
+};
+
#define CHECK_EXTRA_REGS(p, w, do) \
if (p >= SLJIT_R3 && p <= SLJIT_S3) { \
w = (2 * SSIZE_OF(sw)) + ((p) - SLJIT_R3) * SSIZE_OF(sw); \
@@ -81,8 +84,6 @@ static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
#else /* SLJIT_CONFIG_X86_32 */
-/* Last register + 1. */
-#define TMP_REG1 (SLJIT_NUMBER_OF_REGISTERS + 2)
#define TMP_REG2 (SLJIT_NUMBER_OF_REGISTERS + 3)
/* Note: r12 & 0x7 == 0b100, which decoded as SIB byte present
@@ -95,7 +96,7 @@ static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
};
/* low-map. reg_map & 0x7. */
static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
- 0, 0, 6, 7, 1, 0, 3, 2, 4, 5, 5, 6, 7, 3, 4, 2, 1
+ 0, 0, 6, 7, 1, 0, 3, 2, 4, 5, 5, 6, 7, 3, 4, 2, 1
};
#else
/* Args: rcx(=1), rdx(=2), r8, r9. Scratches: rax(=0), r10, r11 */
@@ -109,12 +110,12 @@ static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
#endif
/* Args: xmm0-xmm3 */
-static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
- 4, 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = {
+ 0, 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4
};
/* low-map. freg_map & 0x7. */
-static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
- 4, 0, 1, 2, 3, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
+static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 2] = {
+ 0, 0, 1, 2, 3, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 4
};
#define REX_W 0x48
@@ -266,8 +267,10 @@ static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
#define OR_EAX_i32 0x0d
#define OR_rm_r 0x09
#define OR_rm8_r8 0x08
+#define ORPD_x_xm 0x56
#define PACKSSWB_x_xm (/* GROUP_0F */ 0x63)
-#define PCMPEQB_x_xm 0x74
+#define PAND_x_xm 0xdb
+#define PCMPEQD_x_xm 0x76
#define PINSRB_x_rm_i8 0x20
#define PINSRW_x_rm_i8 0xc4
#define PINSRD_x_rm_i8 0x22
@@ -290,11 +293,14 @@ static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
#define POP_r 0x58
#define POP_rm 0x8f
#define POPF 0x9d
+#define POR_x_xm 0xeb
#define PREFETCH 0x18
#define PSHUFB_x_xm 0x00
#define PSHUFD_x_xm 0x70
#define PSHUFLW_x_xm 0x70
#define PSRLDQ_x 0x73
+#define PSLLD_x_i8 0x72
+#define PSLLQ_x_i8 0x73
#define PUSH_i32 0x68
#define PUSH_r 0x50
#define PUSH_rm (/* GROUP_FF */ 6 << 3)
@@ -426,9 +432,11 @@ static void execute_cpu_id(sljit_u32 info[4])
"movl %0, %%esi\n"
"movl (%%esi), %%eax\n"
"movl 8(%%esi), %%ecx\n"
+ "pushl %%ebx\n"
"cpuid\n"
"movl %%eax, (%%esi)\n"
"movl %%ebx, 4(%%esi)\n"
+ "popl %%ebx\n"
"movl %%ecx, 8(%%esi)\n"
"movl %%edx, 12(%%esi)\n"
#else /* !SLJIT_CONFIG_X86_32 */
@@ -444,7 +452,7 @@ static void execute_cpu_id(sljit_u32 info[4])
:
: "r" (info)
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
- : "memory", "eax", "ebx", "ecx", "edx", "esi"
+ : "memory", "eax", "ecx", "edx", "esi"
#else /* !SLJIT_CONFIG_X86_32 */
: "memory", "rax", "rbx", "rcx", "rdx", "rsi"
#endif /* SLJIT_CONFIG_X86_32 */
@@ -937,11 +945,11 @@ static sljit_s32 emit_mov(struct sljit_compiler *compiler,
FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
static sljit_s32 emit_groupf(struct sljit_compiler *compiler,
- sljit_u8 opcode, sljit_uw pref,
+ sljit_uw op,
sljit_s32 dst, sljit_s32 src, sljit_sw srcw);
static sljit_s32 emit_groupf_ext(struct sljit_compiler *compiler,
- sljit_u8 opcode, sljit_uw pref,
+ sljit_uw op,
sljit_s32 dst, sljit_s32 src, sljit_sw srcw);
static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
@@ -1375,7 +1383,7 @@ static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign,
#endif /* !SLJIT_CONFIG_X86_32 */
/* src can be memory addr or reg_map[src] < 4 on x86_32 architectures. */
- FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm8 : MOVZX_r_rm8, 0, dst_r, src, srcw));
+ FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm8 : MOVZX_r_rm8, dst_r, src, srcw));
}
if (dst & SLJIT_MEM) {
@@ -1444,7 +1452,7 @@ static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign,
if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
dst_r = src;
else
- FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm16 : MOVZX_r_rm16, 0, dst_r, src, srcw));
+ FAIL_IF(emit_groupf(compiler, sign ? MOVSX_r_rm16 : MOVZX_r_rm16, dst_r, src, srcw));
if (dst & SLJIT_MEM) {
inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
@@ -1506,14 +1514,14 @@ static sljit_s32 emit_clz_ctz(struct sljit_compiler *compiler, sljit_s32 is_clz,
dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
if (is_clz ? (cpu_feature_list & CPU_FEATURE_LZCNT) : (cpu_feature_list & CPU_FEATURE_TZCNT)) {
- FAIL_IF(emit_groupf(compiler, is_clz ? LZCNT_r_rm : TZCNT_r_rm, EX86_PREF_F3, dst_r, src, srcw));
+ FAIL_IF(emit_groupf(compiler, (is_clz ? LZCNT_r_rm : TZCNT_r_rm) | EX86_PREF_F3, dst_r, src, srcw));
if (dst & SLJIT_MEM)
EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
return SLJIT_SUCCESS;
}
- FAIL_IF(emit_groupf(compiler, is_clz ? BSR_r_rm : BSF_r_rm, 0, dst_r, src, srcw));
+ FAIL_IF(emit_groupf(compiler, is_clz ? BSR_r_rm : BSF_r_rm, dst_r, src, srcw));
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
max = is_clz ? (32 + 31) : 32;
@@ -1546,7 +1554,7 @@ static sljit_s32 emit_clz_ctz(struct sljit_compiler *compiler, sljit_s32 is_clz,
if (cpu_feature_list & CPU_FEATURE_CMOV) {
EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, max);
- FAIL_IF(emit_groupf(compiler, CMOVE_r_rm, 0, dst_r, TMP_REG2, 0));
+ FAIL_IF(emit_groupf(compiler, CMOVE_r_rm, dst_r, TMP_REG2, 0));
} else
FAIL_IF(emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, max));
@@ -1991,9 +1999,9 @@ static sljit_s32 emit_mul(struct sljit_compiler *compiler,
/* Register destination. */
if (dst_r == src1 && src2 != SLJIT_IMM) {
- FAIL_IF(emit_groupf(compiler, IMUL_r_rm, 0, dst_r, src2, src2w));
+ FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src2, src2w));
} else if (dst_r == src2 && src1 != SLJIT_IMM) {
- FAIL_IF(emit_groupf(compiler, IMUL_r_rm, 0, dst_r, src1, src1w));
+ FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src1, src1w));
} else if (src1 == SLJIT_IMM) {
if (src2 == SLJIT_IMM) {
EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
@@ -2032,7 +2040,7 @@ static sljit_s32 emit_mul(struct sljit_compiler *compiler,
if (dst_r != src2)
EMIT_MOV(compiler, dst_r, 0, src2, src2w);
FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
- FAIL_IF(emit_groupf(compiler, IMUL_r_rm, 0, dst_r, TMP_REG2, 0));
+ FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, TMP_REG2, 0));
}
#endif
}
@@ -2071,7 +2079,7 @@ static sljit_s32 emit_mul(struct sljit_compiler *compiler,
if (dst_r != src1)
EMIT_MOV(compiler, dst_r, 0, src1, src1w);
FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
- FAIL_IF(emit_groupf(compiler, IMUL_r_rm, 0, dst_r, TMP_REG2, 0));
+ FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, TMP_REG2, 0));
}
#endif
} else {
@@ -2079,7 +2087,7 @@ static sljit_s32 emit_mul(struct sljit_compiler *compiler,
if (ADDRESSING_DEPENDS_ON(src2, dst_r))
dst_r = TMP_REG1;
EMIT_MOV(compiler, dst_r, 0, src1, src1w);
- FAIL_IF(emit_groupf(compiler, IMUL_r_rm, 0, dst_r, src2, src2w));
+ FAIL_IF(emit_groupf(compiler, IMUL_r_rm, dst_r, src2, src2w));
}
if (dst & SLJIT_MEM)
@@ -2820,11 +2828,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 type, slji
if (type != SLJIT_FLOAT_REGISTER && type != SLJIT_SIMD_REG_128 && type != SLJIT_SIMD_REG_256 && type != SLJIT_SIMD_REG_512)
return -1;
-#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
- return reg;
-#else /* !SLJIT_CONFIG_X86_32 */
return freg_map[reg];
-#endif /* SLJIT_CONFIG_X86_32 */
}
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
@@ -2868,42 +2872,42 @@ static void init_compiler(void)
}
static sljit_s32 emit_groupf(struct sljit_compiler *compiler,
- sljit_u8 opcode, sljit_uw pref,
+ sljit_uw op,
sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
{
- sljit_u8 *inst = emit_x86_instruction(compiler, 2 | pref, dst, 0, src, srcw);
+ sljit_u8 *inst = emit_x86_instruction(compiler, 2 | (op & ~(sljit_uw)0xff), dst, 0, src, srcw);
FAIL_IF(!inst);
inst[0] = GROUP_0F;
- inst[1] = opcode;
+ inst[1] = op & 0xff;
return SLJIT_SUCCESS;
}
static sljit_s32 emit_groupf_ext(struct sljit_compiler *compiler,
- sljit_u8 opcode, sljit_uw pref,
+ sljit_uw op,
sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
{
sljit_u8 *inst;
- SLJIT_ASSERT((pref & EX86_SSE2) && ((pref & VEX_OP_0F38) || (pref & VEX_OP_0F3A)));
+ SLJIT_ASSERT((op & EX86_SSE2) && ((op & VEX_OP_0F38) || (op & VEX_OP_0F3A)));
- inst = emit_x86_instruction(compiler, 3 | (pref & ~(VEX_OP_0F38 | VEX_OP_0F3A)), dst, 0, src, srcw);
+ inst = emit_x86_instruction(compiler, 3 | (op & ~((sljit_uw)0xff | VEX_OP_0F38 | VEX_OP_0F3A)), dst, 0, src, srcw);
FAIL_IF(!inst);
inst[0] = GROUP_0F;
- inst[1] = U8((pref & VEX_OP_0F38) ? 0x38 : 0x3A);
- inst[2] = opcode;
+ inst[1] = U8((op & VEX_OP_0F38) ? 0x38 : 0x3A);
+ inst[2] = op & 0xff;
return SLJIT_SUCCESS;
}
static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
{
- return emit_groupf(compiler, MOVSD_x_xm, (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, dst, src, srcw);
+ return emit_groupf(compiler, MOVSD_x_xm | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, dst, src, srcw);
}
static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src)
{
- return emit_groupf(compiler, MOVSD_xm_x, (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, src, dst, dstw);
+ return emit_groupf(compiler, MOVSD_xm_x | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, src, dst, dstw);
}
static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
@@ -2920,7 +2924,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_comp
compiler->mode32 = 0;
#endif
- FAIL_IF(emit_groupf(compiler, CVTTSD2SI_r_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP2, dst_r, src, srcw));
+ FAIL_IF(emit_groupf(compiler, CVTTSD2SI_r_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP2, dst_r, src, srcw));
if (dst & SLJIT_MEM)
return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
@@ -2950,7 +2954,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_comp
srcw = 0;
}
- FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm, EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, srcw));
+ FAIL_IF(emit_groupf(compiler, CVTSI2SD_x_rm | EX86_SELECT_F2_F3(op) | EX86_SSE2_OP1, dst_r, src, srcw));
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
compiler->mode32 = 1;
@@ -2968,7 +2972,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compile
case SLJIT_ORDERED_EQUAL:
/* Also: SLJIT_UNORDERED_OR_NOT_EQUAL */
FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src1, src1w));
- FAIL_IF(emit_groupf(compiler, CMPS_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, TMP_FREG, src2, src2w));
+ FAIL_IF(emit_groupf(compiler, CMPS_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, TMP_FREG, src2, src2w));
/* EQ */
FAIL_IF(emit_byte(compiler, 0));
@@ -2986,7 +2990,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compile
src2 = TMP_FREG;
}
- return emit_groupf(compiler, UCOMISD_x_xm, EX86_SELECT_66(op) | EX86_SSE2, src2, src1, src1w);
+ return emit_groupf(compiler, UCOMISD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, src2, src1, src1w);
}
if (!FAST_IS_REG(src1)) {
@@ -2994,7 +2998,7 @@ static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compile
src1 = TMP_FREG;
}
- return emit_groupf(compiler, UCOMISD_x_xm, EX86_SELECT_66(op) | EX86_SSE2, src1, src2, src2w);
+ return emit_groupf(compiler, UCOMISD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, src1, src2, src2w);
}
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
@@ -3002,6 +3006,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compil
sljit_s32 src, sljit_sw srcw)
{
sljit_s32 dst_r;
+ sljit_u8 *inst;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
compiler->mode32 = 1;
@@ -3025,41 +3030,57 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compil
/* We overwrite the high bits of source. From SLJIT point of view,
this is not an issue.
Note: In SSE3, we could also use MOVDDUP and MOVSLDUP. */
- FAIL_IF(emit_groupf(compiler, UNPCKLPD_x_xm, ((op & SLJIT_32) ? EX86_PREF_66 : 0) | EX86_SSE2, src, src, 0));
+ FAIL_IF(emit_groupf(compiler, UNPCKLPD_x_xm | ((op & SLJIT_32) ? EX86_PREF_66 : 0) | EX86_SSE2, src, src, 0));
} else {
FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_32), TMP_FREG, src, srcw));
src = TMP_FREG;
}
- FAIL_IF(emit_groupf(compiler, CVTPD2PS_x_xm, ((op & SLJIT_32) ? EX86_PREF_66 : 0) | EX86_SSE2, dst_r, src, 0));
+ FAIL_IF(emit_groupf(compiler, CVTPD2PS_x_xm | ((op & SLJIT_32) ? EX86_PREF_66 : 0) | EX86_SSE2, dst_r, src, 0));
if (dst_r == TMP_FREG)
return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
return SLJIT_SUCCESS;
}
if (FAST_IS_REG(dst)) {
- dst_r = dst;
- if (dst != src)
- FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, dst_r, src, srcw));
- }
- else {
- dst_r = TMP_FREG;
- FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, dst_r, src, srcw));
+ dst_r = (dst == src) ? TMP_FREG : dst;
+
+ if (src & SLJIT_MEM)
+ FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src, srcw));
+
+ FAIL_IF(emit_groupf(compiler, PCMPEQD_x_xm | EX86_PREF_66 | EX86_SSE2, dst_r, dst_r, 0));
+
+ inst = emit_x86_instruction(compiler, 2 | EX86_PREF_66 | EX86_SSE2_OP2, 0, 0, dst_r, 0);
+ inst[0] = GROUP_0F;
+ /* Same as PSRLD_x / PSRLQ_x */
+ inst[1] = (op & SLJIT_32) ? PSLLD_x_i8 : PSLLQ_x_i8;
+
+ if (GET_OPCODE(op) == SLJIT_ABS_F64) {
+ inst[2] |= 2 << 3;
+ FAIL_IF(emit_byte(compiler, 1));
+ } else {
+ inst[2] |= 6 << 3;
+ FAIL_IF(emit_byte(compiler, ((op & SLJIT_32) ? 31 : 63)));
+ }
+
+ if (dst_r != TMP_FREG)
+ dst_r = (src & SLJIT_MEM) ? TMP_FREG : src;
+ return emit_groupf(compiler, (GET_OPCODE(op) == SLJIT_NEG_F64 ? XORPD_x_xm : ANDPD_x_xm) | EX86_SSE2, dst, dst_r, 0);
}
+ FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src, srcw));
+
switch (GET_OPCODE(op)) {
case SLJIT_NEG_F64:
- FAIL_IF(emit_groupf(compiler, XORPD_x_xm, EX86_SELECT_66(op) | EX86_SSE2, dst_r, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
+ FAIL_IF(emit_groupf(compiler, XORPD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
break;
case SLJIT_ABS_F64:
- FAIL_IF(emit_groupf(compiler, ANDPD_x_xm, EX86_SELECT_66(op) | EX86_SSE2, dst_r, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer + 4 : sse2_buffer + 12)));
+ FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | EX86_SELECT_66(op) | EX86_SSE2, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer + 4 : sse2_buffer + 12)));
break;
}
- if (dst_r == TMP_FREG)
- return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
- return SLJIT_SUCCESS;
+ return emit_sse2_store(compiler, op & SLJIT_32, dst, dstw, TMP_FREG);
}
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op,
@@ -3102,19 +3123,19 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compil
switch (GET_OPCODE(op)) {
case SLJIT_ADD_F64:
- FAIL_IF(emit_groupf(compiler, ADDSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
+ FAIL_IF(emit_groupf(compiler, ADDSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
break;
case SLJIT_SUB_F64:
- FAIL_IF(emit_groupf(compiler, SUBSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
+ FAIL_IF(emit_groupf(compiler, SUBSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
break;
case SLJIT_MUL_F64:
- FAIL_IF(emit_groupf(compiler, MULSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
+ FAIL_IF(emit_groupf(compiler, MULSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
break;
case SLJIT_DIV_F64:
- FAIL_IF(emit_groupf(compiler, DIVSD_x_xm, EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
+ FAIL_IF(emit_groupf(compiler, DIVSD_x_xm | EX86_SELECT_F2_F3(op) | EX86_SSE2, dst_r, src2, src2w));
break;
}
@@ -3142,9 +3163,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2r(struct sljit_compiler *compi
if (dst_freg == src1) {
FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, TMP_FREG, src2, src2w));
pref = EX86_SELECT_66(op) | EX86_SSE2;
- FAIL_IF(emit_groupf(compiler, XORPD_x_xm, pref, TMP_FREG, src1, src1w));
- FAIL_IF(emit_groupf(compiler, ANDPD_x_xm, pref, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
- return emit_groupf(compiler, XORPD_x_xm, pref, dst_freg, TMP_FREG, 0);
+ FAIL_IF(emit_groupf(compiler, XORPD_x_xm | pref, TMP_FREG, src1, src1w));
+ FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | pref, TMP_FREG, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
+ return emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, TMP_FREG, 0);
}
if (src1 & SLJIT_MEM) {
@@ -3157,9 +3178,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2r(struct sljit_compiler *compi
FAIL_IF(emit_sse2_load(compiler, op & SLJIT_32, dst_freg, src2, src2w));
pref = EX86_SELECT_66(op) | EX86_SSE2;
- FAIL_IF(emit_groupf(compiler, XORPD_x_xm, pref, dst_freg, src1, src1w));
- FAIL_IF(emit_groupf(compiler, ANDPD_x_xm, pref, dst_freg, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
- return emit_groupf(compiler, XORPD_x_xm, pref, dst_freg, src1, src1w);
+ FAIL_IF(emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, src1, src1w));
+ FAIL_IF(emit_groupf(compiler, ANDPD_x_xm | pref, dst_freg, SLJIT_MEM0(), (sljit_sw)((op & SLJIT_32) ? sse2_buffer : sse2_buffer + 8)));
+ return emit_groupf(compiler, XORPD_x_xm | pref, dst_freg, src1, src1w);
}
/* --------------------------------------------------------------------- */
@@ -3445,7 +3466,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_select(struct sljit_compiler *comp
#endif /* SLJIT_CONFIG_X86_32 */
if (sljit_has_cpu_feature(SLJIT_HAS_CMOV))
- FAIL_IF(emit_groupf(compiler, U8(get_jump_code((sljit_uw)type) - 0x40), 0, dst_reg, src1, src1w));
+ FAIL_IF(emit_groupf(compiler, U8(get_jump_code((sljit_uw)type) - 0x40), dst_reg, src1, src1w));
else
FAIL_IF(emit_cmov_generic(compiler, type, dst_reg, src1, src1w));
@@ -3500,9 +3521,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co
{
sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
- sljit_s32 alignment = SLJIT_SIMD_GET_ALIGNMENT(type);
- sljit_u8 opcode = 0;
- sljit_uw pref;
+ sljit_s32 alignment = SLJIT_SIMD_GET_ELEM2_SIZE(type);
+ sljit_uw op;
CHECK_ERROR();
CHECK(check_sljit_emit_simd_mov(compiler, type, freg, srcdst, srcdstw));
@@ -3515,12 +3535,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co
switch (reg_size) {
case 4:
- pref = 2 | EX86_SSE2;
+ op = EX86_SSE2;
break;
case 5:
if (!(cpu_feature_list & CPU_FEATURE_AVX2))
return SLJIT_ERR_UNSUPPORTED;
- pref = EX86_SSE2 | VEX_256;
+ op = EX86_SSE2 | VEX_256;
break;
default:
return SLJIT_ERR_UNSUPPORTED;
@@ -3531,29 +3551,27 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *co
if (type & SLJIT_SIMD_FLOAT) {
if (elem_size == 2 || elem_size == 3) {
- opcode = alignment >= reg_size ? MOVAPS_x_xm : MOVUPS_x_xm;
+ op |= alignment >= reg_size ? MOVAPS_x_xm : MOVUPS_x_xm;
if (elem_size == 3)
- pref |= EX86_PREF_66;
+ op |= EX86_PREF_66;
if (type & SLJIT_SIMD_STORE)
- opcode = U8(opcode + 1);
- }
+ op += 1;
+ } else
+ return SLJIT_ERR_UNSUPPORTED;
} else {
- opcode = (type & SLJIT_SIMD_STORE) ? MOVDQA_xm_x : MOVDQA_x_xm;
- pref |= alignment >= reg_size ? EX86_PREF_66 : EX86_PREF_F3;
+ op |= ((type & SLJIT_SIMD_STORE) ? MOVDQA_xm_x : MOVDQA_x_xm)
+ | (alignment >= reg_size ? EX86_PREF_66 : EX86_PREF_F3);
}
- if (opcode == 0)
- return SLJIT_ERR_UNSUPPORTED;
-
if (type & SLJIT_SIMD_TEST)
return SLJIT_SUCCESS;
- if (pref & VEX_256)
- return emit_vex_instruction(compiler, opcode | pref, freg, 0, srcdst, srcdstw);
+ if (op & VEX_256)
+ return emit_vex_instruction(compiler, op, freg, 0, srcdst, srcdstw);
- return emit_groupf(compiler, opcode, pref, freg, srcdst, srcdstw);
+ return emit_groupf(compiler, op, freg, srcdst, srcdstw);
}
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compiler *compiler, sljit_s32 type,
@@ -3598,7 +3616,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil
if (elem_size >= 3)
compiler->mode32 = 0;
#endif /* SLJIT_CONFIG_X86_64 */
- FAIL_IF(emit_groupf(compiler, MOVD_x_rm, EX86_PREF_66 | EX86_SSE2_OP1, freg, src, srcw));
+ FAIL_IF(emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, freg, src, srcw));
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
compiler->mode32 = 1;
#endif /* SLJIT_CONFIG_X86_64 */
@@ -3641,7 +3659,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil
if (reg_size == 5)
return emit_vex_instruction(compiler, XORPD_x_xm | VEX_256 | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0);
- return emit_groupf(compiler, XORPD_x_xm, (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2, freg, freg, 0);
+ return emit_groupf(compiler, XORPD_x_xm | (elem_size == 3 ? EX86_PREF_66 : 0) | EX86_SSE2, freg, freg, 0);
}
if (elem_size == 2 && freg != src) {
@@ -3650,7 +3668,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil
srcw = 0;
}
- FAIL_IF(emit_groupf(compiler, elem_size == 2 ? SHUFPS_x_xm : MOVDDUP_x_xm, (elem_size == 2 ? 0 : EX86_PREF_F2) | EX86_SSE2, freg, src, srcw));
+ FAIL_IF(emit_groupf(compiler, (elem_size == 2 ? SHUFPS_x_xm : MOVDDUP_x_xm) | (elem_size == 2 ? 0 : EX86_PREF_F2) | EX86_SSE2, freg, src, srcw));
if (elem_size == 2)
return emit_byte(compiler, 0);
@@ -3676,9 +3694,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil
if (srcw == 0 || srcw == -1) {
if (reg_size == 5)
- return emit_vex_instruction(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQB_x_xm) | VEX_256 | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0);
+ return emit_vex_instruction(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQD_x_xm) | VEX_256 | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0);
- return emit_groupf(compiler, srcw == 0 ? PXOR_x_xm : PCMPEQB_x_xm, EX86_PREF_66 | EX86_SSE2, freg, freg, 0);
+ return emit_groupf(compiler, (srcw == 0 ? PXOR_x_xm : PCMPEQD_x_xm) | EX86_PREF_66 | EX86_SSE2, freg, freg, 0);
}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
@@ -3741,19 +3759,19 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_replicate(struct sljit_compil
switch (elem_size) {
case 0:
- FAIL_IF(emit_groupf(compiler, PXOR_x_xm, EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0));
- return emit_groupf_ext(compiler, PSHUFB_x_xm, EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0);
+ FAIL_IF(emit_groupf(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0));
+ return emit_groupf_ext(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0);
case 1:
- FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm, EX86_PREF_F2 | EX86_SSE2, freg, freg, 0));
+ FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, freg, 0));
FAIL_IF(emit_byte(compiler, 0));
/* fallthrough */
default:
- FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm, EX86_PREF_66 | EX86_SSE2, freg, freg, 0));
+ FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, freg, 0));
return emit_byte(compiler, 0);
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
case 3:
compiler->mode32 = 1;
- FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm, EX86_PREF_66 | EX86_SSE2, freg, freg, 0));
+ FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, freg, 0));
return emit_byte(compiler, 0x44);
#endif /* SLJIT_CONFIG_X86_64 */
}
@@ -3835,18 +3853,18 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile
if (elem_size == 2) {
if (reg_size == 4)
- return emit_groupf(compiler, MOVD_x_rm, EX86_PREF_66 | EX86_SSE2_OP1, freg, srcdst, srcdstw);
+ return emit_groupf(compiler, MOVD_x_rm | EX86_PREF_66 | EX86_SSE2_OP1, freg, srcdst, srcdstw);
return emit_vex_instruction(compiler, MOVD_x_rm | VEX_AUTO_W | EX86_PREF_66 | EX86_SSE2_OP1, freg, 0, srcdst, srcdstw);
}
} else if (srcdst & SLJIT_MEM) {
SLJIT_ASSERT(elem_size == 2 || elem_size == 3);
if (reg_size == 4)
- return emit_groupf(compiler, MOVSD_x_xm, (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, srcdst, srcdstw);
+ return emit_groupf(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, srcdst, srcdstw);
return emit_vex_instruction(compiler, MOVSD_x_xm | (elem_size == 2 ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, freg, 0, srcdst, srcdstw);
} else if (elem_size == 3) {
if (reg_size == 4)
- return emit_groupf(compiler, MOVQ_x_xm, EX86_PREF_F3 | EX86_SSE2, freg, srcdst, 0);
+ return emit_groupf(compiler, MOVQ_x_xm | EX86_PREF_F3 | EX86_SSE2, freg, srcdst, 0);
return emit_vex_instruction(compiler, MOVQ_x_xm | EX86_PREF_F3 | EX86_SSE2, freg, 0, srcdst, 0);
}
}
@@ -3860,12 +3878,13 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile
srcdstw = 0;
}
- size = ((!(type & SLJIT_SIMD_FLOAT) || elem_size != 2) ? EX86_PREF_66 : 0);
+ size = ((!(type & SLJIT_SIMD_FLOAT) || elem_size != 2) ? EX86_PREF_66 : 0)
+ | ((type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm) | EX86_SSE2;
if (reg_size == 5)
- FAIL_IF(emit_vex_instruction(compiler, ((type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm) | VEX_256 | size | EX86_SSE2 | VEX_SSE2_OPV, freg, freg, freg, 0));
+ FAIL_IF(emit_vex_instruction(compiler, size | VEX_256 | VEX_SSE2_OPV, freg, freg, freg, 0));
else
- FAIL_IF(emit_groupf(compiler, (type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm, size | EX86_SSE2, freg, freg, 0));
+ FAIL_IF(emit_groupf(compiler, size, freg, freg, 0));
} else if (reg_size == 5 && lane_index >= (1 << (4 - elem_size))) {
FAIL_IF(emit_vex_instruction(compiler, ((type & SLJIT_SIMD_FLOAT) ? VEXTRACTF128_x_ym : VEXTRACTI128_x_ym) | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, TMP_FREG, 0));
FAIL_IF(emit_byte(compiler, 1));
@@ -3878,20 +3897,20 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile
if (elem_size == 3) {
if (srcdst & SLJIT_MEM) {
if (type & SLJIT_SIMD_STORE)
- opcode = lane_index == 0 ? MOVLPD_m_x : MOVHPD_m_x;
+ size = lane_index == 0 ? MOVLPD_m_x : MOVHPD_m_x;
else
- opcode = lane_index == 0 ? MOVLPD_x_m : MOVHPD_x_m;
+ size = lane_index == 0 ? MOVLPD_x_m : MOVHPD_x_m;
- FAIL_IF(emit_groupf(compiler, opcode, EX86_PREF_66 | EX86_SSE2, freg, srcdst, srcdstw));
+ FAIL_IF(emit_groupf(compiler, size | EX86_PREF_66 | EX86_SSE2, freg, srcdst, srcdstw));
/* In case of store, freg is not TMP_FREG. */
} else if (type & SLJIT_SIMD_STORE) {
if (lane_index == 1)
- return emit_groupf(compiler, MOVHLPS_x_x, EX86_SSE2, srcdst, freg, 0);
+ return emit_groupf(compiler, MOVHLPS_x_x | EX86_SSE2, srcdst, freg, 0);
return emit_sse2_load(compiler, 0, srcdst, freg, 0);
} else {
if (lane_index == 1)
- FAIL_IF(emit_groupf(compiler, MOVLHPS_x_x, EX86_SSE2, freg, srcdst, 0));
+ FAIL_IF(emit_groupf(compiler, MOVLHPS_x_x | EX86_SSE2, freg, srcdst, 0));
else
FAIL_IF(emit_sse2_store(compiler, 0, freg, 0, srcdst));
}
@@ -3900,13 +3919,12 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile
return emit_sse2_store(compiler, 1, srcdst, srcdstw, freg);
if (srcdst & SLJIT_MEM) {
- FAIL_IF(emit_groupf_ext(compiler, EXTRACTPS_x_xm, EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw));
+ FAIL_IF(emit_groupf_ext(compiler, EXTRACTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw));
return emit_byte(compiler, U8(lane_index));
}
- size = EX86_SSE2;
if (srcdst == freg)
- opcode = SHUFPS_x_xm;
+ size = SHUFPS_x_xm | EX86_SSE2;
else {
if (cpu_feature_list & CPU_FEATURE_AVX) {
FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | EX86_SSE2 | VEX_SSE2_OPV, srcdst, freg, freg, 0));
@@ -3915,29 +3933,28 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile
switch (lane_index) {
case 1:
- opcode = MOVSHDUP_x_xm;
- size = EX86_PREF_F3 | EX86_SSE2;
+ size = MOVSHDUP_x_xm | EX86_PREF_F3 | EX86_SSE2;
break;
case 2:
- opcode = MOVHLPS_x_x;
+ size = MOVHLPS_x_x | EX86_SSE2;
break;
default:
SLJIT_ASSERT(lane_index == 3);
- opcode = PSHUFD_x_xm;
- size = EX86_PREF_66 | EX86_SSE2;
+ size = PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2;
break;
}
}
- FAIL_IF(emit_groupf(compiler, opcode, size, srcdst, freg, 0));
+ FAIL_IF(emit_groupf(compiler, size, srcdst, freg, 0));
- if (opcode == SHUFPS_x_xm || opcode == PSHUFD_x_xm)
+ size &= 0xff;
+ if (size == SHUFPS_x_xm || size == PSHUFD_x_xm)
return emit_byte(compiler, U8(lane_index));
return SLJIT_SUCCESS;
} else {
if (lane_index != 0 || (srcdst & SLJIT_MEM)) {
- FAIL_IF(emit_groupf_ext(compiler, INSERTPS_x_xm, EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw));
+ FAIL_IF(emit_groupf_ext(compiler, INSERTPS_x_xm | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, srcdst, srcdstw));
FAIL_IF(emit_byte(compiler, U8(lane_index << 4)));
} else
FAIL_IF(emit_sse2_store(compiler, 1, freg, 0, srcdst));
@@ -4051,7 +4068,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compile
if (elem_size >= 2)
return SLJIT_SUCCESS;
- FAIL_IF(emit_groupf(compiler, (elem_size == 0) ? MOVSX_r_rm8 : MOVSX_r_rm16, 0,
+ FAIL_IF(emit_groupf(compiler, (elem_size == 0) ? MOVSX_r_rm8 : MOVSX_r_rm16,
(srcdst_orig != 0 && FAST_IS_REG(srcdst_orig)) ? srcdst_orig : srcdst, srcdst, 0));
if (srcdst_orig & SLJIT_MEM)
@@ -4107,7 +4124,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c
}
if (src_lane_index == 0)
- return emit_groupf(compiler, MOVDDUP_x_xm, EX86_PREF_F2 | EX86_SSE2, freg, src, 0);
+ return emit_groupf(compiler, MOVDDUP_x_xm | EX86_PREF_F2 | EX86_SSE2, freg, src, 0);
/* Changes it to SHUFPD_x_xm. */
pref = EX86_PREF_66;
@@ -4137,9 +4154,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c
FAIL_IF(emit_vex_instruction(compiler, SHUFPS_x_xm | pref | EX86_SSE2 | VEX_SSE2_OPV, freg, src, src, 0));
} else {
if (freg != src)
- FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm, pref | EX86_SSE2, freg, src, 0));
+ FAIL_IF(emit_groupf(compiler, MOVAPS_x_xm | pref | EX86_SSE2, freg, src, 0));
- FAIL_IF(emit_groupf(compiler, SHUFPS_x_xm, pref | EX86_SSE2, freg, freg, 0));
+ FAIL_IF(emit_groupf(compiler, SHUFPS_x_xm | pref | EX86_SSE2, freg, freg, 0));
}
if (elem_size == 2) {
@@ -4174,9 +4191,9 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c
} else {
if (freg == src || !(cpu_feature_list & CPU_FEATURE_AVX2)) {
if (freg != src)
- FAIL_IF(emit_groupf(compiler, MOVDQA_x_xm, EX86_PREF_66 | EX86_SSE2, freg, src, 0));
+ FAIL_IF(emit_groupf(compiler, MOVDQA_x_xm | EX86_PREF_66 | EX86_SSE2, freg, src, 0));
- FAIL_IF(emit_groupf(compiler, PSRLDQ_x, 2 | EX86_PREF_66 | EX86_SSE2_OP2, opcode3, freg, 0));
+ FAIL_IF(emit_groupf(compiler, PSRLDQ_x | EX86_PREF_66 | EX86_SSE2_OP2, opcode3, freg, 0));
} else
FAIL_IF(emit_vex_instruction(compiler, PSRLDQ_x | EX86_PREF_66 | EX86_SSE2_OP2 | VEX_SSE2_OPV, opcode3, freg, src, 0));
@@ -4184,7 +4201,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c
}
if (pref != 0) {
- FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm, pref | EX86_SSE2, freg, src, 0));
+ FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, src, 0));
FAIL_IF(emit_byte(compiler, byte));
}
@@ -4195,8 +4212,8 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c
return emit_vex_instruction(compiler, VPBROADCASTB_x_xm | (reg_size == 5 ? VEX_256 : 0) | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, 0);
SLJIT_ASSERT(reg_size == 4);
- FAIL_IF(emit_groupf(compiler, PXOR_x_xm, EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0));
- return emit_groupf_ext(compiler, PSHUFB_x_xm, EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0);
+ FAIL_IF(emit_groupf(compiler, PXOR_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, TMP_FREG, 0));
+ return emit_groupf_ext(compiler, PSHUFB_x_xm | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, TMP_FREG, 0);
}
if ((cpu_feature_list & CPU_FEATURE_AVX2) && src_lane_index == 0 && elem_size <= 3) {
@@ -4261,7 +4278,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c
src_lane_index >>= 1;
pref = (src_lane_index & 2) == 0 ? EX86_PREF_F2 : EX86_PREF_F3;
- FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm, pref | EX86_SSE2, freg, src, 0));
+ FAIL_IF(emit_groupf(compiler, PSHUFLW_x_xm | pref | EX86_SSE2, freg, src, 0));
byte = U8(byte | (byte << 2));
FAIL_IF(emit_byte(compiler, U8(byte | (byte << 4))));
@@ -4280,7 +4297,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_c
break;
}
- FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm, EX86_PREF_66 | EX86_SSE2, freg, src, 0));
+ FAIL_IF(emit_groupf(compiler, PSHUFD_x_xm | EX86_PREF_66 | EX86_SSE2, freg, src, 0));
return emit_byte(compiler, U8(byte | (byte << 4)));
}
@@ -4316,7 +4333,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler
return SLJIT_SUCCESS;
if (reg_size == 4)
- return emit_groupf(compiler, CVTPS2PD_x_xm, EX86_SSE2, freg, src, srcw);
+ return emit_groupf(compiler, CVTPS2PD_x_xm | EX86_SSE2, freg, src, srcw);
return emit_vex_instruction(compiler, CVTPS2PD_x_xm | VEX_256 | EX86_SSE2, freg, 0, src, srcw);
}
@@ -4353,7 +4370,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_extend(struct sljit_compiler
return SLJIT_SUCCESS;
if (reg_size == 4)
- return emit_groupf_ext(compiler, opcode, EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, src, srcw);
+ return emit_groupf_ext(compiler, opcode | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, src, srcw);
return emit_vex_instruction(compiler, opcode | VEX_256 | EX86_PREF_66 | VEX_OP_0F38 | EX86_SSE2, freg, 0, src, srcw);
}
@@ -4388,7 +4405,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c
switch (elem_size) {
case 1:
- FAIL_IF(emit_groupf(compiler, PACKSSWB_x_xm, EX86_PREF_66 | EX86_SSE2, TMP_FREG, freg, 0));
+ FAIL_IF(emit_groupf(compiler, PACKSSWB_x_xm | EX86_PREF_66 | EX86_SSE2, TMP_FREG, freg, 0));
freg = TMP_FREG;
break;
case 2:
@@ -4397,7 +4414,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c
}
dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
- FAIL_IF(emit_groupf(compiler, elem_size < 2 ? PMOVMSKB_r_x : MOVMSKPS_r_x, pref, dst_r, freg, 0));
+ FAIL_IF(emit_groupf(compiler, (elem_size < 2 ? PMOVMSKB_r_x : MOVMSKPS_r_x) | pref, dst_r, freg, 0));
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
compiler->mode32 = type & SLJIT_32;
@@ -4427,7 +4444,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c
FAIL_IF(emit_vex_instruction(compiler, VEXTRACTI128_x_ym | VEX_256 | EX86_PREF_66 | VEX_OP_0F3A | EX86_SSE2, freg, 0, TMP_FREG, 0));
FAIL_IF(emit_byte(compiler, 1));
FAIL_IF(emit_vex_instruction(compiler, PACKSSWB_x_xm | VEX_256 | EX86_PREF_66 | EX86_SSE2 | VEX_SSE2_OPV, TMP_FREG, freg, TMP_FREG, 0));
- FAIL_IF(emit_groupf(compiler, PMOVMSKB_r_x, EX86_PREF_66 | EX86_SSE2_OP2, dst_r, TMP_FREG, 0));
+ FAIL_IF(emit_groupf(compiler, PMOVMSKB_r_x | EX86_PREF_66 | EX86_SSE2_OP2, dst_r, TMP_FREG, 0));
} else {
pref = MOVMSKPS_r_x | VEX_256 | EX86_SSE2_OP2;
@@ -4449,6 +4466,87 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *c
return SLJIT_SUCCESS;
}
+/* Emits a register-to-register SIMD move (dst_freg <- src_freg) through
+   emit_groupf and returns its result. The opcode and prefix are chosen
+   from the SLJIT_SIMD_FLOAT bit and the element size encoded in type.
+   Only register size 4 is accepted (asserted below). */
+static sljit_s32 emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type,
+ sljit_s32 dst_freg, sljit_s32 src_freg)
+{
+ /* Float types use the MOVAPS opcode, all other types use MOVDQA. */
+ sljit_uw op = ((type & SLJIT_SIMD_FLOAT) ? MOVAPS_x_xm : MOVDQA_x_xm) | EX86_SSE2;
+
+ SLJIT_ASSERT(SLJIT_SIMD_GET_REG_SIZE(type) == 4);
+
+ /* Non-float moves and float moves with element size 3 get EX86_PREF_66.
+    NOTE(review): for the float case this presumably selects the
+    double precision (MOVAPD) form - confirm. */
+ if (!(type & SLJIT_SIMD_FLOAT) || SLJIT_SIMD_GET_ELEM_SIZE(type) == 3)
+ op |= EX86_PREF_66;
+
+ return emit_groupf(compiler, op, dst_freg, src_freg, 0);
+}
+
+/* Emits a SIMD bitwise operation with two source registers:
+   dst_freg = src1_freg <op> src2_freg, where <op> is the AND, OR or XOR
+   selected by SLJIT_SIMD_GET_OPCODE(type).
+
+   Returns SLJIT_ERR_UNSUPPORTED when the register size is neither 4 nor 5,
+   when it is 5 but CPU_FEATURE_AVX2 is not available, or when a float type
+   has an element size outside 2..3. When SLJIT_SIMD_TEST is set in type,
+   returns SLJIT_SUCCESS after these checks without emitting anything.
+   Otherwise returns SLJIT_SUCCESS, or the failure propagated from the
+   emit helpers. */
+SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type,
+ sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg)
+{
+ sljit_s32 reg_size = SLJIT_SIMD_GET_REG_SIZE(type);
+ sljit_s32 elem_size = SLJIT_SIMD_GET_ELEM_SIZE(type);
+ sljit_s32 needs_move = 0;
+ sljit_uw op = 0;
+
+ CHECK_ERROR();
+ CHECK(check_sljit_emit_simd_op2(compiler, type, dst_freg, src1_freg, src2_freg));
+
+#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
+ /* NOTE(review): presumably keeps the encoder from adding a 64 bit
+    operand size prefix to the SIMD instructions - confirm. */
+ compiler->mode32 = 1;
+#endif /* SLJIT_CONFIG_X86_64 */
+
+ /* Register size 5 is encoded with VEX_256 below and requires AVX2;
+    size 4 is the only other size accepted. */
+ if (reg_size == 5) {
+ if (!(cpu_feature_list & CPU_FEATURE_AVX2))
+ return SLJIT_ERR_UNSUPPORTED;
+ } else if (reg_size != 4)
+ return SLJIT_ERR_UNSUPPORTED;
+
+ /* Float operations are only accepted for element sizes 2 and 3. */
+ if ((type & SLJIT_SIMD_FLOAT) && (elem_size < 2 || elem_size > 3))
+ return SLJIT_ERR_UNSUPPORTED;
+
+ /* Every case follows the same pattern: float types use the *PD opcode
+    constant, other types the integer P* opcode, and EX86_PREF_66 is
+    added for non-float types and for float types with element size 3.
+    NOTE(review): there is no default case, so op stays 0 for any other
+    opcode; presumably check_sljit_emit_simd_op2 rejects those - confirm. */
+ switch (SLJIT_SIMD_GET_OPCODE(type)) {
+ case SLJIT_SIMD_OP2_AND:
+ op = (type & SLJIT_SIMD_FLOAT) ? ANDPD_x_xm : PAND_x_xm;
+
+ if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3)
+ op |= EX86_PREF_66;
+ break;
+ case SLJIT_SIMD_OP2_OR:
+ op = (type & SLJIT_SIMD_FLOAT) ? ORPD_x_xm : POR_x_xm;
+
+ if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3)
+ op |= EX86_PREF_66;
+ break;
+ case SLJIT_SIMD_OP2_XOR:
+ op = (type & SLJIT_SIMD_FLOAT) ? XORPD_x_xm : PXOR_x_xm;
+
+ if (!(type & SLJIT_SIMD_FLOAT) || elem_size == 3)
+ op |= EX86_PREF_66;
+ break;
+ }
+
+ /* Support query only: the combination passed the checks above. */
+ if (type & SLJIT_SIMD_TEST)
+ return SLJIT_SUCCESS;
+
+ /* The destination differs from both sources. */
+ needs_move = dst_freg != src1_freg && dst_freg != src2_freg;
+
+ /* The VEX form takes dst and both sources as separate operands, so it
+    is used for register size 5, and also whenever an extra move would
+    otherwise be needed and AVX2 is available. */
+ if (reg_size == 5 || (needs_move && (cpu_feature_list & CPU_FEATURE_AVX2))) {
+ if (reg_size == 5)
+ op |= VEX_256;
+
+ return emit_vex_instruction(compiler, op | EX86_SSE2 | VEX_SSE2_OPV, dst_freg, src1_freg, src2_freg, 0);
+ }
+
+ /* Non-VEX form: the instruction emitted below only receives dst_freg
+    and one source. Either copy src1 into dst first, or, when dst is
+    already src2, use src1 as the remaining source (AND, OR and XOR are
+    commutative, so the operand order does not matter). */
+ if (needs_move) {
+ FAIL_IF(emit_simd_mov(compiler, type, dst_freg, src1_freg));
+ } else if (dst_freg != src1_freg) {
+ SLJIT_ASSERT(dst_freg == src2_freg);
+ src2_freg = src1_freg;
+ }
+
+ FAIL_IF(emit_groupf(compiler, op | EX86_SSE2, dst_freg, src2_freg, 0));
+ return SLJIT_SUCCESS;
+}
+
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_load(struct sljit_compiler *compiler, sljit_s32 op,
sljit_s32 dst_reg,
sljit_s32 mem_reg)
@@ -4541,7 +4639,7 @@ SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_atomic_store(struct sljit_compiler
pref = EX86_REX;
#endif /* SLJIT_CONFIG_X86_64 */
- FAIL_IF(emit_groupf(compiler, U8(op == SLJIT_MOV_U8 ? CMPXCHG_rm8_r : CMPXCHG_rm_r), pref, src_reg, SLJIT_MEM1(mem_reg), 0));
+ FAIL_IF(emit_groupf(compiler, (op == SLJIT_MOV_U8 ? CMPXCHG_rm8_r : CMPXCHG_rm_r) | pref, src_reg, SLJIT_MEM1(mem_reg), 0));
if (temp_reg != SLJIT_R0) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)