aboutsummaryrefslogtreecommitdiff
path: root/src/cpu/x86/vm
diff options
context:
space:
mode:
author Denis S. Fokin <Denis.Fokin@gmail.com> 2015-10-29 13:33:55 +0300
committer Denis S. Fokin <Denis.Fokin@gmail.com> 2015-10-29 13:33:55 +0300
commit 24a09c463b2d328b6fc90ee555023514efca507c (patch)
tree 9323dfc9d767b6af41ec7ed408cc32eb5775b93e /src/cpu/x86/vm
parent 07ad54003f167bf4f38b23fdc8a677a37dc91ea9 (diff)
parent b6bff51db351a645334f3b9097637e0e0a76dbed (diff)
download jdk8u_hotspot-24a09c463b2d328b6fc90ee555023514efca507c.tar.gz
Merge with default before merge with jdk8u60
--HG-- branch : 8u40-verified-fixes
Diffstat (limited to 'src/cpu/x86/vm')
-rw-r--r-- src/cpu/x86/vm/assembler_x86.cpp 62
-rw-r--r-- src/cpu/x86/vm/assembler_x86.hpp 25
-rw-r--r-- src/cpu/x86/vm/c1_LIRGenerator_x86.cpp 9
-rw-r--r-- src/cpu/x86/vm/c1_Runtime1_x86.cpp 4
-rw-r--r-- src/cpu/x86/vm/globals_x86.hpp 6
-rw-r--r-- src/cpu/x86/vm/macroAssembler_x86.cpp 463
-rw-r--r-- src/cpu/x86/vm/macroAssembler_x86.hpp 22
-rw-r--r-- src/cpu/x86/vm/stubGenerator_x86_64.cpp 69
-rw-r--r-- src/cpu/x86/vm/vm_version_x86.cpp 67
-rw-r--r-- src/cpu/x86/vm/vm_version_x86.hpp 19
10 files changed, 719 insertions, 27 deletions
diff --git a/src/cpu/x86/vm/assembler_x86.cpp b/src/cpu/x86/vm/assembler_x86.cpp
index 342192775..8098e889b 100644
--- a/src/cpu/x86/vm/assembler_x86.cpp
+++ b/src/cpu/x86/vm/assembler_x86.cpp
@@ -4937,6 +4937,26 @@ void Assembler::addq(Register dst, Register src) {
emit_arith(0x03, 0xC0, dst, src);
}
+void Assembler::adcxq(Register dst, Register src) { // Emits the 64-bit register-register ADCX ("add with carry flag"); byte order below: 66, prefix from prefixq_and_encode, 0F 38 F6, ModRM
+ //assert(VM_Version::supports_adx(), "adx instructions not supported"); -- NOTE(review): the ADX capability check is disabled here; the caller visible in this patch tests VM_Version::supports_adx() itself
+ emit_int8((unsigned char)0x66); // 0x66 prefix byte is emitted before the REX prefix
+ int encode = prefixq_and_encode(dst->encoding(), src->encoding()); // presumably emits the REX.W prefix and returns the packed reg/rm bits -- helper not visible here, confirm
+ emit_int8(0x0F);
+ emit_int8(0x38);
+ emit_int8((unsigned char)0xF6);
+ emit_int8((unsigned char)(0xC0 | encode)); // 0xC0 sets ModRM mod=11, i.e. both operands are registers
+}
+
+void Assembler::adoxq(Register dst, Register src) { // Emits the 64-bit register-register ADOX ("add with overflow flag"); identical to adcxq() except for the leading F3 prefix byte
+ //assert(VM_Version::supports_adx(), "adx instructions not supported"); -- NOTE(review): the ADX capability check is disabled here; the caller visible in this patch tests VM_Version::supports_adx() itself
+ emit_int8((unsigned char)0xF3); // 0xF3 prefix byte (adcxq uses 0x66 in this position)
+ int encode = prefixq_and_encode(dst->encoding(), src->encoding()); // presumably emits the REX.W prefix and returns the packed reg/rm bits -- helper not visible here, confirm
+ emit_int8(0x0F);
+ emit_int8(0x38);
+ emit_int8((unsigned char)0xF6);
+ emit_int8((unsigned char)(0xC0 | encode)); // 0xC0 sets ModRM mod=11, i.e. both operands are registers
+}
+
void Assembler::andq(Address dst, int32_t imm32) {
InstructionMark im(this);
prefixq(dst);
@@ -5444,6 +5464,26 @@ void Assembler::movzwq(Register dst, Register src) {
emit_int8((unsigned char)(0xC0 | encode));
}
+void Assembler::mulq(Address src) { // Emits the 64-bit unsigned multiply (opcode F7) with a memory operand; the header documents the destination as RAX
+ InstructionMark im(this);
+ prefixq(src);
+ emit_int8((unsigned char)0xF7);
+ emit_operand(rsp, src); // rsp is not a real operand here: presumably its register number supplies the opcode-extension field of ModRM -- TODO confirm against emit_operand()
+}
+
+void Assembler::mulq(Register src) { // Emits the 64-bit unsigned multiply (opcode F7) by a register; call sites in this patch describe the result as rax * src -> rdx:rax
+ int encode = prefixq_and_encode(src->encoding());
+ emit_int8((unsigned char)0xF7);
+ emit_int8((unsigned char)(0xE0 | encode)); // 0xE0 = ModRM mod=11 with reg field 4, which selects MUL among the F7 group opcodes
+}
+
+void Assembler::mulxq(Register dst1, Register dst2, Register src) { // Emits BMI2 MULX (64-bit); call sites in this patch document it as "src * rdx -> dst1:dst2" (dst1 = high half, dst2 = low half)
+ assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
+ int encode = vex_prefix_and_encode(dst1->encoding(), dst2->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, true, false); // VEX encoding with F2 prefix and 0F 38 opcode map; the meaning of the two trailing bools is not visible here -- TODO confirm
+ emit_int8((unsigned char)0xF6);
+ emit_int8((unsigned char)(0xC0 | encode)); // 0xC0 sets ModRM mod=11 (register operands)
+}
+
void Assembler::negq(Register dst) {
int encode = prefixq_and_encode(dst->encoding());
emit_int8((unsigned char)0xF7);
@@ -5572,6 +5612,28 @@ void Assembler::rclq(Register dst, int imm8) {
emit_int8(imm8);
}
}
+
+void Assembler::rorq(Register dst, int imm8) { // Emits a 64-bit rotate-right of dst by the constant imm8
+ assert(isShiftCount(imm8 >> 1), "illegal shift count"); // NOTE(review): imm8 is halved before the check, presumably because isShiftCount() validates 32-bit counts -- confirm
+ int encode = prefixq_and_encode(dst->encoding());
+ if (imm8 == 1) {
+ emit_int8((unsigned char)0xD1); // short form: a count of 1 gets its own opcode and no immediate byte is emitted
+ emit_int8((unsigned char)(0xC8 | encode));
+ } else {
+ emit_int8((unsigned char)0xC1); // general form: opcode C1 followed by the immediate count byte
+ emit_int8((unsigned char)(0xc8 | encode));
+ emit_int8(imm8);
+ }
+}
+
+void Assembler::rorxq(Register dst, Register src, int imm8) { // Emits BMI2 RORX (64-bit): three-operand rotate right of src by imm8 into dst; a caller in this patch relies on it not affecting flags
+ assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
+ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_3A, true, false); // second argument is 0 because no extra VEX register operand is used; trailing bools not visible here -- TODO confirm
+ emit_int8((unsigned char)0xF0);
+ emit_int8((unsigned char)(0xC0 | encode)); // 0xC0 sets ModRM mod=11 (register operands)
+ emit_int8(imm8); // rotate count as a trailing immediate byte
+}
+
void Assembler::sarq(Register dst, int imm8) {
assert(isShiftCount(imm8 >> 1), "illegal shift count");
int encode = prefixq_and_encode(dst->encoding());
diff --git a/src/cpu/x86/vm/assembler_x86.hpp b/src/cpu/x86/vm/assembler_x86.hpp
index 8edf31cad..2ac9df8c9 100644
--- a/src/cpu/x86/vm/assembler_x86.hpp
+++ b/src/cpu/x86/vm/assembler_x86.hpp
@@ -888,6 +888,14 @@ private:
void addq(Register dst, Address src);
void addq(Register dst, Register src);
+#ifdef _LP64
+ //Add Unsigned Integers with Carry Flag
+ void adcxq(Register dst, Register src);
+
+ //Add Unsigned Integers with Overflow Flag
+ void adoxq(Register dst, Register src);
+#endif
+
void addr_nop_4();
void addr_nop_5();
void addr_nop_7();
@@ -1204,19 +1212,20 @@ private:
void idivl(Register src);
void divl(Register src); // Unsigned division
+#ifdef _LP64
void idivq(Register src);
+#endif
void imull(Register dst, Register src);
void imull(Register dst, Register src, int value);
void imull(Register dst, Address src);
+#ifdef _LP64
void imulq(Register dst, Register src);
void imulq(Register dst, Register src, int value);
-#ifdef _LP64
void imulq(Register dst, Address src);
#endif
-
// jcc is the generic conditional branch generator to run-
// time routines, jcc is used for branches to labels. jcc
// takes a branch opcode (cc) and a label (L) and generates
@@ -1408,9 +1417,16 @@ private:
void movzwq(Register dst, Register src);
#endif
+ // Unsigned multiply with RAX destination register
void mull(Address src);
void mull(Register src);
+#ifdef _LP64
+ void mulq(Address src);
+ void mulq(Register src);
+ void mulxq(Register dst1, Register dst2, Register src);
+#endif
+
// Multiply Scalar Double-Precision Floating-Point Values
void mulsd(XMMRegister dst, Address src);
void mulsd(XMMRegister dst, XMMRegister src);
@@ -1541,6 +1557,11 @@ private:
void ret(int imm16);
+#ifdef _LP64
+ void rorq(Register dst, int imm8);
+ void rorxq(Register dst, Register src, int imm8);
+#endif
+
void sahf();
void sarl(Register dst, int imm8);
diff --git a/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp b/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp
index b4e8223e1..1ee690ea4 100644
--- a/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp
+++ b/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp
@@ -1085,14 +1085,11 @@ void LIRGenerator::do_Convert(Convert* x) {
void LIRGenerator::do_NewInstance(NewInstance* x) {
-#ifndef PRODUCT
- if (PrintNotLoaded && !x->klass()->is_loaded()) {
- tty->print_cr(" ###class not loaded at new bci %d", x->printable_bci());
- }
-#endif
+ print_if_not_loaded(x);
+
CodeEmitInfo* info = state_for(x, x->state());
LIR_Opr reg = result_register_for(x->type());
- new_instance(reg, x->klass(),
+ new_instance(reg, x->klass(), x->is_unresolved(),
FrameMap::rcx_oop_opr,
FrameMap::rdi_oop_opr,
FrameMap::rsi_oop_opr,
diff --git a/src/cpu/x86/vm/c1_Runtime1_x86.cpp b/src/cpu/x86/vm/c1_Runtime1_x86.cpp
index fd6302d21..76303c114 100644
--- a/src/cpu/x86/vm/c1_Runtime1_x86.cpp
+++ b/src/cpu/x86/vm/c1_Runtime1_x86.cpp
@@ -675,7 +675,7 @@ OopMapSet* Runtime1::generate_handle_exception(StubID id, StubAssembler *sasm) {
case handle_exception_nofpu_id:
case handle_exception_id:
// At this point all registers MAY be live.
- oop_map = save_live_registers(sasm, 1 /*thread*/, id == handle_exception_nofpu_id);
+ oop_map = save_live_registers(sasm, 1 /*thread*/, id != handle_exception_nofpu_id);
break;
case handle_exception_from_callee_id: {
// At this point all registers except exception oop (RAX) and
@@ -748,7 +748,7 @@ OopMapSet* Runtime1::generate_handle_exception(StubID id, StubAssembler *sasm) {
case handle_exception_nofpu_id:
case handle_exception_id:
// Restore the registers that were saved at the beginning.
- restore_live_registers(sasm, id == handle_exception_nofpu_id);
+ restore_live_registers(sasm, id != handle_exception_nofpu_id);
break;
case handle_exception_from_callee_id:
// WIN64_ONLY: No need to add frame::arg_reg_save_area_bytes to SP
diff --git a/src/cpu/x86/vm/globals_x86.hpp b/src/cpu/x86/vm/globals_x86.hpp
index 5b34293ea..1401997b3 100644
--- a/src/cpu/x86/vm/globals_x86.hpp
+++ b/src/cpu/x86/vm/globals_x86.hpp
@@ -176,6 +176,8 @@ define_pd_global(uintx, TypeProfileLevel, 111);
"Use count trailing zeros instruction") \
\
product(bool, UseBMI1Instructions, false, \
- "Use BMI instructions")
-
+ "Use BMI1 instructions") \
+ \
+ product(bool, UseBMI2Instructions, false, \
+ "Use BMI2 instructions")
#endif // CPU_X86_VM_GLOBALS_X86_HPP
diff --git a/src/cpu/x86/vm/macroAssembler_x86.cpp b/src/cpu/x86/vm/macroAssembler_x86.cpp
index 7216c1980..5857a9350 100644
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp
@@ -1769,7 +1769,7 @@ void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg
// at [FETCH], below, will never observe a biased encoding (*101b).
// If this invariant is not held we risk exclusion (safety) failure.
if (UseBiasedLocking && !UseOptoBiasInlining) {
- biased_locking_enter(boxReg, objReg, tmpReg, scrReg, true, DONE_LABEL, NULL, counters);
+ biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
}
#if INCLUDE_RTM_OPT
@@ -7293,6 +7293,467 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
bind(L_done);
}
+#ifdef _LP64
+/**
+ * Helper for multiply_to_len(): adds src1 and then src2 into the 128-bit value held in dest_hi:dest_lo.
+ */
+void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
+ addq(dest_lo, src1);
+ adcq(dest_hi, 0); // fold the carry out of the first low-word addition into the high word
+ addq(dest_lo, src2);
+ adcq(dest_hi, 0); // fold the carry out of the second low-word addition into the high word
+}
+
+/**
+ * Multiply 64 bit by 64 bit first loop: emits the loop that multiplies the last 64-bit word of x by every word of y, storing into z and leaving the final carry in 'carry'.
+ */
+void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
+ Register y, Register y_idx, Register z,
+ Register carry, Register product,
+ Register idx, Register kdx) {
+ //
+ // jlong carry, x[], y[], z[];
+ // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+ // huge_128 product = y[idx] * x[xstart] + carry;
+ // z[kdx] = (jlong)product;
+ // carry = (jlong)(product >>> 64);
+ // }
+ // z[xstart] = carry;
+ //
+
+ Label L_first_loop, L_first_loop_exit;
+ Label L_one_x, L_one_y, L_multiply;
+
+ decrementl(xstart);
+ jcc(Assembler::negative, L_one_x); // fewer than two 32-bit elements left in x: take the single-element load at L_one_x
+
+ movq(x_xstart, Address(x, xstart, Address::times_4, 0)); // 64-bit load covering two adjacent 32-bit elements of x
+ rorq(x_xstart, 32); // convert big-endian to little-endian (swaps the two 32-bit halves)
+
+ bind(L_first_loop);
+ decrementl(idx);
+ jcc(Assembler::negative, L_first_loop_exit); // no elements of y left
+ decrementl(idx);
+ jcc(Assembler::negative, L_one_y); // exactly one 32-bit element of y left
+ movq(y_idx, Address(y, idx, Address::times_4, 0));
+ rorq(y_idx, 32); // convert big-endian to little-endian
+ bind(L_multiply);
+ movq(product, x_xstart);
+ mulq(y_idx); // product(rax) * y_idx -> rdx:rax (so 'product' must be rax)
+ addq(product, carry);
+ adcq(rdx, 0); // propagate the carry of the addition into the high half
+ subl(kdx, 2);
+ movl(Address(z, kdx, Address::times_4, 4), product); // low 32 bits go to the higher-indexed element
+ shrq(product, 32);
+ movl(Address(z, kdx, Address::times_4, 0), product); // high 32 bits go to the lower-indexed element
+ movq(carry, rdx);
+ jmp(L_first_loop);
+
+ bind(L_one_y);
+ movl(y_idx, Address(y, 0)); // 32-bit load of y[0] only
+ jmp(L_multiply);
+
+ bind(L_one_x);
+ movl(x_xstart, Address(x, 0)); // 32-bit load of x[0] only
+ jmp(L_first_loop);
+
+ bind(L_first_loop_exit);
+}
+
+/**
+ * Multiply 64 bit by 64 bit and add 128 bit. The high 64 bits of the result are left in rdx for the caller to pick up.
+ */
+void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
+ Register yz_idx, Register idx,
+ Register carry, Register product, int offset) {
+ // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
+ // z[kdx] = (jlong)product;
+
+ movq(yz_idx, Address(y, idx, Address::times_4, offset)); // 64-bit load of two adjacent 32-bit elements of y at byte offset 'offset' from y[idx]
+ rorq(yz_idx, 32); // convert big-endian to little-endian
+ movq(product, x_xstart);
+ mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
+ movq(yz_idx, Address(z, idx, Address::times_4, offset)); // yz_idx is reused to hold the matching 64 bits of z
+ rorq(yz_idx, 32); // convert big-endian to little-endian
+
+ add2_with_carry(rdx, product, carry, yz_idx); // rdx:product += carry + z word
+
+ movl(Address(z, idx, Address::times_4, offset+4), product); // store low 32 bits
+ shrq(product, 32);
+ movl(Address(z, idx, Address::times_4, offset), product); // store high 32 bits
+
+}
+
+/**
+ * Multiply 128 bit by 128 bit. Unrolled inner loop (non-BMI2 variant, uses mulq so 'product' must be rax and rdx is clobbered).
+ */
+void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
+ Register yz_idx, Register idx, Register jdx,
+ Register carry, Register product,
+ Register carry2) {
+ // jlong carry, x[], y[], z[];
+ // int kdx = ystart+1;
+ // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
+ // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
+ // z[kdx+idx+1] = (jlong)product;
+ // jlong carry2 = (jlong)(product >>> 64);
+ // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
+ // z[kdx+idx] = (jlong)product;
+ // carry = (jlong)(product >>> 64);
+ // }
+ // idx += 2;
+ // if (idx > 0) {
+ // product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
+ // z[kdx+idx] = (jlong)product;
+ // carry = (jlong)(product >>> 64);
+ // }
+ //
+
+ Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
+
+ movl(jdx, idx);
+ andl(jdx, 0xFFFFFFFC);
+ shrl(jdx, 2); // jdx = idx / 4: number of unrolled iterations, each consuming four 32-bit elements
+
+ bind(L_third_loop);
+ subl(jdx, 1);
+ jcc(Assembler::negative, L_third_loop_exit);
+ subl(idx, 4);
+
+ multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8); // upper pair of elements (byte offset 8)
+ movq(carry2, rdx);
+
+ multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0); // lower pair of elements (byte offset 0)
+ movq(carry, rdx);
+ jmp(L_third_loop);
+
+ bind (L_third_loop_exit);
+
+ andl (idx, 0x3); // 0..3 elements remain after the unrolled loop
+ jcc(Assembler::zero, L_post_third_loop_done);
+
+ Label L_check_1;
+ subl(idx, 2);
+ jcc(Assembler::negative, L_check_1); // fewer than two elements remain: skip the 64-bit step
+
+ multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
+ movq(carry, rdx);
+
+ bind (L_check_1);
+ addl (idx, 0x2);
+ andl (idx, 0x1);
+ subl(idx, 1); // idx becomes 0 when one odd element remains, -1 otherwise
+ jcc(Assembler::negative, L_post_third_loop_done);
+
+ movl(yz_idx, Address(y, idx, Address::times_4, 0)); // single 32-bit element of y
+ movq(product, x_xstart);
+ mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
+ movl(yz_idx, Address(z, idx, Address::times_4, 0)); // single 32-bit element of z
+
+ add2_with_carry(rdx, product, yz_idx, carry);
+
+ movl(Address(z, idx, Address::times_4, 0), product); // only the low 32 bits are stored
+ shrq(product, 32);
+
+ shlq(rdx, 32);
+ orq(product, rdx); // product = (rdx:product) >> 32, i.e. everything above the stored 32 bits
+ movq(carry, product);
+
+ bind(L_post_third_loop_done);
+}
+
+/**
+ * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
+ * The multiplier is taken implicitly from rdx (see the mulxq comments below); the final carry is left in 'carry'.
+ */
+void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
+ Register carry, Register carry2,
+ Register idx, Register jdx,
+ Register yz_idx1, Register yz_idx2,
+ Register tmp, Register tmp3, Register tmp4) {
+ assert(UseBMI2Instructions, "should be used only when BMI2 is available");
+
+ // jlong carry, x[], y[], z[];
+ // int kdx = ystart+1;
+ // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
+ // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
+ // jlong carry2 = (jlong)(tmp3 >>> 64);
+ // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2;
+ // carry = (jlong)(tmp4 >>> 64);
+ // z[kdx+idx+1] = (jlong)tmp3;
+ // z[kdx+idx] = (jlong)tmp4;
+ // }
+ // idx += 2;
+ // if (idx > 0) {
+ // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
+ // z[kdx+idx] = (jlong)yz_idx1;
+ // carry = (jlong)(yz_idx1 >>> 64);
+ // }
+ //
+
+ Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
+
+ movl(jdx, idx);
+ andl(jdx, 0xFFFFFFFC);
+ shrl(jdx, 2); // jdx = idx / 4: number of unrolled iterations, each consuming four 32-bit elements
+
+ bind(L_third_loop);
+ subl(jdx, 1);
+ jcc(Assembler::negative, L_third_loop_exit);
+ subl(idx, 4);
+
+ movq(yz_idx1, Address(y, idx, Address::times_4, 8));
+ rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
+ movq(yz_idx2, Address(y, idx, Address::times_4, 0));
+ rorxq(yz_idx2, yz_idx2, 32); // convert big-endian to little-endian
+
+ mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
+ mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp
+
+ movq(yz_idx1, Address(z, idx, Address::times_4, 8)); // yz_idx1/yz_idx2 are reused for the matching words of z
+ rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
+ movq(yz_idx2, Address(z, idx, Address::times_4, 0));
+ rorxq(yz_idx2, yz_idx2, 32); // convert big-endian to little-endian
+
+ if (VM_Version::supports_adx()) { // decided at code-generation time: only one of the two sequences is emitted
+ adcxq(tmp3, carry);
+ adoxq(tmp3, yz_idx1);
+
+ adcxq(tmp4, tmp);
+ adoxq(tmp4, yz_idx2);
+
+ movl(carry, 0); // does not affect flags
+ adcxq(carry2, carry); // adds 0 plus the pending carry flag
+ adoxq(carry2, carry); // adds 0 plus the pending overflow flag
+ } else {
+ add2_with_carry(tmp4, tmp3, carry, yz_idx1);
+ add2_with_carry(carry2, tmp4, tmp, yz_idx2);
+ }
+ movq(carry, carry2);
+
+ movl(Address(z, idx, Address::times_4, 12), tmp3); // tmp3 low 32 bits
+ shrq(tmp3, 32);
+ movl(Address(z, idx, Address::times_4, 8), tmp3); // tmp3 high 32 bits
+
+ movl(Address(z, idx, Address::times_4, 4), tmp4); // tmp4 low 32 bits
+ shrq(tmp4, 32);
+ movl(Address(z, idx, Address::times_4, 0), tmp4); // tmp4 high 32 bits
+
+ jmp(L_third_loop);
+
+ bind (L_third_loop_exit);
+
+ andl (idx, 0x3); // 0..3 elements remain after the unrolled loop
+ jcc(Assembler::zero, L_post_third_loop_done);
+
+ Label L_check_1;
+ subl(idx, 2);
+ jcc(Assembler::negative, L_check_1); // fewer than two elements remain: skip the 64-bit step
+
+ movq(yz_idx1, Address(y, idx, Address::times_4, 0));
+ rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
+ mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
+ movq(yz_idx2, Address(z, idx, Address::times_4, 0));
+ rorxq(yz_idx2, yz_idx2, 32); // convert big-endian to little-endian
+
+ add2_with_carry(tmp4, tmp3, carry, yz_idx2);
+
+ movl(Address(z, idx, Address::times_4, 4), tmp3);
+ shrq(tmp3, 32);
+ movl(Address(z, idx, Address::times_4, 0), tmp3);
+ movq(carry, tmp4);
+
+ bind (L_check_1);
+ addl (idx, 0x2);
+ andl (idx, 0x1);
+ subl(idx, 1); // idx becomes 0 when one odd element remains, -1 otherwise
+ jcc(Assembler::negative, L_post_third_loop_done);
+ movl(tmp4, Address(y, idx, Address::times_4, 0)); // single 32-bit element of y
+ mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3
+ movl(tmp4, Address(z, idx, Address::times_4, 0)); // single 32-bit element of z
+
+ add2_with_carry(carry2, tmp3, tmp4, carry);
+
+ movl(Address(z, idx, Address::times_4, 0), tmp3); // only the low 32 bits are stored
+ shrq(tmp3, 32);
+
+ shlq(carry2, 32);
+ orq(tmp3, carry2); // tmp3 = (carry2:tmp3) >> 32, i.e. everything above the stored 32 bits
+ movq(carry, tmp3);
+
+ bind(L_post_third_loop_done);
+}
+
+/**
+ * Code for BigInteger::multiplyToLen() intrinsic.
+ *
+ * rdi: x
+ * rax: xlen
+ * rsi: y
+ * rcx: ylen
+ * r8: z
+ * r11: zlen
+ * r12: tmp1
+ * r13: tmp2
+ * r14: tmp3
+ * r15: tmp4
+ * rbx: tmp5
+ *
+ * All five temporaries plus xlen and zlen are saved on entry and restored on exit; rdx is used as scratch and is not saved here.
+ */
+void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
+ Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
+ ShortBranchVerifier sbv(this);
+ assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx); // rdx must stay free: mulq and mulxq use it implicitly
+
+ push(tmp1);
+ push(tmp2);
+ push(tmp3);
+ push(tmp4);
+ push(tmp5);
+
+ push(xlen); // saved because xlen is reused as 'product' below
+ push(zlen); // saved because zlen is reused as 'x_xstart' below
+
+ const Register idx = tmp1;
+ const Register kdx = tmp2;
+ const Register xstart = tmp3;
+
+ const Register y_idx = tmp4;
+ const Register carry = tmp5;
+ const Register product = xlen; // helpers multiply with mulq and document 'product' as rax
+ const Register x_xstart = zlen; // reuse register
+
+ // First Loop.
+ //
+ // final static long LONG_MASK = 0xffffffffL;
+ // int xstart = xlen - 1;
+ // int ystart = ylen - 1;
+ // long carry = 0;
+ // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+ // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
+ // z[kdx] = (int)product;
+ // carry = product >>> 32;
+ // }
+ // z[xstart] = (int)carry;
+ //
+
+ movl(idx, ylen); // idx = ylen;
+ movl(kdx, zlen); // kdx = xlen+ylen;
+ xorq(carry, carry); // carry = 0;
+
+ Label L_done;
+
+ movl(xstart, xlen);
+ decrementl(xstart);
+ jcc(Assembler::negative, L_done); // xlen == 0: nothing to do
+
+ multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
+
+ Label L_second_loop;
+ testl(kdx, kdx);
+ jcc(Assembler::zero, L_second_loop); // no slot left in z for the first-loop carry
+
+ Label L_carry;
+ subl(kdx, 1);
+ jcc(Assembler::zero, L_carry); // exactly one 32-bit slot left: store only the low half of carry
+
+ movl(Address(z, kdx, Address::times_4, 0), carry); // low 32 bits of carry
+ shrq(carry, 32);
+ subl(kdx, 1);
+
+ bind(L_carry);
+ movl(Address(z, kdx, Address::times_4, 0), carry);
+
+ // Second and third (nested) loops.
+ //
+ // for (int i = xstart-1; i >= 0; i--) { // Second loop
+ // carry = 0;
+ // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
+ // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
+ // (z[k] & LONG_MASK) + carry;
+ // z[k] = (int)product;
+ // carry = product >>> 32;
+ // }
+ // z[i] = (int)carry;
+ // }
+ //
+ // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
+
+ const Register jdx = tmp1;
+
+ bind(L_second_loop);
+ xorl(carry, carry); // carry = 0;
+ movl(jdx, ylen); // j = ystart+1
+
+ subl(xstart, 1); // i = xstart-1;
+ jcc(Assembler::negative, L_done);
+
+ push (z); // z is temporarily advanced below and restored by the matching pop(z) after the inner loop
+
+ Label L_last_x;
+ lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
+ subl(xstart, 1); // i = xstart-1;
+ jcc(Assembler::negative, L_last_x); // only one 32-bit element of x left
+
+ if (UseBMI2Instructions) {
+ movq(rdx, Address(x, xstart, Address::times_4, 0)); // BMI2 path keeps the multiplier in rdx, the implicit mulxq operand
+ rorxq(rdx, rdx, 32); // convert big-endian to little-endian
+ } else {
+ movq(x_xstart, Address(x, xstart, Address::times_4, 0));
+ rorq(x_xstart, 32); // convert big-endian to little-endian
+ }
+
+ Label L_third_loop_prologue;
+ bind(L_third_loop_prologue);
+
+ push (x); // x is handed to the inner-loop helper as a scratch register (carry2)
+ push (xstart); // xstart lives in tmp3, which the BMI2 helper receives as a temporary
+ push (ylen); // ylen is handed to the inner-loop helper as its jdx counter
+
+
+ if (UseBMI2Instructions) {
+ multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
+ } else { // !UseBMI2Instructions
+ multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
+ }
+
+ pop(ylen);
+ pop(xlen); // receives the value pushed from xstart above; copied back into tmp3 just below
+ pop(x);
+ pop(z); // undo the lea adjustment made before the inner loop
+
+ movl(tmp3, xlen);
+ addl(tmp3, 1);
+ movl(Address(z, tmp3, Address::times_4, 0), carry); // low 32 bits of carry
+ subl(tmp3, 1);
+ jccb(Assembler::negative, L_done);
+
+ shrq(carry, 32);
+ movl(Address(z, tmp3, Address::times_4, 0), carry); // high 32 bits of carry
+ jmp(L_second_loop);
+
+ // Next infrequent code is moved outside loops.
+ bind(L_last_x);
+ if (UseBMI2Instructions) {
+ movl(rdx, Address(x, 0)); // 32-bit load of x[0] only
+ } else {
+ movl(x_xstart, Address(x, 0)); // 32-bit load of x[0] only
+ }
+ jmp(L_third_loop_prologue);
+
+ bind(L_done);
+
+ pop(zlen);
+ pop(xlen);
+
+ pop(tmp5);
+ pop(tmp4);
+ pop(tmp3);
+ pop(tmp2);
+ pop(tmp1);
+}
+#endif
+
/**
* Emits code to update CRC-32 with a byte value according to constants in table
*
diff --git a/src/cpu/x86/vm/macroAssembler_x86.hpp b/src/cpu/x86/vm/macroAssembler_x86.hpp
index 3b3073e63..69c9e8aa3 100644
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp
@@ -1221,6 +1221,28 @@ public:
XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
XMMRegister tmp4, Register tmp5, Register result);
+#ifdef _LP64
+ void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2);
+ void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
+ Register y, Register y_idx, Register z,
+ Register carry, Register product,
+ Register idx, Register kdx);
+ void multiply_add_128_x_128(Register x_xstart, Register y, Register z,
+ Register yz_idx, Register idx,
+ Register carry, Register product, int offset);
+ void multiply_128_x_128_bmi2_loop(Register y, Register z,
+ Register carry, Register carry2,
+ Register idx, Register jdx,
+ Register yz_idx1, Register yz_idx2,
+ Register tmp, Register tmp3, Register tmp4);
+ void multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
+ Register yz_idx, Register idx, Register jdx,
+ Register carry, Register product,
+ Register carry2);
+ void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
+ Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
+#endif
+
// CRC32 code for java.util.zip.CRC32::updateBytes() instrinsic.
void update_byte_crc32(Register crc, Register val, Register table);
void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
diff --git a/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
index 0adb0d31e..0000146f5 100644
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
@@ -3677,6 +3677,70 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
+
+ /**
+ * Generates the "multiplyToLen" stub and returns its entry address.
+ *
+ * Input:
+ * c_rarg0 - x address
+ * c_rarg1 - x length
+ * c_rarg2 - y address
+ * c_rarg3 - y length
+ * not Win64
+ * c_rarg4 - z address
+ * c_rarg5 - z length
+ * Win64
+ * rsp+40 - z address
+ * rsp+48 - z length
+ */
+ address generate_multiplyToLen() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
+
+ address start = __ pc();
+ // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
+ // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
+ const Register x = rdi;
+ const Register xlen = rax;
+ const Register y = rsi;
+ const Register ylen = rcx;
+ const Register z = r8;
+ const Register zlen = r11;
+
+ // Next registers will be saved on stack in multiply_to_len().
+ const Register tmp1 = r12;
+ const Register tmp2 = r13;
+ const Register tmp3 = r14;
+ const Register tmp4 = r15;
+ const Register tmp5 = rbx;
+
+ BLOCK_COMMENT("Entry:");
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifndef _WIN64
+ __ movptr(zlen, r9); // Save r9 in r11 - zlen
+#endif
+ setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
+ // ylen => rcx, z => r8, zlen => r11
+ // r9 and r10 may be used to save non-volatile registers
+#ifdef _WIN64
+ // last 2 arguments (#4, #5) are on stack on Win64
+ __ movptr(z, Address(rsp, 6 * wordSize)); // NOTE(review): presumably the documented entry rsp+40 plus the word pushed by enter() -- confirm
+ __ movptr(zlen, Address(rsp, 7 * wordSize)); // NOTE(review): presumably the documented entry rsp+48 plus the word pushed by enter() -- confirm
+#endif
+
+ __ movptr(xlen, rsi); // move xlen into rax, freeing rsi for y
+ __ movptr(y, rdx); // move y into rsi, freeing rdx, which multiply_to_len() requires to be distinct from all its operands
+ __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
+
+ restore_arg_regs();
+
+ __ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ ret(0);
+
+ return start;
+ }
+
#undef __
#define __ masm->
@@ -3917,6 +3981,11 @@ class StubGenerator: public StubCodeGenerator {
generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
&StubRoutines::_safefetchN_fault_pc,
&StubRoutines::_safefetchN_continuation_pc);
+#ifdef COMPILER2
+ if (UseMultiplyToLenIntrinsic) {
+ StubRoutines::_multiplyToLen = generate_multiplyToLen();
+ }
+#endif
}
public:
diff --git a/src/cpu/x86/vm/vm_version_x86.cpp b/src/cpu/x86/vm/vm_version_x86.cpp
index 8cb93b2cc..adfdfd67e 100644
--- a/src/cpu/x86/vm/vm_version_x86.cpp
+++ b/src/cpu/x86/vm/vm_version_x86.cpp
@@ -493,7 +493,7 @@ void VM_Version::get_processor_features() {
}
char buf[256];
- jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+ jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
cores_per_cpu(), threads_per_core(),
cpu_family(), _model, _stepping,
(supports_cmov() ? ", cmov" : ""),
@@ -522,7 +522,8 @@ void VM_Version::get_processor_features() {
(supports_tscinv_bit() ? ", tscinvbit": ""),
(supports_tscinv() ? ", tscinv": ""),
(supports_bmi1() ? ", bmi1" : ""),
- (supports_bmi2() ? ", bmi2" : ""));
+ (supports_bmi2() ? ", bmi2" : ""),
+ (supports_adx() ? ", adx" : ""));
_features_str = strdup(buf);
// UseSSE is set to the smaller of what hardware supports and what
@@ -574,7 +575,7 @@ void VM_Version::get_processor_features() {
}
} else if (UseCRC32Intrinsics) {
if (!FLAG_IS_DEFAULT(UseCRC32Intrinsics))
- warning("CRC32 Intrinsics requires AVX and CLMUL instructions (not available on this CPU)");
+ warning("CRC32 Intrinsics requires CLMUL instructions (not available on this CPU)");
FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
}
@@ -612,6 +613,17 @@ void VM_Version::get_processor_features() {
#if INCLUDE_RTM_OPT
if (UseRTMLocking) {
+ if (is_intel_family_core()) {
+ if ((_model == CPU_MODEL_HASWELL_E3) ||
+ (_model == CPU_MODEL_HASWELL_E7 && _stepping < 3) ||
+ (_model == CPU_MODEL_BROADWELL && _stepping < 4)) {
+ if (!UnlockExperimentalVMOptions) {
+ vm_exit_during_initialization("UseRTMLocking is only available as experimental option on this platform. It must be enabled via -XX:+UnlockExperimentalVMOptions flag.");
+ } else {
+ warning("UseRTMLocking is only available as experimental option on this platform.");
+ }
+ }
+ }
if (!FLAG_IS_CMDLINE(UseRTMLocking)) {
// RTM locking should be used only for applications with
// high lock contention. For now we do not use it by default.
@@ -686,7 +698,20 @@ void VM_Version::get_processor_features() {
}
#endif
}
+
+#ifdef _LP64
+ if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
+ UseMultiplyToLenIntrinsic = true;
+ }
+#else
+ if (UseMultiplyToLenIntrinsic) {
+ if (!FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
+ warning("multiplyToLen intrinsic is not available in 32-bit VM");
+ }
+ FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, false);
+ }
#endif
+#endif // COMPILER2
// On new cpus instructions which update whole XMM register should be used
// to prevent partial register stall due to dependencies on high half.
@@ -829,6 +854,9 @@ void VM_Version::get_processor_features() {
}
}
}
+ if(FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) {
+ AllocatePrefetchInstr = 3;
+ }
}
// Use count leading zeros count instruction if available.
@@ -841,23 +869,40 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseCountLeadingZerosInstruction, false);
}
+ // Use count trailing zeros instruction if available
if (supports_bmi1()) {
+ // tzcnt does not require VEX prefix
+ if (FLAG_IS_DEFAULT(UseCountTrailingZerosInstruction)) {
+ if (!UseBMI1Instructions && !FLAG_IS_DEFAULT(UseBMI1Instructions)) {
+ // Don't use tzcnt if BMI1 is switched off on command line.
+ UseCountTrailingZerosInstruction = false;
+ } else {
+ UseCountTrailingZerosInstruction = true;
+ }
+ }
+ } else if (UseCountTrailingZerosInstruction) {
+ warning("tzcnt instruction is not available on this CPU");
+ FLAG_SET_DEFAULT(UseCountTrailingZerosInstruction, false);
+ }
+
+ // BMI instructions (except tzcnt) use an encoding with VEX prefix.
+ // VEX prefix is generated only when AVX > 0.
+ if (supports_bmi1() && supports_avx()) {
if (FLAG_IS_DEFAULT(UseBMI1Instructions)) {
UseBMI1Instructions = true;
}
} else if (UseBMI1Instructions) {
- warning("BMI1 instructions are not available on this CPU");
+ warning("BMI1 instructions are not available on this CPU (AVX is also required)");
FLAG_SET_DEFAULT(UseBMI1Instructions, false);
}
- // Use count trailing zeros instruction if available
- if (supports_bmi1()) {
- if (FLAG_IS_DEFAULT(UseCountTrailingZerosInstruction)) {
- UseCountTrailingZerosInstruction = UseBMI1Instructions;
+ if (supports_bmi2() && supports_avx()) {
+ if (FLAG_IS_DEFAULT(UseBMI2Instructions)) {
+ UseBMI2Instructions = true;
}
- } else if (UseCountTrailingZerosInstruction) {
- warning("tzcnt instruction is not available on this CPU");
- FLAG_SET_DEFAULT(UseCountTrailingZerosInstruction, false);
+ } else if (UseBMI2Instructions) {
+ warning("BMI2 instructions are not available on this CPU (AVX is also required)");
+ FLAG_SET_DEFAULT(UseBMI2Instructions, false);
}
// Use population count instruction if available.
diff --git a/src/cpu/x86/vm/vm_version_x86.hpp b/src/cpu/x86/vm/vm_version_x86.hpp
index 51f6e4f2f..1ad94e38b 100644
--- a/src/cpu/x86/vm/vm_version_x86.hpp
+++ b/src/cpu/x86/vm/vm_version_x86.hpp
@@ -209,7 +209,9 @@ public:
erms : 1,
: 1,
rtm : 1,
- : 20;
+ : 7,
+ adx : 1,
+ : 12;
} bits;
};
@@ -260,7 +262,8 @@ protected:
CPU_CLMUL = (1 << 21), // carryless multiply for CRC
CPU_BMI1 = (1 << 22),
CPU_BMI2 = (1 << 23),
- CPU_RTM = (1 << 24) // Restricted Transactional Memory instructions
+ CPU_RTM = (1 << 24), // Restricted Transactional Memory instructions
+ CPU_ADX = (1 << 25)
} cpuFeatureFlags;
enum {
@@ -276,7 +279,10 @@ protected:
CPU_MODEL_WESTMERE_EX = 0x2f,
CPU_MODEL_SANDYBRIDGE = 0x2a,
CPU_MODEL_SANDYBRIDGE_EP = 0x2d,
- CPU_MODEL_IVYBRIDGE_EP = 0x3a
+ CPU_MODEL_IVYBRIDGE_EP = 0x3a,
+ CPU_MODEL_HASWELL_E3 = 0x3c,
+ CPU_MODEL_HASWELL_E7 = 0x3f,
+ CPU_MODEL_BROADWELL = 0x3d
} cpuExtendedFamily;
// cpuid information block. All info derived from executing cpuid with
@@ -462,10 +468,16 @@ protected:
}
// Intel features.
if(is_intel()) {
+ if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
+ result |= CPU_ADX;
if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
result |= CPU_BMI2;
if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
result |= CPU_LZCNT;
+ // for Intel, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
+ if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
+ result |= CPU_3DNOW_PREFETCH;
+ }
}
return result;
@@ -618,6 +630,7 @@ public:
static bool supports_rtm() { return (_cpuFeatures & CPU_RTM) != 0; }
static bool supports_bmi1() { return (_cpuFeatures & CPU_BMI1) != 0; }
static bool supports_bmi2() { return (_cpuFeatures & CPU_BMI2) != 0; }
+ static bool supports_adx() { return (_cpuFeatures & CPU_ADX) != 0; }
// Intel features
static bool is_intel_family_core() { return is_intel() &&
extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }