Merge with default before merge with jdk8u60

--HG-- branch : 8u40-verified-fixes
author: Denis S. Fokin <Denis.Fokin@gmail.com> 2015-10-29 13:33:55 +0300
committer: Denis S. Fokin <Denis.Fokin@gmail.com> 2015-10-29 13:33:55 +0300
commit: 24a09c463b2d328b6fc90ee555023514efca507c (patch)
tree: 9323dfc9d767b6af41ec7ed408cc32eb5775b93e /src/cpu/x86/vm
parent: 07ad54003f167bf4f38b23fdc8a677a37dc91ea9 (diff)
parent: b6bff51db351a645334f3b9097637e0e0a76dbed (diff)
download: jdk8u_hotspot-24a09c463b2d328b6fc90ee555023514efca507c.tar.gz
10 files changed, 719 insertions, 27 deletions
diff --git a/src/cpu/x86/vm/assembler_x86.cpp b/src/cpu/x86/vm/assembler_x86.cpp
index 342192775..8098e889b 100644
--- a/src/cpu/x86/vm/assembler_x86.cpp
+++ b/src/cpu/x86/vm/assembler_x86.cpp
@@ -4937,6 +4937,26 @@ void Assembler::addq(Register dst, Register src) {
   emit_arith(0x03, 0xC0, dst, src);
 }
 
+void Assembler::adcxq(Register dst, Register src) {
+  //assert(VM_Version::supports_adx(), "adx instructions not supported");
+  emit_int8((unsigned char)0x66);
+  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
+  emit_int8(0x0F);
+  emit_int8(0x38);
+  emit_int8((unsigned char)0xF6);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::adoxq(Register dst, Register src) {
+  //assert(VM_Version::supports_adx(), "adx instructions not supported");
+  emit_int8((unsigned char)0xF3);
+  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
+  emit_int8(0x0F);
+  emit_int8(0x38);
+  emit_int8((unsigned char)0xF6);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::andq(Address dst, int32_t imm32) {
   InstructionMark im(this);
   prefixq(dst);
@@ -5444,6 +5464,26 @@ void Assembler::movzwq(Register dst, Register src) {
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::mulq(Address src) {
+  InstructionMark im(this);
+  prefixq(src);
+  emit_int8((unsigned char)0xF7);
+  emit_operand(rsp, src);
+}
+
+void Assembler::mulq(Register src) {
+  int encode = prefixq_and_encode(src->encoding());
+  emit_int8((unsigned char)0xF7);
+  emit_int8((unsigned char)(0xE0 | encode));
+}
+
+void Assembler::mulxq(Register dst1, Register dst2, Register src) {
+  assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_and_encode(dst1->encoding(), dst2->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, true, false);
+  emit_int8((unsigned char)0xF6);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::negq(Register dst) {
   int encode = prefixq_and_encode(dst->encoding());
   emit_int8((unsigned char)0xF7);
@@ -5572,6 +5612,28 @@ void Assembler::rclq(Register dst, int imm8) {
     emit_int8(imm8);
   }
 }
+
+void Assembler::rorq(Register dst, int imm8) {
+  assert(isShiftCount(imm8 >> 1), "illegal shift count");
+  int encode = prefixq_and_encode(dst->encoding());
+  if (imm8 == 1) {
+    emit_int8((unsigned char)0xD1);
+    emit_int8((unsigned char)(0xC8 | encode));
+  } else {
+    emit_int8((unsigned char)0xC1);
+    emit_int8((unsigned char)(0xc8 | encode));
+    emit_int8(imm8);
+  }
+}
+
+void Assembler::rorxq(Register dst, Register src, int imm8) {
+  assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_3A, true, false);
+  emit_int8((unsigned char)0xF0);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8(imm8);
+}
+
 void Assembler::sarq(Register dst, int imm8) {
   assert(isShiftCount(imm8 >> 1), "illegal shift count");
   int encode = prefixq_and_encode(dst->encoding());
diff --git a/src/cpu/x86/vm/assembler_x86.hpp b/src/cpu/x86/vm/assembler_x86.hpp
index 8edf31cad..2ac9df8c9 100644
--- a/src/cpu/x86/vm/assembler_x86.hpp
+++ b/src/cpu/x86/vm/assembler_x86.hpp
@@ -888,6 +888,14 @@ private:
   void addq(Register dst, Address src);
   void addq(Register dst, Register src);
 
+#ifdef _LP64
+ //Add Unsigned Integers with Carry Flag
+  void adcxq(Register dst, Register src);
+
+ //Add Unsigned Integers with Overflow Flag
+  void adoxq(Register dst, Register src);
+#endif
+
   void addr_nop_4();
   void addr_nop_5();
   void addr_nop_7();
@@ -1204,19 +1212,20 @@ private:
   void idivl(Register src);
   void divl(Register src); // Unsigned division
 
+#ifdef _LP64
   void idivq(Register src);
+#endif
 
   void imull(Register dst, Register src);
   void imull(Register dst, Register src, int value);
   void imull(Register dst, Address src);
 
+#ifdef _LP64
   void imulq(Register dst, Register src);
   void imulq(Register dst, Register src, int value);
-#ifdef _LP64
   void imulq(Register dst, Address src);
 #endif
 
-
   // jcc is the generic conditional branch generator to run-
   // time routines, jcc is used for branches to labels. jcc
   // takes a branch opcode (cc) and a label (L) and generates
@@ -1408,9 +1417,16 @@ private:
   void movzwq(Register dst, Register src);
 #endif
 
+  // Unsigned multiply with RAX destination register
   void mull(Address src);
   void mull(Register src);
 
+#ifdef _LP64
+  void mulq(Address src);
+  void mulq(Register src);
+  void mulxq(Register dst1, Register dst2, Register src);
+#endif
+
   // Multiply Scalar Double-Precision Floating-Point Values
   void mulsd(XMMRegister dst, Address src);
   void mulsd(XMMRegister dst, XMMRegister src);
@@ -1541,6 +1557,11 @@ private:
 
   void ret(int imm16);
 
+#ifdef _LP64
+  void rorq(Register dst, int imm8);
+  void rorxq(Register dst, Register src, int imm8);
+#endif
+
   void sahf();
 
   void sarl(Register dst, int imm8);
diff --git a/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp b/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp
index b4e8223e1..1ee690ea4 100644
--- a/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp
+++ b/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp
@@ -1085,14 +1085,11 @@ void LIRGenerator::do_Convert(Convert* x) {
 
 
 void LIRGenerator::do_NewInstance(NewInstance* x) {
-#ifndef PRODUCT
-  if (PrintNotLoaded && !x->klass()->is_loaded()) {
-    tty->print_cr("   ###class not loaded at new bci %d", x->printable_bci());
-  }
-#endif
+  print_if_not_loaded(x);
+
   CodeEmitInfo* info = state_for(x, x->state());
   LIR_Opr reg = result_register_for(x->type());
-  new_instance(reg, x->klass(),
+  new_instance(reg, x->klass(), x->is_unresolved(),
                        FrameMap::rcx_oop_opr,
                        FrameMap::rdi_oop_opr,
                        FrameMap::rsi_oop_opr,
diff --git a/src/cpu/x86/vm/c1_Runtime1_x86.cpp b/src/cpu/x86/vm/c1_Runtime1_x86.cpp
index fd6302d21..76303c114 100644
--- a/src/cpu/x86/vm/c1_Runtime1_x86.cpp
+++ b/src/cpu/x86/vm/c1_Runtime1_x86.cpp
@@ -675,7 +675,7 @@ OopMapSet* Runtime1::generate_handle_exception(StubID id, StubAssembler *sasm) {
   case handle_exception_nofpu_id:
   case handle_exception_id:
     // At this point all registers MAY be live.
-    oop_map = save_live_registers(sasm, 1 /*thread*/, id == handle_exception_nofpu_id);
+    oop_map = save_live_registers(sasm, 1 /*thread*/, id != handle_exception_nofpu_id);
     break;
   case handle_exception_from_callee_id: {
     // At this point all registers except exception oop (RAX) and
@@ -748,7 +748,7 @@ OopMapSet* Runtime1::generate_handle_exception(StubID id, StubAssembler *sasm) {
   case handle_exception_nofpu_id:
   case handle_exception_id:
     // Restore the registers that were saved at the beginning.
-    restore_live_registers(sasm, id == handle_exception_nofpu_id);
+    restore_live_registers(sasm, id != handle_exception_nofpu_id);
     break;
   case handle_exception_from_callee_id:
     // WIN64_ONLY: No need to add frame::arg_reg_save_area_bytes to SP
diff --git a/src/cpu/x86/vm/globals_x86.hpp b/src/cpu/x86/vm/globals_x86.hpp
index 5b34293ea..1401997b3 100644
--- a/src/cpu/x86/vm/globals_x86.hpp
+++ b/src/cpu/x86/vm/globals_x86.hpp
@@ -176,6 +176,8 @@ define_pd_global(uintx, TypeProfileLevel, 111);
           "Use count trailing zeros instruction")                           \
                                                                             \
   product(bool, UseBMI1Instructions, false,                                 \
-          "Use BMI instructions")
-
+          "Use BMI1 instructions")                                          \
+                                                                            \
+  product(bool, UseBMI2Instructions, false,                                 \
+          "Use BMI2 instructions")
 #endif // CPU_X86_VM_GLOBALS_X86_HPP
diff --git a/src/cpu/x86/vm/macroAssembler_x86.cpp b/src/cpu/x86/vm/macroAssembler_x86.cpp
index 7216c1980..5857a9350 100644
--- a/src/cpu/x86/vm/macroAssembler_x86.cpp
+++ b/src/cpu/x86/vm/macroAssembler_x86.cpp
@@ -1769,7 +1769,7 @@ void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg
     // at [FETCH], below, will never observe a biased encoding (*101b).
     // If this invariant is not held we risk exclusion (safety) failure.
     if (UseBiasedLocking && !UseOptoBiasInlining) {
-      biased_locking_enter(boxReg, objReg, tmpReg, scrReg, true, DONE_LABEL, NULL, counters);
+      biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
     }
 
 #if INCLUDE_RTM_OPT
@@ -7293,6 +7293,467 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
   bind(L_done);
 }
 
+#ifdef _LP64
+/**
+ * Helper for multiply_to_len().
+ */
+void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
+  addq(dest_lo, src1);
+  adcq(dest_hi, 0);
+  addq(dest_lo, src2);
+  adcq(dest_hi, 0);
+}
+
+/**
+ * Multiply 64 bit by 64 bit first loop.
+ */
+void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
+                                           Register y, Register y_idx, Register z,
+                                           Register carry, Register product,
+                                           Register idx, Register kdx) {
+  //
+  //  jlong carry, x[], y[], z[];
+  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
+  //    huge_128 product = y[idx] * x[xstart] + carry;
+  //    z[kdx] = (jlong)product;
+  //    carry  = (jlong)(product >>> 64);
+  //  }
+  //  z[xstart] = carry;
+  //
+
+  Label L_first_loop, L_first_loop_exit;
+  Label L_one_x, L_one_y, L_multiply;
+
+  decrementl(xstart);
+  jcc(Assembler::negative, L_one_x);
+
+  movq(x_xstart, Address(x, xstart, Address::times_4,  0));
+  rorq(x_xstart, 32); // convert big-endian to little-endian
+
+  bind(L_first_loop);
+  decrementl(idx);
+  jcc(Assembler::negative, L_first_loop_exit);
+  decrementl(idx);
+  jcc(Assembler::negative, L_one_y);
+  movq(y_idx, Address(y, idx, Address::times_4,  0));
+  rorq(y_idx, 32); // convert big-endian to little-endian
+  bind(L_multiply);
+  movq(product, x_xstart);
+  mulq(y_idx); // product(rax) * y_idx -> rdx:rax
+  addq(product, carry);
+  adcq(rdx, 0);
+  subl(kdx, 2);
+  movl(Address(z, kdx, Address::times_4,  4), product);
+  shrq(product, 32);
+  movl(Address(z, kdx, Address::times_4,  0), product);
+  movq(carry, rdx);
+  jmp(L_first_loop);
+
+  bind(L_one_y);
+  movl(y_idx, Address(y,  0));
+  jmp(L_multiply);
+
+  bind(L_one_x);
+  movl(x_xstart, Address(x,  0));
+  jmp(L_first_loop);
+
+  bind(L_first_loop_exit);
+}
+
+/**
+ * Multiply 64 bit by 64 bit and add 128 bit.
+ */
+void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
+                                            Register yz_idx, Register idx,
+                                            Register carry, Register product, int offset) {
+  //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
+  //     z[kdx] = (jlong)product;
+
+  movq(yz_idx, Address(y, idx, Address::times_4,  offset));
+  rorq(yz_idx, 32); // convert big-endian to little-endian
+  movq(product, x_xstart);
+  mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
+  movq(yz_idx, Address(z, idx, Address::times_4,  offset));
+  rorq(yz_idx, 32); // convert big-endian to little-endian
+
+  add2_with_carry(rdx, product, carry, yz_idx);
+
+  movl(Address(z, idx, Address::times_4,  offset+4), product);
+  shrq(product, 32);
+  movl(Address(z, idx, Address::times_4,  offset), product);
+
+}
+
+/**
+ * Multiply 128 bit by 128 bit. Unrolled inner loop.
+ */
+void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
+                                             Register yz_idx, Register idx, Register jdx,
+                                             Register carry, Register product,
+                                             Register carry2) {
+  //   jlong carry, x[], y[], z[];
+  //   int kdx = ystart+1;
+  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
+  //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
+  //     z[kdx+idx+1] = (jlong)product;
+  //     jlong carry2  = (jlong)(product >>> 64);
+  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
+  //     z[kdx+idx] = (jlong)product;
+  //     carry  = (jlong)(product >>> 64);
+  //   }
+  //   idx += 2;
+  //   if (idx > 0) {
+  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
+  //     z[kdx+idx] = (jlong)product;
+  //     carry  = (jlong)(product >>> 64);
+  //   }
+  //
+
+  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
+
+  movl(jdx, idx);
+  andl(jdx, 0xFFFFFFFC);
+  shrl(jdx, 2);
+
+  bind(L_third_loop);
+  subl(jdx, 1);
+  jcc(Assembler::negative, L_third_loop_exit);
+  subl(idx, 4);
+
+  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
+  movq(carry2, rdx);
+
+  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
+  movq(carry, rdx);
+  jmp(L_third_loop);
+
+  bind (L_third_loop_exit);
+
+  andl (idx, 0x3);
+  jcc(Assembler::zero, L_post_third_loop_done);
+
+  Label L_check_1;
+  subl(idx, 2);
+  jcc(Assembler::negative, L_check_1);
+
+  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
+  movq(carry, rdx);
+
+  bind (L_check_1);
+  addl (idx, 0x2);
+  andl (idx, 0x1);
+  subl(idx, 1);
+  jcc(Assembler::negative, L_post_third_loop_done);
+
+  movl(yz_idx, Address(y, idx, Address::times_4,  0));
+  movq(product, x_xstart);
+  mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
+  movl(yz_idx, Address(z, idx, Address::times_4,  0));
+
+  add2_with_carry(rdx, product, yz_idx, carry);
+
+  movl(Address(z, idx, Address::times_4,  0), product);
+  shrq(product, 32);
+
+  shlq(rdx, 32);
+  orq(product, rdx);
+  movq(carry, product);
+
+  bind(L_post_third_loop_done);
+}
+
+/**
+ * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
+ *
+ */
+void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
+                                                  Register carry, Register carry2,
+                                                  Register idx, Register jdx,
+                                                  Register yz_idx1, Register yz_idx2,
+                                                  Register tmp, Register tmp3, Register tmp4) {
+  assert(UseBMI2Instructions, "should be used only when BMI2 is available");
+
+  //   jlong carry, x[], y[], z[];
+  //   int kdx = ystart+1;
+  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
+  //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
+  //     jlong carry2  = (jlong)(tmp3 >>> 64);
+  //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
+  //     carry  = (jlong)(tmp4 >>> 64);
+  //     z[kdx+idx+1] = (jlong)tmp3;
+  //     z[kdx+idx] = (jlong)tmp4;
+  //   }
+  //   idx += 2;
+  //   if (idx > 0) {
+  //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
+  //     z[kdx+idx] = (jlong)yz_idx1;
+  //     carry  = (jlong)(yz_idx1 >>> 64);
+  //   }
+  //
+
+  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
+
+  movl(jdx, idx);
+  andl(jdx, 0xFFFFFFFC);
+  shrl(jdx, 2);
+
+  bind(L_third_loop);
+  subl(jdx, 1);
+  jcc(Assembler::negative, L_third_loop_exit);
+  subl(idx, 4);
+
+  movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
+  rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
+  movq(yz_idx2, Address(y, idx, Address::times_4,  0));
+  rorxq(yz_idx2, yz_idx2, 32);
+
+  mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
+  mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
+
+  movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
+  rorxq(yz_idx1, yz_idx1, 32);
+  movq(yz_idx2, Address(z, idx, Address::times_4,  0));
+  rorxq(yz_idx2, yz_idx2, 32);
+
+  if (VM_Version::supports_adx()) {
+    adcxq(tmp3, carry);
+    adoxq(tmp3, yz_idx1);
+
+    adcxq(tmp4, tmp);
+    adoxq(tmp4, yz_idx2);
+
+    movl(carry, 0); // does not affect flags
+    adcxq(carry2, carry);
+    adoxq(carry2, carry);
+  } else {
+    add2_with_carry(tmp4, tmp3, carry, yz_idx1);
+    add2_with_carry(carry2, tmp4, tmp, yz_idx2);
+  }
+  movq(carry, carry2);
+
+  movl(Address(z, idx, Address::times_4, 12), tmp3);
+  shrq(tmp3, 32);
+  movl(Address(z, idx, Address::times_4,  8), tmp3);
+
+  movl(Address(z, idx, Address::times_4,  4), tmp4);
+  shrq(tmp4, 32);
+  movl(Address(z, idx, Address::times_4,  0), tmp4);
+
+  jmp(L_third_loop);
+
+  bind (L_third_loop_exit);
+
+  andl (idx, 0x3);
+  jcc(Assembler::zero, L_post_third_loop_done);
+
+  Label L_check_1;
+  subl(idx, 2);
+  jcc(Assembler::negative, L_check_1);
+
+  movq(yz_idx1, Address(y, idx, Address::times_4,  0));
+  rorxq(yz_idx1, yz_idx1, 32);
+  mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
+  movq(yz_idx2, Address(z, idx, Address::times_4,  0));
+  rorxq(yz_idx2, yz_idx2, 32);
+
+  add2_with_carry(tmp4, tmp3, carry, yz_idx2);
+
+  movl(Address(z, idx, Address::times_4,  4), tmp3);
+  shrq(tmp3, 32);
+  movl(Address(z, idx, Address::times_4,  0), tmp3);
+  movq(carry, tmp4);
+
+  bind (L_check_1);
+  addl (idx, 0x2);
+  andl (idx, 0x1);
+  subl(idx, 1);
+  jcc(Assembler::negative, L_post_third_loop_done);
+  movl(tmp4, Address(y, idx, Address::times_4,  0));
+  mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
+  movl(tmp4, Address(z, idx, Address::times_4,  0));
+
+  add2_with_carry(carry2, tmp3, tmp4, carry);
+
+  movl(Address(z, idx, Address::times_4,  0), tmp3);
+  shrq(tmp3, 32);
+
+  shlq(carry2, 32);
+  orq(tmp3, carry2);
+  movq(carry, tmp3);
+
+  bind(L_post_third_loop_done);
+}
+
+/**
+ * Code for BigInteger::multiplyToLen() instrinsic.
+ *
+ * rdi: x
+ * rax: xlen
+ * rsi: y
+ * rcx: ylen
+ * r8:  z
+ * r11: zlen
+ * r12: tmp1
+ * r13: tmp2
+ * r14: tmp3
+ * r15: tmp4
+ * rbx: tmp5
+ *
+ */
+void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
+                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
+  ShortBranchVerifier sbv(this);
+  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
+
+  push(tmp1);
+  push(tmp2);
+  push(tmp3);
+  push(tmp4);
+  push(tmp5);
+
+  push(xlen);
+  push(zlen);
+
+  const Register idx = tmp1;
+  const Register kdx = tmp2;
+  const Register xstart = tmp3;
+
+  const Register y_idx = tmp4;
+  const Register carry = tmp5;
+  const Register product  = xlen;
+  const Register x_xstart = zlen;  // reuse register
+
+  // First Loop.
+  //
+  //  final static long LONG_MASK = 0xffffffffL;
+  //  int xstart = xlen - 1;
+  //  int ystart = ylen - 1;
+  //  long carry = 0;
+  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
+  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
+  //    z[kdx] = (int)product;
+  //    carry = product >>> 32;
+  //  }
+  //  z[xstart] = (int)carry;
+  //
+
+  movl(idx, ylen);      // idx = ylen;
+  movl(kdx, zlen);      // kdx = xlen+ylen;
+  xorq(carry, carry);   // carry = 0;
+
+  Label L_done;
+
+  movl(xstart, xlen);
+  decrementl(xstart);
+  jcc(Assembler::negative, L_done);
+
+  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
+
+  Label L_second_loop;
+  testl(kdx, kdx);
+  jcc(Assembler::zero, L_second_loop);
+
+  Label L_carry;
+  subl(kdx, 1);
+  jcc(Assembler::zero, L_carry);
+
+  movl(Address(z, kdx, Address::times_4,  0), carry);
+  shrq(carry, 32);
+  subl(kdx, 1);
+
+  bind(L_carry);
+  movl(Address(z, kdx, Address::times_4,  0), carry);
+
+  // Second and third (nested) loops.
+  //
+  // for (int i = xstart-1; i >= 0; i--) { // Second loop
+  //   carry = 0;
+  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
+  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
+  //                    (z[k] & LONG_MASK) + carry;
+  //     z[k] = (int)product;
+  //     carry = product >>> 32;
+  //   }
+  //   z[i] = (int)carry;
+  // }
+  //
+  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
+
+  const Register jdx = tmp1;
+
+  bind(L_second_loop);
+  xorl(carry, carry);    // carry = 0;
+  movl(jdx, ylen);       // j = ystart+1
+
+  subl(xstart, 1);       // i = xstart-1;
+  jcc(Assembler::negative, L_done);
+
+  push (z);
+
+  Label L_last_x;
+  lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
+  subl(xstart, 1);       // i = xstart-1;
+  jcc(Assembler::negative, L_last_x);
+
+  if (UseBMI2Instructions) {
+    movq(rdx,  Address(x, xstart, Address::times_4,  0));
+    rorxq(rdx, rdx, 32); // convert big-endian to little-endian
+  } else {
+    movq(x_xstart, Address(x, xstart, Address::times_4,  0));
+    rorq(x_xstart, 32);  // convert big-endian to little-endian
+  }
+
+  Label L_third_loop_prologue;
+  bind(L_third_loop_prologue);
+
+  push (x);
+  push (xstart);
+  push (ylen);
+
+
+  if (UseBMI2Instructions) {
+    multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
+  } else { // !UseBMI2Instructions
+    multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
+  }
+
+  pop(ylen);
+  pop(xlen);
+  pop(x);
+  pop(z);
+
+  movl(tmp3, xlen);
+  addl(tmp3, 1);
+  movl(Address(z, tmp3, Address::times_4,  0), carry);
+  subl(tmp3, 1);
+  jccb(Assembler::negative, L_done);
+
+  shrq(carry, 32);
+  movl(Address(z, tmp3, Address::times_4,  0), carry);
+  jmp(L_second_loop);
+
+  // Next infrequent code is moved outside loops.
+  bind(L_last_x);
+  if (UseBMI2Instructions) {
+    movl(rdx, Address(x,  0));
+  } else {
+    movl(x_xstart, Address(x,  0));
+  }
+  jmp(L_third_loop_prologue);
+
+  bind(L_done);
+
+  pop(zlen);
+  pop(xlen);
+
+  pop(tmp5);
+  pop(tmp4);
+  pop(tmp3);
+  pop(tmp2);
+  pop(tmp1);
+}
+#endif
+
 /**
  * Emits code to update CRC-32 with a byte value according to constants in table
  *
diff --git a/src/cpu/x86/vm/macroAssembler_x86.hpp b/src/cpu/x86/vm/macroAssembler_x86.hpp
index 3b3073e63..69c9e8aa3 100644
--- a/src/cpu/x86/vm/macroAssembler_x86.hpp
+++ b/src/cpu/x86/vm/macroAssembler_x86.hpp
@@ -1221,6 +1221,28 @@ public:
                         XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
                         XMMRegister tmp4, Register tmp5, Register result);
 
+#ifdef _LP64
+  void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2);
+  void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
+                             Register y, Register y_idx, Register z,
+                             Register carry, Register product,
+                             Register idx, Register kdx);
+  void multiply_add_128_x_128(Register x_xstart, Register y, Register z,
+                              Register yz_idx, Register idx,
+                              Register carry, Register product, int offset);
+  void multiply_128_x_128_bmi2_loop(Register y, Register z,
+                                    Register carry, Register carry2,
+                                    Register idx, Register jdx,
+                                    Register yz_idx1, Register yz_idx2,
+                                    Register tmp, Register tmp3, Register tmp4);
+  void multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
+                               Register yz_idx, Register idx, Register jdx,
+                               Register carry, Register product,
+                               Register carry2);
+  void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
+                       Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
+#endif
+
   // CRC32 code for java.util.zip.CRC32::updateBytes() instrinsic.
   void update_byte_crc32(Register crc, Register val, Register table);
   void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
diff --git a/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
index 0adb0d31e..0000146f5 100644
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
@@ -3677,6 +3677,70 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
 
+
+  /**
+   *  Arguments:
+   *
+   *  Input:
+   *    c_rarg0   - x address
+   *    c_rarg1   - x length
+   *    c_rarg2   - y address
+   *    c_rarg3   - y lenth
+   * not Win64
+   *    c_rarg4   - z address
+   *    c_rarg5   - z length
+   * Win64
+   *    rsp+40    - z address
+   *    rsp+48    - z length
+   */
+  address generate_multiplyToLen() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
+
+    address start = __ pc();
+    // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
+    // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
+    const Register x     = rdi;
+    const Register xlen  = rax;
+    const Register y     = rsi;
+    const Register ylen  = rcx;
+    const Register z     = r8;
+    const Register zlen  = r11;
+
+    // Next registers will be saved on stack in multiply_to_len().
+    const Register tmp1  = r12;
+    const Register tmp2  = r13;
+    const Register tmp3  = r14;
+    const Register tmp4  = r15;
+    const Register tmp5  = rbx;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifndef _WIN64
+    __ movptr(zlen, r9); // Save r9 in r11 - zlen
+#endif
+    setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
+                       // ylen => rcx, z => r8, zlen => r11
+                       // r9 and r10 may be used to save non-volatile registers
+#ifdef _WIN64
+    // last 2 arguments (#4, #5) are on stack on Win64
+    __ movptr(z, Address(rsp, 6 * wordSize));
+    __ movptr(zlen, Address(rsp, 7 * wordSize));
+#endif
+
+    __ movptr(xlen, rsi);
+    __ movptr(y,    rdx);
+    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
+
+    restore_arg_regs();
+
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    return start;
+  }
+
 #undef __
 #define __ masm->
 
@@ -3917,6 +3981,11 @@ class StubGenerator: public StubCodeGenerator {
     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                        &StubRoutines::_safefetchN_fault_pc,
                                                        &StubRoutines::_safefetchN_continuation_pc);
+#ifdef COMPILER2
+    if (UseMultiplyToLenIntrinsic) {
+      StubRoutines::_multiplyToLen = generate_multiplyToLen();
+    }
+#endif
   }
 
  public:
diff --git a/src/cpu/x86/vm/vm_version_x86.cpp b/src/cpu/x86/vm/vm_version_x86.cpp
index 8cb93b2cc..adfdfd67e 100644
--- a/src/cpu/x86/vm/vm_version_x86.cpp
+++ b/src/cpu/x86/vm/vm_version_x86.cpp
@@ -493,7 +493,7 @@ void VM_Version::get_processor_features() {
   }
 
   char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                cores_per_cpu(), threads_per_core(),
                cpu_family(), _model, _stepping,
                (supports_cmov() ? ", cmov" : ""),
@@ -522,7 +522,8 @@ void VM_Version::get_processor_features() {
                (supports_tscinv_bit() ? ", tscinvbit": ""),
                (supports_tscinv() ? ", tscinv": ""),
                (supports_bmi1() ? ", bmi1" : ""),
-               (supports_bmi2() ? ", bmi2" : ""));
+               (supports_bmi2() ? ", bmi2" : ""),
+               (supports_adx() ? ", adx" : ""));
   _features_str = strdup(buf);
 
   // UseSSE is set to the smaller of what hardware supports and what
@@ -574,7 +575,7 @@ void VM_Version::get_processor_features() {
     }
   } else if (UseCRC32Intrinsics) {
     if (!FLAG_IS_DEFAULT(UseCRC32Intrinsics))
-      warning("CRC32 Intrinsics requires AVX and CLMUL instructions (not available on this CPU)");
+      warning("CRC32 Intrinsics requires CLMUL instructions (not available on this CPU)");
     FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
   }
 
@@ -612,6 +613,17 @@ void VM_Version::get_processor_features() {
 
 #if INCLUDE_RTM_OPT
   if (UseRTMLocking) {
+    if (is_intel_family_core()) {
+      if ((_model == CPU_MODEL_HASWELL_E3) ||
+          (_model == CPU_MODEL_HASWELL_E7 && _stepping < 3) ||
+          (_model == CPU_MODEL_BROADWELL  && _stepping < 4)) {
+        if (!UnlockExperimentalVMOptions) {
+          vm_exit_during_initialization("UseRTMLocking is only available as experimental option on this platform. It must be enabled via -XX:+UnlockExperimentalVMOptions flag.");
+        } else {
+          warning("UseRTMLocking is only available as experimental option on this platform.");
+        }
+      }
+    }
     if (!FLAG_IS_CMDLINE(UseRTMLocking)) {
       // RTM locking should be used only for applications with
       // high lock contention. For now we do not use it by default.
@@ -686,7 +698,20 @@ void VM_Version::get_processor_features() {
     }
 #endif
   }
+
+#ifdef _LP64
+  if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
+    UseMultiplyToLenIntrinsic = true;
+  }
+#else
+  if (UseMultiplyToLenIntrinsic) {
+    if (!FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
+      warning("multiplyToLen intrinsic is not available in 32-bit VM");
+    }
+    FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, false);
+  }
 #endif
+#endif // COMPILER2
 
   // On new cpus instructions which update whole XMM register should be used
   // to prevent partial register stall due to dependencies on high half.
@@ -829,6 +854,9 @@ void VM_Version::get_processor_features() {
         }
       }
     }
+    if(FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) {
+      AllocatePrefetchInstr = 3;
+    }
   }
 
   // Use count leading zeros count instruction if available.
@@ -841,23 +869,40 @@ void VM_Version::get_processor_features() {
     FLAG_SET_DEFAULT(UseCountLeadingZerosInstruction, false);
   }
 
+  // Use count trailing zeros instruction if available
   if (supports_bmi1()) {
+    // tzcnt does not require VEX prefix
+    if (FLAG_IS_DEFAULT(UseCountTrailingZerosInstruction)) {
+      if (!UseBMI1Instructions && !FLAG_IS_DEFAULT(UseBMI1Instructions)) {
+        // Don't use tzcnt if BMI1 is switched off on command line.
+        UseCountTrailingZerosInstruction = false;
+      } else {
+        UseCountTrailingZerosInstruction = true;
+      }
+    }
+  } else if (UseCountTrailingZerosInstruction) {
+    warning("tzcnt instruction is not available on this CPU");
+    FLAG_SET_DEFAULT(UseCountTrailingZerosInstruction, false);
+  }
+
+  // BMI instructions (except tzcnt) use an encoding with VEX prefix.
+  // VEX prefix is generated only when AVX > 0.
+  if (supports_bmi1() && supports_avx()) {
     if (FLAG_IS_DEFAULT(UseBMI1Instructions)) {
       UseBMI1Instructions = true;
     }
   } else if (UseBMI1Instructions) {
-    warning("BMI1 instructions are not available on this CPU");
+    warning("BMI1 instructions are not available on this CPU (AVX is also required)");
     FLAG_SET_DEFAULT(UseBMI1Instructions, false);
   }
 
-  // Use count trailing zeros instruction if available
-  if (supports_bmi1()) {
-    if (FLAG_IS_DEFAULT(UseCountTrailingZerosInstruction)) {
-      UseCountTrailingZerosInstruction = UseBMI1Instructions;
+  if (supports_bmi2() && supports_avx()) {
+    if (FLAG_IS_DEFAULT(UseBMI2Instructions)) {
+      UseBMI2Instructions = true;
     }
-  } else if (UseCountTrailingZerosInstruction) {
-    warning("tzcnt instruction is not available on this CPU");
-    FLAG_SET_DEFAULT(UseCountTrailingZerosInstruction, false);
+  } else if (UseBMI2Instructions) {
+    warning("BMI2 instructions are not available on this CPU (AVX is also required)");
+    FLAG_SET_DEFAULT(UseBMI2Instructions, false);
   }
 
   // Use population count instruction if available.
diff --git a/src/cpu/x86/vm/vm_version_x86.hpp b/src/cpu/x86/vm/vm_version_x86.hpp
index 51f6e4f2f..1ad94e38b 100644
--- a/src/cpu/x86/vm/vm_version_x86.hpp
+++ b/src/cpu/x86/vm/vm_version_x86.hpp
@@ -209,7 +209,9 @@ public:
                    erms : 1,
                         : 1,
                    rtm  : 1,
-                        : 20;
+                        : 7,
+                   adx  : 1,
+                        : 12;
     } bits;
   };
 
@@ -260,7 +262,8 @@ protected:
     CPU_CLMUL  = (1 << 21), // carryless multiply for CRC
     CPU_BMI1   = (1 << 22),
     CPU_BMI2   = (1 << 23),
-    CPU_RTM    = (1 << 24)  // Restricted Transactional Memory instructions
+    CPU_RTM    = (1 << 24),  // Restricted Transactional Memory instructions
+    CPU_ADX    = (1 << 25)
   } cpuFeatureFlags;
 
   enum {
@@ -276,7 +279,10 @@ protected:
     CPU_MODEL_WESTMERE_EX    = 0x2f,
     CPU_MODEL_SANDYBRIDGE    = 0x2a,
     CPU_MODEL_SANDYBRIDGE_EP = 0x2d,
-    CPU_MODEL_IVYBRIDGE_EP   = 0x3a
+    CPU_MODEL_IVYBRIDGE_EP   = 0x3a,
+    CPU_MODEL_HASWELL_E3     = 0x3c,
+    CPU_MODEL_HASWELL_E7     = 0x3f,
+    CPU_MODEL_BROADWELL      = 0x3d
   } cpuExtendedFamily;
 
   // cpuid information block.  All info derived from executing cpuid with
@@ -462,10 +468,16 @@ protected:
     }
     // Intel features.
     if(is_intel()) {
+      if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
+         result |= CPU_ADX;
       if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
         result |= CPU_BMI2;
       if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
         result |= CPU_LZCNT;
+      // for Intel, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
+      if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
+        result |= CPU_3DNOW_PREFETCH;
+      }
     }
 
     return result;
@@ -618,6 +630,7 @@ public:
   static bool supports_rtm()      { return (_cpuFeatures & CPU_RTM) != 0; }
   static bool supports_bmi1()     { return (_cpuFeatures & CPU_BMI1) != 0; }
   static bool supports_bmi2()     { return (_cpuFeatures & CPU_BMI2) != 0; }
+  static bool supports_adx()     { return (_cpuFeatures & CPU_ADX) != 0; }
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
                                        extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
author	Denis S. Fokin <Denis.Fokin@gmail.com>	2015-10-29 13:33:55 +0300
committer	Denis S. Fokin <Denis.Fokin@gmail.com>	2015-10-29 13:33:55 +0300
commit	24a09c463b2d328b6fc90ee555023514efca507c (patch)
tree	9323dfc9d767b6af41ec7ed408cc32eb5775b93e /src/cpu/x86/vm
parent	07ad54003f167bf4f38b23fdc8a677a37dc91ea9 (diff)
parent	b6bff51db351a645334f3b9097637e0e0a76dbed (diff)
download	jdk8u_hotspot-24a09c463b2d328b6fc90ee555023514efca507c.tar.gz