author     ascarpino <unknown>  2015-06-17 17:48:25 -0700
committer  bell-sw <liberica@bell-sw.com>  2020-01-19 09:12:36 +0300
commit     1b68125360866b989f96f59998e56e5c88a1d1d3 (patch)
tree       9403d00ef088d985c127fb1a40cfe61a869ed3c6 /src/cpu
parent     4272c7cbfa33d827ca9418a3480388d67e6bebc7 (diff)
download   jdk8u_hotspot-1b68125360866b989f96f59998e56e5c88a1d1d3.tar.gz
8073108: Use x86 and SPARC CPU instructions for GHASH acceleration
Reviewed-by: kvn, jrose, phh
Diffstat (limited to 'src/cpu')
-rw-r--r--  src/cpu/ppc/vm/vm_version_ppc.cpp          5
-rw-r--r--  src/cpu/sparc/vm/assembler_sparc.hpp       8
-rw-r--r--  src/cpu/sparc/vm/stubGenerator_sparc.cpp 128
-rw-r--r--  src/cpu/sparc/vm/vm_version_sparc.cpp     11
-rw-r--r--  src/cpu/x86/vm/assembler_x86.cpp           9
-rw-r--r--  src/cpu/x86/vm/assembler_x86.hpp           2
-rw-r--r--  src/cpu/x86/vm/stubGenerator_x86_32.cpp  168
-rw-r--r--  src/cpu/x86/vm/stubGenerator_x86_64.cpp  176
-rw-r--r--  src/cpu/x86/vm/stubRoutines_x86.cpp        4
-rw-r--r--  src/cpu/x86/vm/stubRoutines_x86.hpp        7
-rw-r--r--  src/cpu/x86/vm/vm_version_x86.cpp         11
11 files changed, 526 insertions, 3 deletions
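
For reference, the operation all of the stubs below accelerate is GHASH (NIST SP 800-38D): for each 16-byte block, state = (state ^ block) * subkeyH in GF(2^128), reduced by the polynomial x^128 + x^7 + x^2 + x + 1 with bit-reflected element encoding. The following is a minimal, unoptimized C++ sketch of the same state/subkeyH/data/blocks interface the stubs implement; all names are illustrative, not HotSpot code.

    #include <cstdint>
    #include <cstddef>

    // Multiply x (x_hi:x_lo) by h in GF(2^128) per NIST SP 800-38D,
    // Algorithm 1. The reduction constant 0xE1 shifted into the top byte
    // is the same 0xE1 the SPARC stub below loads.
    static void gf128_mul(uint64_t& x_hi, uint64_t& x_lo,
                          uint64_t h_hi, uint64_t h_lo) {
      uint64_t z_hi = 0, z_lo = 0;
      uint64_t v_hi = h_hi, v_lo = h_lo;
      for (int i = 0; i < 128; i++) {
        // Scan the bits of x MSB-first (GHASH bit order is reflected).
        uint64_t bit = (i < 64) ? (x_hi >> (63 - i)) & 1
                                : (x_lo >> (127 - i)) & 1;
        if (bit) { z_hi ^= v_hi; z_lo ^= v_lo; }
        // V = V >> 1, folding the dropped bit back in via R = 0xE1 || 0^120.
        uint64_t carry = v_lo & 1;
        v_lo = (v_lo >> 1) | (v_hi << 63);
        v_hi >>= 1;
        if (carry) v_hi ^= 0xE100000000000000ULL;
      }
      x_hi = z_hi;
      x_lo = z_lo;
    }

    // state and subkeyH hold two big-endian 64-bit words each, the layout
    // the stubs load with ldx (SPARC) or movdqu+pshufb (x86).
    static void ghash_process_blocks(uint64_t state[2], const uint64_t subkeyH[2],
                                     const uint8_t* data, size_t blocks) {
      for (size_t b = 0; b < blocks; b++, data += 16) {
        uint64_t d_hi = 0, d_lo = 0;
        for (int i = 0; i < 8; i++)  d_hi = (d_hi << 8) | data[i];
        for (int i = 8; i < 16; i++) d_lo = (d_lo << 8) | data[i];
        state[0] ^= d_hi;                                       // state ^= block
        state[1] ^= d_lo;
        gf128_mul(state[0], state[1], subkeyH[0], subkeyH[1]);  // state *= H
      }
    }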
diff --git a/src/cpu/ppc/vm/vm_version_ppc.cpp b/src/cpu/ppc/vm/vm_version_ppc.cpp
index bbc52691b..3c59c96ca 100644
--- a/src/cpu/ppc/vm/vm_version_ppc.cpp
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp
@@ -194,6 +194,11 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseAESIntrinsics, false);
}
+ if (UseGHASHIntrinsics) {
+ warning("GHASH intrinsics are not available on this CPU");
+ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+ }
+
if (has_vshasig()) {
if (FLAG_IS_DEFAULT(UseSHA)) {
UseSHA = true;
diff --git a/src/cpu/sparc/vm/assembler_sparc.hpp b/src/cpu/sparc/vm/assembler_sparc.hpp
index dd83b092f..55f338754 100644
--- a/src/cpu/sparc/vm/assembler_sparc.hpp
+++ b/src/cpu/sparc/vm/assembler_sparc.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -129,6 +129,7 @@ class Assembler : public AbstractAssembler {
flog3_op3 = 0x36,
edge_op3 = 0x36,
fsrc_op3 = 0x36,
+ xmulx_op3 = 0x36,
impdep2_op3 = 0x37,
stpartialf_op3 = 0x37,
jmpl_op3 = 0x38,
@@ -220,6 +221,8 @@ class Assembler : public AbstractAssembler {
mdtox_opf = 0x110,
mstouw_opf = 0x111,
mstosw_opf = 0x113,
+ xmulx_opf = 0x115,
+ xmulxhi_opf = 0x116,
mxtod_opf = 0x118,
mwtos_opf = 0x119,
@@ -1212,6 +1215,9 @@ public:
void movwtos( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::S) | op3(mftoi_op3) | opf(mwtos_opf) | rs2(s)); }
void movxtod( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(mftoi_op3) | opf(mxtod_opf) | rs2(s)); }
+ void xmulx(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulx_opf) | rs2(s2)); }
+ void xmulxhi(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulxhi_opf) | rs2(s2)); }
+
// Crypto SHA instructions
void sha1() { sha1_only(); emit_int32( op(arith_op) | op3(sha_op3) | opf(sha1_opf)); }
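
The new xmulx/xmulxhi instructions return the low and high 64 bits of a 64x64-bit carry-less product, the same primitive pclmulqdq supplies on x86. A plain C++ sketch of the semantics (illustrative helper, not part of the patch):

    #include <cstdint>

    // Carry-less 64x64 -> 128-bit multiply: XOR together a << i for every
    // set bit i of b. xmulx yields 'lo', xmulxhi yields 'hi'.
    static void clmul64(uint64_t a, uint64_t b, uint64_t& hi, uint64_t& lo) {
      hi = 0; lo = 0;
      for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
          lo ^= a << i;
          if (i != 0) hi ^= a >> (64 - i);  // bits of a*x^i above bit 63
        }
      }
    }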
diff --git a/src/cpu/sparc/vm/stubGenerator_sparc.cpp b/src/cpu/sparc/vm/stubGenerator_sparc.cpp
index 6ad0b1a41..51064927e 100644
--- a/src/cpu/sparc/vm/stubGenerator_sparc.cpp
+++ b/src/cpu/sparc/vm/stubGenerator_sparc.cpp
@@ -4788,6 +4788,130 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
+ /* Single and multi-block ghash operations */
+ address generate_ghash_processBlocks() {
+ __ align(CodeEntryAlignment);
+ Label L_ghash_loop, L_aligned, L_main;
+ StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+ address start = __ pc();
+
+ Register state = I0;
+ Register subkeyH = I1;
+ Register data = I2;
+ Register len = I3;
+
+ __ save_frame(0);
+
+ __ ldx(state, 0, O0);
+ __ ldx(state, 8, O1);
+
+ // Loop label for multiblock operations
+ __ BIND(L_ghash_loop);
+
+ // Check if 'data' is unaligned
+ __ andcc(data, 7, G1);
+ __ br(Assembler::zero, false, Assembler::pt, L_aligned);
+ __ delayed()->nop();
+
+ Register left_shift = L1;
+ Register right_shift = L2;
+ Register data_ptr = L3;
+
+ // Get left and right shift values in bits
+ __ sll(G1, LogBitsPerByte, left_shift);
+ __ mov(64, right_shift);
+ __ sub(right_shift, left_shift, right_shift);
+
+ // Align to read 'data'
+ __ sub(data, G1, data_ptr);
+
+ // Load first 8 bytes of 'data'
+ __ ldx(data_ptr, 0, O4);
+ __ sllx(O4, left_shift, O4);
+ __ ldx(data_ptr, 8, O5);
+ __ srlx(O5, right_shift, G4);
+ __ bset(G4, O4);
+
+ // Load second 8 bytes of 'data'
+ __ sllx(O5, left_shift, O5);
+ __ ldx(data_ptr, 16, G4);
+ __ srlx(G4, right_shift, G4);
+ __ ba(L_main);
+ __ delayed()->bset(G4, O5);
+
+ // If 'data' is aligned, load normally
+ __ BIND(L_aligned);
+ __ ldx(data, 0, O4);
+ __ ldx(data, 8, O5);
+
+ __ BIND(L_main);
+ __ ldx(subkeyH, 0, O2);
+ __ ldx(subkeyH, 8, O3);
+
+ __ xor3(O0, O4, O0);
+ __ xor3(O1, O5, O1);
+
+ __ xmulxhi(O0, O3, G3);
+ __ xmulx(O0, O2, O5);
+ __ xmulxhi(O1, O2, G4);
+ __ xmulxhi(O1, O3, G5);
+ __ xmulx(O0, O3, G1);
+ __ xmulx(O1, O3, G2);
+ __ xmulx(O1, O2, O3);
+ __ xmulxhi(O0, O2, O4);
+
+ __ mov(0xE1, O0);
+ __ sllx(O0, 56, O0);
+
+ __ xor3(O5, G3, O5);
+ __ xor3(O5, G4, O5);
+ __ xor3(G5, G1, G1);
+ __ xor3(G1, O3, G1);
+ __ srlx(G2, 63, O1);
+ __ srlx(G1, 63, G3);
+ __ sllx(G2, 63, O3);
+ __ sllx(G2, 58, O2);
+ __ xor3(O3, O2, O2);
+
+ __ sllx(G1, 1, G1);
+ __ or3(G1, O1, G1);
+
+ __ xor3(G1, O2, G1);
+
+ __ sllx(G2, 1, G2);
+
+ __ xmulxhi(G1, O0, O1);
+ __ xmulx(G1, O0, O2);
+ __ xmulxhi(G2, O0, O3);
+ __ xmulx(G2, O0, G1);
+
+ __ xor3(O4, O1, O4);
+ __ xor3(O5, O2, O5);
+ __ xor3(O5, O3, O5);
+
+ __ sllx(O4, 1, O2);
+ __ srlx(O5, 63, O3);
+
+ __ or3(O2, O3, O0);
+
+ __ sllx(O5, 1, O1);
+ __ srlx(G1, 63, O2);
+ __ or3(O1, O2, O1);
+ __ xor3(O1, G3, O1);
+
+ __ deccc(len);
+ __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop);
+ __ delayed()->add(data, 16, data);
+
+ __ stx(O0, I0, 0);
+ __ stx(O1, I0, 8);
+
+ __ ret();
+ __ delayed()->restore();
+
+ return start;
+ }
+
void generate_initial() {
// Generates all stubs and initializes the entry points
@@ -4860,6 +4984,10 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
}
+ // generate GHASH intrinsics code
+ if (UseGHASHIntrinsics) {
+ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+ }
// generate SHA1/SHA256/SHA512 intrinsics code
if (UseSHA1Intrinsics) {
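
A note on the unaligned path in generate_ghash_processBlocks above: ldx traps on a misaligned address, so the stub rounds the data pointer down to an 8-byte boundary and splices each 64-bit word out of two aligned loads. The equivalent logic in plain C++, assuming a big-endian layout as on SPARC (like the stub, the spliced path may read a few bytes past the 16-byte block):

    #include <cstdint>

    static uint64_t load_be64_unaligned(const uint8_t* p) {
      uintptr_t mis = (uintptr_t)p & 7;                  // andcc(data, 7, G1)
      const uint64_t* q = (const uint64_t*)((uintptr_t)p - mis);
      if (mis == 0) return q[0];                         // L_aligned fast path
      int left  = (int)(mis * 8);                        // sll(G1, LogBitsPerByte, ...)
      int right = 64 - left;                             // mov(64, ...); sub(...)
      return (q[0] << left) | (q[1] >> right);           // sllx / srlx / bset
    }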
diff --git a/src/cpu/sparc/vm/vm_version_sparc.cpp b/src/cpu/sparc/vm/vm_version_sparc.cpp
index c0cd16a18..793dc184a 100644
--- a/src/cpu/sparc/vm/vm_version_sparc.cpp
+++ b/src/cpu/sparc/vm/vm_version_sparc.cpp
@@ -319,6 +319,17 @@ void VM_Version::initialize() {
}
}
+ // GHASH/GCM intrinsics
+ if (has_vis3() && (UseVIS > 2)) {
+ if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
+ UseGHASHIntrinsics = true;
+ }
+ } else if (UseGHASHIntrinsics) {
+ if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics))
+ warning("GHASH intrinsics require VIS3 insructions support. Intriniscs will be disabled");
+ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+ }
+
// SHA1, SHA256, and SHA512 instructions were added to SPARC T-series at different times
if (has_sha1() || has_sha256() || has_sha512()) {
if (UseVIS > 0) { // SHA intrinsics use VIS1 instructions
diff --git a/src/cpu/x86/vm/assembler_x86.cpp b/src/cpu/x86/vm/assembler_x86.cpp
index 7cbc47d60..1759ecdfd 100644
--- a/src/cpu/x86/vm/assembler_x86.cpp
+++ b/src/cpu/x86/vm/assembler_x86.cpp
@@ -2575,6 +2575,15 @@ void Assembler::psrldq(XMMRegister dst, int shift) {
emit_int8(shift);
}
+void Assembler::pslldq(XMMRegister dst, int shift) {
+ // Shift the 128-bit value in an XMM register left by 'shift' bytes.
+ NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+ int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66);
+ emit_int8(0x73);
+ emit_int8((unsigned char)(0xC0 | encode));
+ emit_int8(shift);
+}
+
void Assembler::ptest(XMMRegister dst, Address src) {
assert(VM_Version::supports_sse4_1(), "");
assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
diff --git a/src/cpu/x86/vm/assembler_x86.hpp b/src/cpu/x86/vm/assembler_x86.hpp
index 341d9e39b..5ea01311e 100644
--- a/src/cpu/x86/vm/assembler_x86.hpp
+++ b/src/cpu/x86/vm/assembler_x86.hpp
@@ -1527,6 +1527,8 @@ private:
// Shift Right by bytes Logical DoubleQuadword Immediate
void psrldq(XMMRegister dst, int shift);
+ // Shift Left by bytes Logical DoubleQuadword Immediate
+ void pslldq(XMMRegister dst, int shift);
// Logical Compare 128bit
void ptest(XMMRegister dst, XMMRegister src);
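
The stub generators below assemble the 128x128-bit product from four pclmulqdq calls with immediates 0, 16, 1 and 17. The immediate selects which 64-bit half of each operand is multiplied: bit 0 picks the destination half, bit 4 the source half. A sketch of that selection, reusing the illustrative clmul64 helper from the SPARC section:

    #include <cstdint>

    static void clmul64(uint64_t a, uint64_t b, uint64_t& hi, uint64_t& lo);  // earlier sketch

    // dst and src each hold a 128-bit value as {low, high} 64-bit words, so
    // imm = 0, 16, 1, 17 yields a0*b0, a0*b1, a1*b0, a1*b1 as in the stub
    // comments below.
    static void pclmulqdq_ref(uint64_t dst[2], const uint64_t src[2], int imm) {
      uint64_t a = (imm & 0x01) ? dst[1] : dst[0];  // imm bit 0: dst half
      uint64_t b = (imm & 0x10) ? src[1] : src[0];  // imm bit 4: src half
      clmul64(a, b, dst[1], dst[0]);                // 128-bit product replaces dst
    }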
diff --git a/src/cpu/x86/vm/stubGenerator_x86_32.cpp b/src/cpu/x86/vm/stubGenerator_x86_32.cpp
index 50a06d7a5..235cdb7ed 100644
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp
@@ -2719,6 +2719,167 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
+ // byte-swap mask for two x86 longs (one 128-bit value)
+ address generate_ghash_long_swap_mask() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
+ address start = __ pc();
+ __ emit_data(0x0b0a0908, relocInfo::none, 0);
+ __ emit_data(0x0f0e0d0c, relocInfo::none, 0);
+ __ emit_data(0x03020100, relocInfo::none, 0);
+ __ emit_data(0x07060504, relocInfo::none, 0);
+
+ return start;
+ }
+
+ // byte-swap mask for a 16-byte array
+ address generate_ghash_byte_swap_mask() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
+ address start = __ pc();
+ __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
+ __ emit_data(0x08090a0b, relocInfo::none, 0);
+ __ emit_data(0x04050607, relocInfo::none, 0);
+ __ emit_data(0x00010203, relocInfo::none, 0);
+ return start;
+ }
+
+ /* Single and multi-block ghash operations */
+ address generate_ghash_processBlocks() {
+ assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support");
+ __ align(CodeEntryAlignment);
+ Label L_ghash_loop, L_exit;
+ StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+ address start = __ pc();
+
+ const Register state = rdi;
+ const Register subkeyH = rsi;
+ const Register data = rdx;
+ const Register blocks = rcx;
+
+ const Address state_param(rbp, 8+0);
+ const Address subkeyH_param(rbp, 8+4);
+ const Address data_param(rbp, 8+8);
+ const Address blocks_param(rbp, 8+12);
+
+ const XMMRegister xmm_temp0 = xmm0;
+ const XMMRegister xmm_temp1 = xmm1;
+ const XMMRegister xmm_temp2 = xmm2;
+ const XMMRegister xmm_temp3 = xmm3;
+ const XMMRegister xmm_temp4 = xmm4;
+ const XMMRegister xmm_temp5 = xmm5;
+ const XMMRegister xmm_temp6 = xmm6;
+ const XMMRegister xmm_temp7 = xmm7;
+
+ __ enter();
+
+ __ movptr(state, state_param);
+ __ movptr(subkeyH, subkeyH_param);
+ __ movptr(data, data_param);
+ __ movptr(blocks, blocks_param);
+
+ __ movdqu(xmm_temp0, Address(state, 0));
+ __ pshufb(xmm_temp0, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+ __ movdqu(xmm_temp1, Address(subkeyH, 0));
+ __ pshufb(xmm_temp1, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+ __ BIND(L_ghash_loop);
+ __ movdqu(xmm_temp2, Address(data, 0));
+ __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
+
+ __ pxor(xmm_temp0, xmm_temp2);
+
+ //
+ // Multiply with the hash key
+ //
+ __ movdqu(xmm_temp3, xmm_temp0);
+ __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0
+ __ movdqu(xmm_temp4, xmm_temp0);
+ __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1
+
+ __ movdqu(xmm_temp5, xmm_temp0);
+ __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0
+ __ movdqu(xmm_temp6, xmm_temp0);
+ __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1
+
+ __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0
+
+ __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5
+ __ psrldq(xmm_temp4, 8); // shift xmm4 right by 64 bits
+ __ pslldq(xmm_temp5, 8); // shift xmm5 left by 64 bits
+ __ pxor(xmm_temp3, xmm_temp5);
+ __ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result
+ // of the carry-less multiplication of
+ // xmm0 by xmm1.
+
+ // We shift the result of the multiplication left by one bit to
+ // compensate for the fact that the bits are reversed.
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp4, xmm_temp6);
+ __ pslld(xmm_temp3, 1);
+ __ pslld(xmm_temp6, 1);
+ __ psrld(xmm_temp7, 31);
+ __ psrld(xmm_temp4, 31);
+ __ movdqu(xmm_temp5, xmm_temp7);
+ __ pslldq(xmm_temp4, 4);
+ __ pslldq(xmm_temp7, 4);
+ __ psrldq(xmm_temp5, 12);
+ __ por(xmm_temp3, xmm_temp7);
+ __ por(xmm_temp6, xmm_temp4);
+ __ por(xmm_temp6, xmm_temp5);
+
+ //
+ // First phase of the reduction
+ //
+ // Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts
+ // independently.
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp4, xmm_temp3);
+ __ movdqu(xmm_temp5, xmm_temp3);
+ __ pslld(xmm_temp7, 31); // packed left shift by 31
+ __ pslld(xmm_temp4, 30); // packed left shift by 30
+ __ pslld(xmm_temp5, 25); // packed left shift by 25
+ __ pxor(xmm_temp7, xmm_temp4); // xor the shifted versions
+ __ pxor(xmm_temp7, xmm_temp5);
+ __ movdqu(xmm_temp4, xmm_temp7);
+ __ pslldq(xmm_temp7, 12);
+ __ psrldq(xmm_temp4, 4);
+ __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete
+
+ //
+ // Second phase of the reduction
+ //
+ // Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these
+ // shift operations.
+ __ movdqu(xmm_temp2, xmm_temp3);
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp5, xmm_temp3);
+ __ psrld(xmm_temp2, 1); // packed right shift by 1
+ __ psrld(xmm_temp7, 2); // packed right shift by 2
+ __ psrld(xmm_temp5, 7); // packed right shift by 7
+ __ pxor(xmm_temp2, xmm_temp7); // xor the shifted versions
+ __ pxor(xmm_temp2, xmm_temp5);
+ __ pxor(xmm_temp2, xmm_temp4);
+ __ pxor(xmm_temp3, xmm_temp2);
+ __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6
+
+ __ decrement(blocks);
+ __ jcc(Assembler::zero, L_exit);
+ __ movdqu(xmm_temp0, xmm_temp6);
+ __ addptr(data, 16);
+ __ jmp(L_ghash_loop);
+
+ __ BIND(L_exit);
+ // Byte swap 16-byte result
+ __ pshufb(xmm_temp6, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+ __ movdqu(Address(state, 0), xmm_temp6); // store the result
+
+ __ leave();
+ __ ret(0);
+ return start;
+ }
+
/**
* Arguments:
*
@@ -3018,6 +3179,13 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
}
+ // Generate GHASH intrinsics code
+ if (UseGHASHIntrinsics) {
+ StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
+ StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
+ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+ }
+
// Safefetch stubs.
generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
&StubRoutines::_safefetch32_fault_pc,
diff --git a/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
index 1d38af799..c5811b28b 100644
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
@@ -3639,6 +3639,175 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
+
+ // byte-swap mask for two x86 longs (one 128-bit value)
+ address generate_ghash_long_swap_mask() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
+ address start = __ pc();
+ __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
+ __ emit_data64(0x0706050403020100, relocInfo::none );
+ return start;
+ }
+
+ // byte-swap mask for a 16-byte array
+ address generate_ghash_byte_swap_mask() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
+ address start = __ pc();
+ __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
+ __ emit_data64(0x0001020304050607, relocInfo::none );
+ return start;
+ }
+
+ /* Single and multi-block ghash operations */
+ address generate_ghash_processBlocks() {
+ __ align(CodeEntryAlignment);
+ Label L_ghash_loop, L_exit;
+ StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+ address start = __ pc();
+
+ const Register state = c_rarg0;
+ const Register subkeyH = c_rarg1;
+ const Register data = c_rarg2;
+ const Register blocks = c_rarg3;
+
+#ifdef _WIN64
+ const int XMM_REG_LAST = 10;
+#endif
+
+ const XMMRegister xmm_temp0 = xmm0;
+ const XMMRegister xmm_temp1 = xmm1;
+ const XMMRegister xmm_temp2 = xmm2;
+ const XMMRegister xmm_temp3 = xmm3;
+ const XMMRegister xmm_temp4 = xmm4;
+ const XMMRegister xmm_temp5 = xmm5;
+ const XMMRegister xmm_temp6 = xmm6;
+ const XMMRegister xmm_temp7 = xmm7;
+ const XMMRegister xmm_temp8 = xmm8;
+ const XMMRegister xmm_temp9 = xmm9;
+ const XMMRegister xmm_temp10 = xmm10;
+
+ __ enter();
+
+#ifdef _WIN64
+ // save the xmm registers that must be preserved (xmm6-xmm10)
+ __ subptr(rsp, -rsp_after_call_off * wordSize);
+ for (int i = 6; i <= XMM_REG_LAST; i++) {
+ __ movdqu(xmm_save(i), as_XMMRegister(i));
+ }
+#endif
+
+ __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+ __ movdqu(xmm_temp0, Address(state, 0));
+ __ pshufb(xmm_temp0, xmm_temp10);
+
+
+ __ BIND(L_ghash_loop);
+ __ movdqu(xmm_temp2, Address(data, 0));
+ __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
+
+ __ movdqu(xmm_temp1, Address(subkeyH, 0));
+ __ pshufb(xmm_temp1, xmm_temp10);
+
+ __ pxor(xmm_temp0, xmm_temp2);
+
+ //
+ // Multiply with the hash key
+ //
+ __ movdqu(xmm_temp3, xmm_temp0);
+ __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0
+ __ movdqu(xmm_temp4, xmm_temp0);
+ __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1
+
+ __ movdqu(xmm_temp5, xmm_temp0);
+ __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0
+ __ movdqu(xmm_temp6, xmm_temp0);
+ __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1
+
+ __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0
+
+ __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5
+ __ psrldq(xmm_temp4, 8); // shift xmm4 right by 64 bits
+ __ pslldq(xmm_temp5, 8); // shift xmm5 left by 64 bits
+ __ pxor(xmm_temp3, xmm_temp5);
+ __ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result
+ // of the carry-less multiplication of
+ // xmm0 by xmm1.
+
+ // We shift the result of the multiplication left by one bit to
+ // compensate for the fact that the bits are reversed.
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp8, xmm_temp6);
+ __ pslld(xmm_temp3, 1);
+ __ pslld(xmm_temp6, 1);
+ __ psrld(xmm_temp7, 31);
+ __ psrld(xmm_temp8, 31);
+ __ movdqu(xmm_temp9, xmm_temp7);
+ __ pslldq(xmm_temp8, 4);
+ __ pslldq(xmm_temp7, 4);
+ __ psrldq(xmm_temp9, 12);
+ __ por(xmm_temp3, xmm_temp7);
+ __ por(xmm_temp6, xmm_temp8);
+ __ por(xmm_temp6, xmm_temp9);
+
+ //
+ // First phase of the reduction
+ //
+ // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
+ // independently.
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp8, xmm_temp3);
+ __ movdqu(xmm_temp9, xmm_temp3);
+ __ pslld(xmm_temp7, 31); // packed left shift by 31
+ __ pslld(xmm_temp8, 30); // packed left shift by 30
+ __ pslld(xmm_temp9, 25); // packed left shift by 25
+ __ pxor(xmm_temp7, xmm_temp8); // xor the shifted versions
+ __ pxor(xmm_temp7, xmm_temp9);
+ __ movdqu(xmm_temp8, xmm_temp7);
+ __ pslldq(xmm_temp7, 12);
+ __ psrldq(xmm_temp8, 4);
+ __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete
+
+ //
+ // Second phase of the reduction
+ //
+ // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
+ // shift operations.
+ __ movdqu(xmm_temp2, xmm_temp3);
+ __ movdqu(xmm_temp4, xmm_temp3);
+ __ movdqu(xmm_temp5, xmm_temp3);
+ __ psrld(xmm_temp2, 1); // packed right shift by 1
+ __ psrld(xmm_temp4, 2); // packed right shift by 2
+ __ psrld(xmm_temp5, 7); // packed right shift by 7
+ __ pxor(xmm_temp2, xmm_temp4); // xor the shifted versions
+ __ pxor(xmm_temp2, xmm_temp5);
+ __ pxor(xmm_temp2, xmm_temp8);
+ __ pxor(xmm_temp3, xmm_temp2);
+ __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6
+
+ __ decrement(blocks);
+ __ jcc(Assembler::zero, L_exit);
+ __ movdqu(xmm_temp0, xmm_temp6);
+ __ addptr(data, 16);
+ __ jmp(L_ghash_loop);
+
+ __ BIND(L_exit);
+ __ pshufb(xmm_temp6, xmm_temp10); // Byte swap 16-byte result
+ __ movdqu(Address(state, 0), xmm_temp6); // store the result
+
+#ifdef _WIN64
+ // restore xmm regs belonging to calling function
+ for (int i = 6; i <= XMM_REG_LAST; i++) {
+ __ movdqu(as_XMMRegister(i), xmm_save(i));
+ }
+#endif
+ __ leave();
+ __ ret(0);
+ return start;
+ }
+
/**
* Arguments:
*
@@ -4077,6 +4246,13 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
}
+ // Generate GHASH intrinsics code
+ if (UseGHASHIntrinsics) {
+ StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
+ StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
+ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+ }
+
// Safefetch stubs.
generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
&StubRoutines::_safefetch32_fault_pc,
diff --git a/src/cpu/x86/vm/stubRoutines_x86.cpp b/src/cpu/x86/vm/stubRoutines_x86.cpp
index 200f2aff8..9b0d8fc75 100644
--- a/src/cpu/x86/vm/stubRoutines_x86.cpp
+++ b/src/cpu/x86/vm/stubRoutines_x86.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -33,6 +33,8 @@
address StubRoutines::x86::_verify_mxcsr_entry = NULL;
address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
+address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
+address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
uint64_t StubRoutines::x86::_crc_by128_masks[] =
{
diff --git a/src/cpu/x86/vm/stubRoutines_x86.hpp b/src/cpu/x86/vm/stubRoutines_x86.hpp
index d8e52ab3b..bb160486c 100644
--- a/src/cpu/x86/vm/stubRoutines_x86.hpp
+++ b/src/cpu/x86/vm/stubRoutines_x86.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -36,10 +36,15 @@
// masks and table for CRC32
static uint64_t _crc_by128_masks[];
static juint _crc_table[];
+ // swap mask for ghash
+ static address _ghash_long_swap_mask_addr;
+ static address _ghash_byte_swap_mask_addr;
public:
static address verify_mxcsr_entry() { return _verify_mxcsr_entry; }
static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; }
static address crc_by128_masks_addr() { return (address)_crc_by128_masks; }
+ static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
+ static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
#endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP
diff --git a/src/cpu/x86/vm/vm_version_x86.cpp b/src/cpu/x86/vm/vm_version_x86.cpp
index fd0a68d10..1f5ae757a 100644
--- a/src/cpu/x86/vm/vm_version_x86.cpp
+++ b/src/cpu/x86/vm/vm_version_x86.cpp
@@ -594,6 +594,17 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseAESIntrinsics, false);
}
+ // GHASH/GCM intrinsics
+ if (UseCLMUL && (UseSSE > 2)) {
+ if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
+ UseGHASHIntrinsics = true;
+ }
+ } else if (UseGHASHIntrinsics) {
+ if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics))
+ warning("GHASH intrinsic requires CLMUL and SSE2 instructions on this CPU");
+ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+ }
+
if (UseSHA) {
warning("SHA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseSHA, false);