author     Adam Langley <agl@chromium.org>  2016-01-13 15:00:54 -0800
committer  Adam Langley <agl@chromium.org>  2016-01-14 13:09:44 -0800
commit     4139edb02e59e7ad48e0a8f4c02e45923bc8a344 (patch)
tree       80b47f41b8e3971267452f49e48560c9c36434e2 /mac-x86_64
parent     55181dbbcdc86b9abed8bd900f1041344211663c (diff)
download   boringssl-4139edb02e59e7ad48e0a8f4c02e45923bc8a344.tar.gz
external/boringssl: sync to 7b8b9c17
This includes the following changes from BoringSSL:

7b8b9c1 Include 'asm' in the name of X25519 asm sources.
3202750 Update the fuzz tests for the server.
6544426 Fix a ** 0 mod 1 = 0 for real this time.
fe5f7c7 Only reserve EVP_MAX_MD_SIZE for the Finished, not twice of it.
0d56f88 Switch s to ssl everywhere.
974c7ba Route DHE through the SSL_ECDH abstraction as well.
4cc36ad Make it possible to tell what curve was used on the server.
4298d77 Implement draft-ietf-tls-curve25519-01 in C.
c18ef75 Allocate a NID for X25519.
3a2a480 Remove long-dead comment.
cba2b62 Implement draft-ietf-tls-curve25519-01 in Go.
ab14563 Bundle a copy of golang.org/x/crypto/curve25519 for testing.
a029ebc Switch the bundled poly1305 to relative imports.
64d9250 Completely remove P-224 from the TLS stack.
8c2b3bf Test all supported curves (including those off by default).
fc82512 Convert ssl3_send_cert_verify to CBB.
5fb18c6 Make MSVC happy.
2a0b391 Rewrite ssl3_send_server_key_exchange to use CBB.
d16bf34 Add a -lldb flag to runner.go.
af21bcf Remove other unnecessary BN_CTX allocations.
ae0eaaa Convert ssl3_send_client_key_exchange to CBB.
3ac4b3a Remove NO_ASM define that I accidently included in the previous commit.
e6c5402 Don't build X25519 asm code when NO_ASM is set.
77a173e Add x86-64 assembly for X25519.
c75c0ae Add #defines for ED25519 key and signature lengths.
48cce66 Tidy up ssl3_get_server_key_exchange slightly.
c1cc858 Check for EC_KEY_set_public_key error.
4cc671c Add CBB_reserve and CBB_did_write.
e13263d Resolve a few old TODOs.
841934f Remove stack macros for nonexistent types.
70ab223 Remove ASN1_R_MALLOC_FAILURE.
b965c63 Reject calls to X509_verify_cert that have not been reinitialised
3f5b43d Simplify RSA key exchange padding check.
3ef6085 Refuse to parse RSA pubkeys with invalid exponents.
afe57cb Add a tool to generate Ed25519 keys.
77c3c0b Enable Ed25519 when building with OPENSSL_SMALL.
9f897b2 Remove the stitched RC4-MD5 code and use the generic one.
1741a9d Save some mallocs in computing the MAC for e_tls.c.
df57163 Add RC4-SHA1 and DES-EDE3-CBC-SHA1 to bssl speed.
13414b3 Implement draft-ietf-tls-chacha20-poly1305-04.
3748990 Implement draft-ietf-tls-chacha20-poly1305-04 in Go.
2089fdd Implement RFC 7539 in Go.
86e412d Add client cert support to bssl client.
23a681b Fix build.
e320392 Rename the Go ChaCha20-Poly1305 implementation.
8ffab72 Point EVP_aead_chacha20_poly1305 at the standardized version.
fef6fb5 Fix ChaCha20-Poly1305 tests.
60a08ac Remove unreachable code to duplicate DH keys.
4ec0cce Slightly tweak some array allocations.
2936170 Fix memory leak in DSA redo case.
a01deee Make CBB_len relative to its argument.
77385bb Mark platform-specific HOST_[c2l|l2c] as (void).
6969971 Remove a dead prototype.
1b36716 Remove crypto/header_removed.h.
017231a Remove asm __asm__ define.
793c21e Make HOST_l2c return void.
0aff3ff Store the partial block as uint8_t, not uint32_t.
5a19d7d Use the straight-forward ROTATE macro.
78fefbf Reformat md32_common.h, part 2.
fea1137 Reformat md32_common.h, part 1.
871fff0 *_Update of length zero is legal.
d9f0671 Remove |need_record_splitting| from |SSL3_STATE|.
cd48038 Remove unused fields from SSL3_STATE.
7fc0100 Slightly simplify SSL3_RECORD.
ece5ba2 Reset ssl error codes.
a41280d Pull ChangeCipherSpec into the handshake state machine.
8fd5c23 Simplify fragmented HelloRequest state.
ef5dfd2 Add tests for malformed HelloRequests.
8411b24 Add tests for bad ChangeCipherSpecs.
502a843 Switch unrolled loop in BN_usub with memcpy.
c3ae38b Remove DH EVP_PKEY hooks.
7100ee9 Chromium's update.sh is dead, long live update.py
f28dd64 Fix flaky BadRSAClientKeyExchange-1 test.
4234885 Remove unused functions.
45dab25 Skip free callbacks on empty CRYPTO_EX_DATAs.
8a58933 Remove the CRYPTO_EX_new callback.
0abd6f2 Get struct timeval from sys/time.h.
1246670 Use UINT64_C in sha512.c table.
5ddffbb Make SSL_(CTX_)?set_tmp_ecdh call SSL_(CTX_)?set1_curves.
53e5c2c Remove SSL_(CTX_)?set_ecdh_callback.
756ad17 Initialize |one_index| in OAEP padding check.
1634a33 Convert rsa/padding.c to constant-time helpers.
b36a395 Add slightly better RSA key exchange tests.
0bd71eb Remove weird ret negation logic.
e9cddb8 Remove SSL_OP_LEGACY_SERVER_CONNECT.
3e052de Tighten SSL_OP_LEGACY_SERVER_CONNECT to align with RFC 5746.
03f0005 Remove SSL_OP_MICROSOFT_BIG_SSLV3_BUFFER.
ef5e515 Remove SSL_OP_TLS_D5_BUG.
c100ef4 Limit depth of ASN1 parse printing.
2205093 Add a comment in SetTestState from bssl_shim.
6ae67df Don't leak Android hacks to other build platforms.
a0ef7b0 Enforce that |EC_KEY| private key is in [0, group->order).
533a273 Add |EC_METHOD| method for verifying public key order.
a3d9de0 Add |EC_GROUP_get0_order| to replace |EC_GROUP_get_order|.
8847856 Include <sys/time.h> in packeted_bio.h for 'timeval'
dca63cf Don't abort in |init_once| if |fcntl| returns ENOSYS
afd565f Add defines for SRTP profiles using GCM ciphers from RFC 7714.
902870e Gate SHA_CTX compatibility on !WINDOWS.
34aa55c Support the SHA_CTX hack without ANDROID.
6d9e5a7 Re-apply 75b833cc819a9d189adb0fdd56327bee600ff9e9
28243c0 Add PSS parameter check.
e701f16 bn/asm/x86_64-mont5.pl: fix carry propagating bug (CVE-2015-3193).
cb85298 Fix leak with ASN.1 combine.
c4f25ce Work around yaSSL bug.
c5eb467 Remove dead code in p256-x86_64.
758d127 Add get0 getters for EVP_PKEY.
fde89b4 avoid clashes with libc's 'open' in e_chacha20poly1305.c
60a45aa Remove reference to removed |RSA_FLAG_NO_CONSTTIME| flag.
81edc9b Do away with BN_LLONG in favor of BN_ULLONG.
e8fe07f Fix AES XTS mode key size.
93a5b44 Make CRYPTO_library_init use a CRYPTO_once_t.
bf76218 Remove the |ri| field of |BN_MONT_CTX|.
596ab10 s/BN_BITS/BN_BITS2/ in |BN_mod_inverse_ex|; remove |BN_BITS| & |BN_MASK|.
7af36e1 Share common definitions of |TOBN| and |BIGNUM_STATIC|.
ff2df33 Reformat the cipher suite table.
9f2e277 Remove strength_bits.
d6e9eec Remove algo_strength.
dcb6ef0 Remove algorithm_ssl.
d28f59c Switch the keylog BIO to a callback.
fba735c Register the *25519 tests as dependencies of all_tests.
f3376ac Remove |EC_POINTs_mul| & simplify p256-x86_64.
301efc8 Fix error handling in |p256-x86_64|.
e2136d9 Remove |EC_GROUP_precompute_mult| and |EC_KEY_precompute_mult|.
9b26297 Make |EC_GROUP_precompute_mult|/|EC_KEY_precompute_mult| no-ops.
5058d79 Remove p224-64 and p256-64 dead code for non-default generators.
b1b6229 Add NEON implementation of curve25519.
9e65d48 Allow |CRYPTO_is_NEON_capable| to be known at compile time, if possible.
3ac32b1 Fix curve25519 code for MSVC.
4fb0dc4 Add X25519 and Ed25519 support.
c324f17 Make sure pthread_once() succeeds.
9361243 Don't include <alloca.h>, it's no longer needed.
b00061c Add SSL_CIPHER_is_AES[128|256]CBC.
3a59611 size_t SSL*_use_*_ASN1.
b324159 Fix ssl3_send_server_key_exchange error path.
f584a5a Reset epoch state in one place.
2077cf9 Use UINT64_C instead of OPENSSL_U64.
af07365 Check for overflow when parsing a CBS with d2i_*.
780cd92 modes/asm/ghash-armv4.pl: extend Apple fix to all clang cases.
f9c77de Drop CBB allocation failure test.
a33915d Have |CBB_init| zero the |CBB| before any possible failures.
c5c85de Make RAND_seed read a byte of random data.
d9e2702 Don't encode or decode ∞.
e7806fd Remove point-on-curve check from |ec_GFp_simple_oct2point|.
20c3731 Become partially -Wmissing-variable-declarations-clean.
7308aaa Remove `EC_GFp_simple_method` (dead code).
f872951 Fix null pointer dereference when using "simple" EC.
8bde5d2 Remove the unused |Ni| member of |BN_MONT_CTX|.
ce7ae6f Enable AVX code for SHA-*.
9f1f04f Remove nistz256 dead code for non-default generators.
d7421eb Remove condition which always evaluates to true (size_t >= 0).
d386394 Test for underflow before subtraction.
ef14b2d Remove stl_compat.h.
cd24a39 Limit DHE groups to 4096-bit.
99fdfb9 Move curve check out of tls12_check_peer_sigalg.

Change-Id: Id2d7110569d250b1bae8f8ce7d4421a92f581a31
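Several of the commits above land the new X25519/Ed25519 code (4298d77, 77a173e, 4fb0dc4) whose generated assembly appears in the per-platform directories of this sync. As a rough illustration of the API surface they add — a minimal sketch, assuming the <openssl/curve25519.h> header and the X25519_keypair()/X25519() entry points from this revision, not code that is part of this diff — an X25519 key agreement looks like:

#include <stdint.h>
#include <string.h>
#include <openssl/curve25519.h>

/* Illustrative X25519 key agreement between two parties. */
int example_x25519_agreement(void) {
  uint8_t alice_pub[32], alice_priv[32];
  uint8_t bob_pub[32], bob_priv[32];
  X25519_keypair(alice_pub, alice_priv);
  X25519_keypair(bob_pub, bob_priv);

  uint8_t alice_shared[32], bob_shared[32];
  /* X25519 returns zero on error, e.g. when the shared key would be all zeros. */
  if (!X25519(alice_shared, alice_priv, bob_pub) ||
      !X25519(bob_shared, bob_priv, alice_pub)) {
    return 0;
  }
  /* Both sides should derive the same 32-byte shared secret. */
  return memcmp(alice_shared, bob_shared, 32) == 0;
}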
Diffstat (limited to 'mac-x86_64')
-rw-r--r--  mac-x86_64/crypto/bn/x86_64-mont5.S        9
-rw-r--r--  mac-x86_64/crypto/ec/p256-x86_64-asm.S   243
-rw-r--r--  mac-x86_64/crypto/rc4/rc4-md5-x86_64.S  1262
-rw-r--r--  mac-x86_64/crypto/sha/sha1-x86_64.S     1121
-rw-r--r--  mac-x86_64/crypto/sha/sha256-x86_64.S   1062
-rw-r--r--  mac-x86_64/crypto/sha/sha512-x86_64.S   2241
6 files changed, 4434 insertions, 1504 deletions
diff --git a/mac-x86_64/crypto/bn/x86_64-mont5.S b/mac-x86_64/crypto/bn/x86_64-mont5.S
index 2e8f469c..461bfb22 100644
--- a/mac-x86_64/crypto/bn/x86_64-mont5.S
+++ b/mac-x86_64/crypto/bn/x86_64-mont5.S
@@ -1560,6 +1560,15 @@ L$8x_tail:
.p2align 5
L$8x_tail_done:
addq (%rdx),%r8
+ adcq $0,%r9
+ adcq $0,%r10
+ adcq $0,%r11
+ adcq $0,%r12
+ adcq $0,%r13
+ adcq $0,%r14
+ adcq $0,%r15
+
+
xorq %rax,%rax
negq %rsi
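The hunk above is the mac-x86_64 rendering of upstream e701f16 (the bn/asm/x86_64-mont5.pl carry-propagation fix, CVE-2015-3193): at the 8x tail, the carry word added into %r8 is now rippled through %r9–%r15 by the added adcq $0 chain. A minimal C sketch of that ripple, using a hypothetical limbs[] array rather than the real Montgomery-reduction state, and GCC/Clang's unsigned __int128:

#include <stdint.h>
#include <stddef.h>

/* Add a carry word into the lowest limb and propagate it upward,
 * mirroring the addq (%rdx),%r8 / adcq $0,... chain in the hunk above.
 * Purely illustrative; the shipped code is the generated assembly. */
static void add_and_ripple_carry(uint64_t *limbs, size_t n, uint64_t word) {
  unsigned __int128 acc = (unsigned __int128)limbs[0] + word;  /* addq (%rdx),%r8 */
  limbs[0] = (uint64_t)acc;
  uint64_t carry = (uint64_t)(acc >> 64);
  for (size_t i = 1; i < n; i++) {             /* adcq $0,%r9 ... adcq $0,%r15 */
    acc = (unsigned __int128)limbs[i] + carry;
    limbs[i] = (uint64_t)acc;
    carry = (uint64_t)(acc >> 64);
  }
  /* Without the propagation, a carry out of limb 0 was silently dropped. */
}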
diff --git a/mac-x86_64/crypto/ec/p256-x86_64-asm.S b/mac-x86_64/crypto/ec/p256-x86_64-asm.S
index 43f7dffb..bed11305 100644
--- a/mac-x86_64/crypto/ec/p256-x86_64-asm.S
+++ b/mac-x86_64/crypto/ec/p256-x86_64-asm.S
@@ -7,10 +7,6 @@
L$poly:
.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
-
-L$RR:
-.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
-
L$One:
.long 1,1,1,1,1,1,1,1
L$Two:
@@ -20,11 +16,9 @@ L$Three:
L$ONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
-.globl _ecp_nistz256_mul_by_2
-.private_extern _ecp_nistz256_mul_by_2
.p2align 6
-_ecp_nistz256_mul_by_2:
+ecp_nistz256_mul_by_2:
pushq %r12
pushq %r13
@@ -65,228 +59,6 @@ _ecp_nistz256_mul_by_2:
-.globl _ecp_nistz256_div_by_2
-.private_extern _ecp_nistz256_div_by_2
-
-.p2align 5
-_ecp_nistz256_div_by_2:
- pushq %r12
- pushq %r13
-
- movq 0(%rsi),%r8
- movq 8(%rsi),%r9
- movq 16(%rsi),%r10
- movq %r8,%rax
- movq 24(%rsi),%r11
- leaq L$poly(%rip),%rsi
-
- movq %r9,%rdx
- xorq %r13,%r13
- addq 0(%rsi),%r8
- movq %r10,%rcx
- adcq 8(%rsi),%r9
- adcq 16(%rsi),%r10
- movq %r11,%r12
- adcq 24(%rsi),%r11
- adcq $0,%r13
- xorq %rsi,%rsi
- testq $1,%rax
-
- cmovzq %rax,%r8
- cmovzq %rdx,%r9
- cmovzq %rcx,%r10
- cmovzq %r12,%r11
- cmovzq %rsi,%r13
-
- movq %r9,%rax
- shrq $1,%r8
- shlq $63,%rax
- movq %r10,%rdx
- shrq $1,%r9
- orq %rax,%r8
- shlq $63,%rdx
- movq %r11,%rcx
- shrq $1,%r10
- orq %rdx,%r9
- shlq $63,%rcx
- shrq $1,%r11
- shlq $63,%r13
- orq %rcx,%r10
- orq %r13,%r11
-
- movq %r8,0(%rdi)
- movq %r9,8(%rdi)
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
-
- popq %r13
- popq %r12
- .byte 0xf3,0xc3
-
-
-
-
-.globl _ecp_nistz256_mul_by_3
-.private_extern _ecp_nistz256_mul_by_3
-
-.p2align 5
-_ecp_nistz256_mul_by_3:
- pushq %r12
- pushq %r13
-
- movq 0(%rsi),%r8
- xorq %r13,%r13
- movq 8(%rsi),%r9
- addq %r8,%r8
- movq 16(%rsi),%r10
- adcq %r9,%r9
- movq 24(%rsi),%r11
- movq %r8,%rax
- adcq %r10,%r10
- adcq %r11,%r11
- movq %r9,%rdx
- adcq $0,%r13
-
- subq $-1,%r8
- movq %r10,%rcx
- sbbq L$poly+8(%rip),%r9
- sbbq $0,%r10
- movq %r11,%r12
- sbbq L$poly+24(%rip),%r11
- testq %r13,%r13
-
- cmovzq %rax,%r8
- cmovzq %rdx,%r9
- cmovzq %rcx,%r10
- cmovzq %r12,%r11
-
- xorq %r13,%r13
- addq 0(%rsi),%r8
- adcq 8(%rsi),%r9
- movq %r8,%rax
- adcq 16(%rsi),%r10
- adcq 24(%rsi),%r11
- movq %r9,%rdx
- adcq $0,%r13
-
- subq $-1,%r8
- movq %r10,%rcx
- sbbq L$poly+8(%rip),%r9
- sbbq $0,%r10
- movq %r11,%r12
- sbbq L$poly+24(%rip),%r11
- testq %r13,%r13
-
- cmovzq %rax,%r8
- cmovzq %rdx,%r9
- movq %r8,0(%rdi)
- cmovzq %rcx,%r10
- movq %r9,8(%rdi)
- cmovzq %r12,%r11
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
-
- popq %r13
- popq %r12
- .byte 0xf3,0xc3
-
-
-
-
-.globl _ecp_nistz256_add
-.private_extern _ecp_nistz256_add
-
-.p2align 5
-_ecp_nistz256_add:
- pushq %r12
- pushq %r13
-
- movq 0(%rsi),%r8
- xorq %r13,%r13
- movq 8(%rsi),%r9
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
- leaq L$poly(%rip),%rsi
-
- addq 0(%rdx),%r8
- adcq 8(%rdx),%r9
- movq %r8,%rax
- adcq 16(%rdx),%r10
- adcq 24(%rdx),%r11
- movq %r9,%rdx
- adcq $0,%r13
-
- subq 0(%rsi),%r8
- movq %r10,%rcx
- sbbq 8(%rsi),%r9
- sbbq 16(%rsi),%r10
- movq %r11,%r12
- sbbq 24(%rsi),%r11
- testq %r13,%r13
-
- cmovzq %rax,%r8
- cmovzq %rdx,%r9
- movq %r8,0(%rdi)
- cmovzq %rcx,%r10
- movq %r9,8(%rdi)
- cmovzq %r12,%r11
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
-
- popq %r13
- popq %r12
- .byte 0xf3,0xc3
-
-
-
-
-.globl _ecp_nistz256_sub
-.private_extern _ecp_nistz256_sub
-
-.p2align 5
-_ecp_nistz256_sub:
- pushq %r12
- pushq %r13
-
- movq 0(%rsi),%r8
- xorq %r13,%r13
- movq 8(%rsi),%r9
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
- leaq L$poly(%rip),%rsi
-
- subq 0(%rdx),%r8
- sbbq 8(%rdx),%r9
- movq %r8,%rax
- sbbq 16(%rdx),%r10
- sbbq 24(%rdx),%r11
- movq %r9,%rdx
- sbbq $0,%r13
-
- addq 0(%rsi),%r8
- movq %r10,%rcx
- adcq 8(%rsi),%r9
- adcq 16(%rsi),%r10
- movq %r11,%r12
- adcq 24(%rsi),%r11
- testq %r13,%r13
-
- cmovzq %rax,%r8
- cmovzq %rdx,%r9
- movq %r8,0(%rdi)
- cmovzq %rcx,%r10
- movq %r9,8(%rdi)
- cmovzq %r12,%r11
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
-
- popq %r13
- popq %r12
- .byte 0xf3,0xc3
-
-
-
-
.globl _ecp_nistz256_neg
.private_extern _ecp_nistz256_neg
@@ -335,19 +107,6 @@ _ecp_nistz256_neg:
-.globl _ecp_nistz256_to_mont
-.private_extern _ecp_nistz256_to_mont
-
-.p2align 5
-_ecp_nistz256_to_mont:
- leaq L$RR(%rip),%rdx
- jmp L$mul_mont
-
-
-
-
-
-
.globl _ecp_nistz256_mul_mont
diff --git a/mac-x86_64/crypto/rc4/rc4-md5-x86_64.S b/mac-x86_64/crypto/rc4/rc4-md5-x86_64.S
deleted file mode 100644
index 31ee7d26..00000000
--- a/mac-x86_64/crypto/rc4/rc4-md5-x86_64.S
+++ /dev/null
@@ -1,1262 +0,0 @@
-#if defined(__x86_64__)
-.text
-.p2align 4
-
-.globl _rc4_md5_enc
-.private_extern _rc4_md5_enc
-
-_rc4_md5_enc:
- cmpq $0,%r9
- je L$abort
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $40,%rsp
-L$body:
- movq %rcx,%r11
- movq %r9,%r12
- movq %rsi,%r13
- movq %rdx,%r14
- movq %r8,%r15
- xorq %rbp,%rbp
- xorq %rcx,%rcx
-
- leaq 8(%rdi),%rdi
- movb -8(%rdi),%bpl
- movb -4(%rdi),%cl
-
- incb %bpl
- subq %r13,%r14
- movl (%rdi,%rbp,4),%eax
- addb %al,%cl
- leaq (%rdi,%rbp,4),%rsi
- shlq $6,%r12
- addq %r15,%r12
- movq %r12,16(%rsp)
-
- movq %r11,24(%rsp)
- movl 0(%r11),%r8d
- movl 4(%r11),%r9d
- movl 8(%r11),%r10d
- movl 12(%r11),%r11d
- jmp L$oop
-
-.p2align 4
-L$oop:
- movl %r8d,0(%rsp)
- movl %r9d,4(%rsp)
- movl %r10d,8(%rsp)
- movl %r11d,%r12d
- movl %r11d,12(%rsp)
- pxor %xmm0,%xmm0
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r9d,%r12d
- addl 0(%r15),%r8d
- addb %dl,%al
- movl 4(%rsi),%ebx
- addl $3614090360,%r8d
- xorl %r11d,%r12d
- movzbl %al,%eax
- movl %edx,0(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $7,%r8d
- movl %r10d,%r12d
- movd (%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- pxor %xmm1,%xmm1
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r8d,%r12d
- addl 4(%r15),%r11d
- addb %dl,%bl
- movl 8(%rsi),%eax
- addl $3905402710,%r11d
- xorl %r10d,%r12d
- movzbl %bl,%ebx
- movl %edx,4(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $12,%r11d
- movl %r9d,%r12d
- movd (%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r11d,%r12d
- addl 8(%r15),%r10d
- addb %dl,%al
- movl 12(%rsi),%ebx
- addl $606105819,%r10d
- xorl %r9d,%r12d
- movzbl %al,%eax
- movl %edx,8(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $17,%r10d
- movl %r8d,%r12d
- pinsrw $1,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r10d,%r12d
- addl 12(%r15),%r9d
- addb %dl,%bl
- movl 16(%rsi),%eax
- addl $3250441966,%r9d
- xorl %r8d,%r12d
- movzbl %bl,%ebx
- movl %edx,12(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $22,%r9d
- movl %r11d,%r12d
- pinsrw $1,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r9d,%r12d
- addl 16(%r15),%r8d
- addb %dl,%al
- movl 20(%rsi),%ebx
- addl $4118548399,%r8d
- xorl %r11d,%r12d
- movzbl %al,%eax
- movl %edx,16(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $7,%r8d
- movl %r10d,%r12d
- pinsrw $2,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r8d,%r12d
- addl 20(%r15),%r11d
- addb %dl,%bl
- movl 24(%rsi),%eax
- addl $1200080426,%r11d
- xorl %r10d,%r12d
- movzbl %bl,%ebx
- movl %edx,20(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $12,%r11d
- movl %r9d,%r12d
- pinsrw $2,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r11d,%r12d
- addl 24(%r15),%r10d
- addb %dl,%al
- movl 28(%rsi),%ebx
- addl $2821735955,%r10d
- xorl %r9d,%r12d
- movzbl %al,%eax
- movl %edx,24(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $17,%r10d
- movl %r8d,%r12d
- pinsrw $3,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r10d,%r12d
- addl 28(%r15),%r9d
- addb %dl,%bl
- movl 32(%rsi),%eax
- addl $4249261313,%r9d
- xorl %r8d,%r12d
- movzbl %bl,%ebx
- movl %edx,28(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $22,%r9d
- movl %r11d,%r12d
- pinsrw $3,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r9d,%r12d
- addl 32(%r15),%r8d
- addb %dl,%al
- movl 36(%rsi),%ebx
- addl $1770035416,%r8d
- xorl %r11d,%r12d
- movzbl %al,%eax
- movl %edx,32(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $7,%r8d
- movl %r10d,%r12d
- pinsrw $4,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r8d,%r12d
- addl 36(%r15),%r11d
- addb %dl,%bl
- movl 40(%rsi),%eax
- addl $2336552879,%r11d
- xorl %r10d,%r12d
- movzbl %bl,%ebx
- movl %edx,36(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $12,%r11d
- movl %r9d,%r12d
- pinsrw $4,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r11d,%r12d
- addl 40(%r15),%r10d
- addb %dl,%al
- movl 44(%rsi),%ebx
- addl $4294925233,%r10d
- xorl %r9d,%r12d
- movzbl %al,%eax
- movl %edx,40(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $17,%r10d
- movl %r8d,%r12d
- pinsrw $5,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r10d,%r12d
- addl 44(%r15),%r9d
- addb %dl,%bl
- movl 48(%rsi),%eax
- addl $2304563134,%r9d
- xorl %r8d,%r12d
- movzbl %bl,%ebx
- movl %edx,44(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $22,%r9d
- movl %r11d,%r12d
- pinsrw $5,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r9d,%r12d
- addl 48(%r15),%r8d
- addb %dl,%al
- movl 52(%rsi),%ebx
- addl $1804603682,%r8d
- xorl %r11d,%r12d
- movzbl %al,%eax
- movl %edx,48(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $7,%r8d
- movl %r10d,%r12d
- pinsrw $6,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r8d,%r12d
- addl 52(%r15),%r11d
- addb %dl,%bl
- movl 56(%rsi),%eax
- addl $4254626195,%r11d
- xorl %r10d,%r12d
- movzbl %bl,%ebx
- movl %edx,52(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $12,%r11d
- movl %r9d,%r12d
- pinsrw $6,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r11d,%r12d
- addl 56(%r15),%r10d
- addb %dl,%al
- movl 60(%rsi),%ebx
- addl $2792965006,%r10d
- xorl %r9d,%r12d
- movzbl %al,%eax
- movl %edx,56(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $17,%r10d
- movl %r8d,%r12d
- pinsrw $7,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movdqu (%r13),%xmm2
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r10d,%r12d
- addl 60(%r15),%r9d
- addb %dl,%bl
- movl 64(%rsi),%eax
- addl $1236535329,%r9d
- xorl %r8d,%r12d
- movzbl %bl,%ebx
- movl %edx,60(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $22,%r9d
- movl %r10d,%r12d
- pinsrw $7,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- psllq $8,%xmm1
- pxor %xmm0,%xmm2
- pxor %xmm1,%xmm2
- pxor %xmm0,%xmm0
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r11d,%r12d
- addl 4(%r15),%r8d
- addb %dl,%al
- movl 68(%rsi),%ebx
- addl $4129170786,%r8d
- xorl %r10d,%r12d
- movzbl %al,%eax
- movl %edx,64(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $5,%r8d
- movl %r9d,%r12d
- movd (%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- pxor %xmm1,%xmm1
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r10d,%r12d
- addl 24(%r15),%r11d
- addb %dl,%bl
- movl 72(%rsi),%eax
- addl $3225465664,%r11d
- xorl %r9d,%r12d
- movzbl %bl,%ebx
- movl %edx,68(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $9,%r11d
- movl %r8d,%r12d
- movd (%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r9d,%r12d
- addl 44(%r15),%r10d
- addb %dl,%al
- movl 76(%rsi),%ebx
- addl $643717713,%r10d
- xorl %r8d,%r12d
- movzbl %al,%eax
- movl %edx,72(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $14,%r10d
- movl %r11d,%r12d
- pinsrw $1,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r8d,%r12d
- addl 0(%r15),%r9d
- addb %dl,%bl
- movl 80(%rsi),%eax
- addl $3921069994,%r9d
- xorl %r11d,%r12d
- movzbl %bl,%ebx
- movl %edx,76(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $20,%r9d
- movl %r10d,%r12d
- pinsrw $1,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r11d,%r12d
- addl 20(%r15),%r8d
- addb %dl,%al
- movl 84(%rsi),%ebx
- addl $3593408605,%r8d
- xorl %r10d,%r12d
- movzbl %al,%eax
- movl %edx,80(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $5,%r8d
- movl %r9d,%r12d
- pinsrw $2,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r10d,%r12d
- addl 40(%r15),%r11d
- addb %dl,%bl
- movl 88(%rsi),%eax
- addl $38016083,%r11d
- xorl %r9d,%r12d
- movzbl %bl,%ebx
- movl %edx,84(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $9,%r11d
- movl %r8d,%r12d
- pinsrw $2,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r9d,%r12d
- addl 60(%r15),%r10d
- addb %dl,%al
- movl 92(%rsi),%ebx
- addl $3634488961,%r10d
- xorl %r8d,%r12d
- movzbl %al,%eax
- movl %edx,88(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $14,%r10d
- movl %r11d,%r12d
- pinsrw $3,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r8d,%r12d
- addl 16(%r15),%r9d
- addb %dl,%bl
- movl 96(%rsi),%eax
- addl $3889429448,%r9d
- xorl %r11d,%r12d
- movzbl %bl,%ebx
- movl %edx,92(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $20,%r9d
- movl %r10d,%r12d
- pinsrw $3,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r11d,%r12d
- addl 36(%r15),%r8d
- addb %dl,%al
- movl 100(%rsi),%ebx
- addl $568446438,%r8d
- xorl %r10d,%r12d
- movzbl %al,%eax
- movl %edx,96(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $5,%r8d
- movl %r9d,%r12d
- pinsrw $4,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r10d,%r12d
- addl 56(%r15),%r11d
- addb %dl,%bl
- movl 104(%rsi),%eax
- addl $3275163606,%r11d
- xorl %r9d,%r12d
- movzbl %bl,%ebx
- movl %edx,100(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $9,%r11d
- movl %r8d,%r12d
- pinsrw $4,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r9d,%r12d
- addl 12(%r15),%r10d
- addb %dl,%al
- movl 108(%rsi),%ebx
- addl $4107603335,%r10d
- xorl %r8d,%r12d
- movzbl %al,%eax
- movl %edx,104(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $14,%r10d
- movl %r11d,%r12d
- pinsrw $5,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r8d,%r12d
- addl 32(%r15),%r9d
- addb %dl,%bl
- movl 112(%rsi),%eax
- addl $1163531501,%r9d
- xorl %r11d,%r12d
- movzbl %bl,%ebx
- movl %edx,108(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $20,%r9d
- movl %r10d,%r12d
- pinsrw $5,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r11d,%r12d
- addl 52(%r15),%r8d
- addb %dl,%al
- movl 116(%rsi),%ebx
- addl $2850285829,%r8d
- xorl %r10d,%r12d
- movzbl %al,%eax
- movl %edx,112(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $5,%r8d
- movl %r9d,%r12d
- pinsrw $6,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r10d,%r12d
- addl 8(%r15),%r11d
- addb %dl,%bl
- movl 120(%rsi),%eax
- addl $4243563512,%r11d
- xorl %r9d,%r12d
- movzbl %bl,%ebx
- movl %edx,116(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $9,%r11d
- movl %r8d,%r12d
- pinsrw $6,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %eax,(%rdi,%rcx,4)
- andl %r9d,%r12d
- addl 28(%r15),%r10d
- addb %dl,%al
- movl 124(%rsi),%ebx
- addl $1735328473,%r10d
- xorl %r8d,%r12d
- movzbl %al,%eax
- movl %edx,120(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $14,%r10d
- movl %r11d,%r12d
- pinsrw $7,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movdqu 16(%r13),%xmm3
- addb $32,%bpl
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- andl %r8d,%r12d
- addl 48(%r15),%r9d
- addb %dl,%bl
- movl 0(%rdi,%rbp,4),%eax
- addl $2368359562,%r9d
- xorl %r11d,%r12d
- movzbl %bl,%ebx
- movl %edx,124(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $20,%r9d
- movl %r11d,%r12d
- pinsrw $7,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movq %rcx,%rsi
- xorq %rcx,%rcx
- movb %sil,%cl
- leaq (%rdi,%rbp,4),%rsi
- psllq $8,%xmm1
- pxor %xmm0,%xmm3
- pxor %xmm1,%xmm3
- pxor %xmm0,%xmm0
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %eax,(%rdi,%rcx,4)
- xorl %r9d,%r12d
- addl 20(%r15),%r8d
- addb %dl,%al
- movl 4(%rsi),%ebx
- addl $4294588738,%r8d
- movzbl %al,%eax
- addl %r12d,%r8d
- movl %edx,0(%rsi)
- addb %bl,%cl
- roll $4,%r8d
- movl %r10d,%r12d
- movd (%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- pxor %xmm1,%xmm1
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- xorl %r8d,%r12d
- addl 32(%r15),%r11d
- addb %dl,%bl
- movl 8(%rsi),%eax
- addl $2272392833,%r11d
- movzbl %bl,%ebx
- addl %r12d,%r11d
- movl %edx,4(%rsi)
- addb %al,%cl
- roll $11,%r11d
- movl %r9d,%r12d
- movd (%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %eax,(%rdi,%rcx,4)
- xorl %r11d,%r12d
- addl 44(%r15),%r10d
- addb %dl,%al
- movl 12(%rsi),%ebx
- addl $1839030562,%r10d
- movzbl %al,%eax
- addl %r12d,%r10d
- movl %edx,8(%rsi)
- addb %bl,%cl
- roll $16,%r10d
- movl %r8d,%r12d
- pinsrw $1,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- xorl %r10d,%r12d
- addl 56(%r15),%r9d
- addb %dl,%bl
- movl 16(%rsi),%eax
- addl $4259657740,%r9d
- movzbl %bl,%ebx
- addl %r12d,%r9d
- movl %edx,12(%rsi)
- addb %al,%cl
- roll $23,%r9d
- movl %r11d,%r12d
- pinsrw $1,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %eax,(%rdi,%rcx,4)
- xorl %r9d,%r12d
- addl 4(%r15),%r8d
- addb %dl,%al
- movl 20(%rsi),%ebx
- addl $2763975236,%r8d
- movzbl %al,%eax
- addl %r12d,%r8d
- movl %edx,16(%rsi)
- addb %bl,%cl
- roll $4,%r8d
- movl %r10d,%r12d
- pinsrw $2,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- xorl %r8d,%r12d
- addl 16(%r15),%r11d
- addb %dl,%bl
- movl 24(%rsi),%eax
- addl $1272893353,%r11d
- movzbl %bl,%ebx
- addl %r12d,%r11d
- movl %edx,20(%rsi)
- addb %al,%cl
- roll $11,%r11d
- movl %r9d,%r12d
- pinsrw $2,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %eax,(%rdi,%rcx,4)
- xorl %r11d,%r12d
- addl 28(%r15),%r10d
- addb %dl,%al
- movl 28(%rsi),%ebx
- addl $4139469664,%r10d
- movzbl %al,%eax
- addl %r12d,%r10d
- movl %edx,24(%rsi)
- addb %bl,%cl
- roll $16,%r10d
- movl %r8d,%r12d
- pinsrw $3,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- xorl %r10d,%r12d
- addl 40(%r15),%r9d
- addb %dl,%bl
- movl 32(%rsi),%eax
- addl $3200236656,%r9d
- movzbl %bl,%ebx
- addl %r12d,%r9d
- movl %edx,28(%rsi)
- addb %al,%cl
- roll $23,%r9d
- movl %r11d,%r12d
- pinsrw $3,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %eax,(%rdi,%rcx,4)
- xorl %r9d,%r12d
- addl 52(%r15),%r8d
- addb %dl,%al
- movl 36(%rsi),%ebx
- addl $681279174,%r8d
- movzbl %al,%eax
- addl %r12d,%r8d
- movl %edx,32(%rsi)
- addb %bl,%cl
- roll $4,%r8d
- movl %r10d,%r12d
- pinsrw $4,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- xorl %r8d,%r12d
- addl 0(%r15),%r11d
- addb %dl,%bl
- movl 40(%rsi),%eax
- addl $3936430074,%r11d
- movzbl %bl,%ebx
- addl %r12d,%r11d
- movl %edx,36(%rsi)
- addb %al,%cl
- roll $11,%r11d
- movl %r9d,%r12d
- pinsrw $4,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %eax,(%rdi,%rcx,4)
- xorl %r11d,%r12d
- addl 12(%r15),%r10d
- addb %dl,%al
- movl 44(%rsi),%ebx
- addl $3572445317,%r10d
- movzbl %al,%eax
- addl %r12d,%r10d
- movl %edx,40(%rsi)
- addb %bl,%cl
- roll $16,%r10d
- movl %r8d,%r12d
- pinsrw $5,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- xorl %r10d,%r12d
- addl 24(%r15),%r9d
- addb %dl,%bl
- movl 48(%rsi),%eax
- addl $76029189,%r9d
- movzbl %bl,%ebx
- addl %r12d,%r9d
- movl %edx,44(%rsi)
- addb %al,%cl
- roll $23,%r9d
- movl %r11d,%r12d
- pinsrw $5,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %eax,(%rdi,%rcx,4)
- xorl %r9d,%r12d
- addl 36(%r15),%r8d
- addb %dl,%al
- movl 52(%rsi),%ebx
- addl $3654602809,%r8d
- movzbl %al,%eax
- addl %r12d,%r8d
- movl %edx,48(%rsi)
- addb %bl,%cl
- roll $4,%r8d
- movl %r10d,%r12d
- pinsrw $6,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- xorl %r8d,%r12d
- addl 48(%r15),%r11d
- addb %dl,%bl
- movl 56(%rsi),%eax
- addl $3873151461,%r11d
- movzbl %bl,%ebx
- addl %r12d,%r11d
- movl %edx,52(%rsi)
- addb %al,%cl
- roll $11,%r11d
- movl %r9d,%r12d
- pinsrw $6,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %eax,(%rdi,%rcx,4)
- xorl %r11d,%r12d
- addl 60(%r15),%r10d
- addb %dl,%al
- movl 60(%rsi),%ebx
- addl $530742520,%r10d
- movzbl %al,%eax
- addl %r12d,%r10d
- movl %edx,56(%rsi)
- addb %bl,%cl
- roll $16,%r10d
- movl %r8d,%r12d
- pinsrw $7,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movdqu 32(%r13),%xmm4
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- xorl %r10d,%r12d
- addl 8(%r15),%r9d
- addb %dl,%bl
- movl 64(%rsi),%eax
- addl $3299628645,%r9d
- movzbl %bl,%ebx
- addl %r12d,%r9d
- movl %edx,60(%rsi)
- addb %al,%cl
- roll $23,%r9d
- movl $-1,%r12d
- pinsrw $7,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- psllq $8,%xmm1
- pxor %xmm0,%xmm4
- pxor %xmm1,%xmm4
- pxor %xmm0,%xmm0
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %eax,(%rdi,%rcx,4)
- orl %r9d,%r12d
- addl 0(%r15),%r8d
- addb %dl,%al
- movl 68(%rsi),%ebx
- addl $4096336452,%r8d
- movzbl %al,%eax
- xorl %r10d,%r12d
- movl %edx,64(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $6,%r8d
- movl $-1,%r12d
- movd (%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- pxor %xmm1,%xmm1
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- orl %r8d,%r12d
- addl 28(%r15),%r11d
- addb %dl,%bl
- movl 72(%rsi),%eax
- addl $1126891415,%r11d
- movzbl %bl,%ebx
- xorl %r9d,%r12d
- movl %edx,68(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $10,%r11d
- movl $-1,%r12d
- movd (%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %eax,(%rdi,%rcx,4)
- orl %r11d,%r12d
- addl 56(%r15),%r10d
- addb %dl,%al
- movl 76(%rsi),%ebx
- addl $2878612391,%r10d
- movzbl %al,%eax
- xorl %r8d,%r12d
- movl %edx,72(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $15,%r10d
- movl $-1,%r12d
- pinsrw $1,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- orl %r10d,%r12d
- addl 20(%r15),%r9d
- addb %dl,%bl
- movl 80(%rsi),%eax
- addl $4237533241,%r9d
- movzbl %bl,%ebx
- xorl %r11d,%r12d
- movl %edx,76(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $21,%r9d
- movl $-1,%r12d
- pinsrw $1,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %eax,(%rdi,%rcx,4)
- orl %r9d,%r12d
- addl 48(%r15),%r8d
- addb %dl,%al
- movl 84(%rsi),%ebx
- addl $1700485571,%r8d
- movzbl %al,%eax
- xorl %r10d,%r12d
- movl %edx,80(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $6,%r8d
- movl $-1,%r12d
- pinsrw $2,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- orl %r8d,%r12d
- addl 12(%r15),%r11d
- addb %dl,%bl
- movl 88(%rsi),%eax
- addl $2399980690,%r11d
- movzbl %bl,%ebx
- xorl %r9d,%r12d
- movl %edx,84(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $10,%r11d
- movl $-1,%r12d
- pinsrw $2,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %eax,(%rdi,%rcx,4)
- orl %r11d,%r12d
- addl 40(%r15),%r10d
- addb %dl,%al
- movl 92(%rsi),%ebx
- addl $4293915773,%r10d
- movzbl %al,%eax
- xorl %r8d,%r12d
- movl %edx,88(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $15,%r10d
- movl $-1,%r12d
- pinsrw $3,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- orl %r10d,%r12d
- addl 4(%r15),%r9d
- addb %dl,%bl
- movl 96(%rsi),%eax
- addl $2240044497,%r9d
- movzbl %bl,%ebx
- xorl %r11d,%r12d
- movl %edx,92(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $21,%r9d
- movl $-1,%r12d
- pinsrw $3,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %eax,(%rdi,%rcx,4)
- orl %r9d,%r12d
- addl 32(%r15),%r8d
- addb %dl,%al
- movl 100(%rsi),%ebx
- addl $1873313359,%r8d
- movzbl %al,%eax
- xorl %r10d,%r12d
- movl %edx,96(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $6,%r8d
- movl $-1,%r12d
- pinsrw $4,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- orl %r8d,%r12d
- addl 60(%r15),%r11d
- addb %dl,%bl
- movl 104(%rsi),%eax
- addl $4264355552,%r11d
- movzbl %bl,%ebx
- xorl %r9d,%r12d
- movl %edx,100(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $10,%r11d
- movl $-1,%r12d
- pinsrw $4,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %eax,(%rdi,%rcx,4)
- orl %r11d,%r12d
- addl 24(%r15),%r10d
- addb %dl,%al
- movl 108(%rsi),%ebx
- addl $2734768916,%r10d
- movzbl %al,%eax
- xorl %r8d,%r12d
- movl %edx,104(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $15,%r10d
- movl $-1,%r12d
- pinsrw $5,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- orl %r10d,%r12d
- addl 52(%r15),%r9d
- addb %dl,%bl
- movl 112(%rsi),%eax
- addl $1309151649,%r9d
- movzbl %bl,%ebx
- xorl %r11d,%r12d
- movl %edx,108(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $21,%r9d
- movl $-1,%r12d
- pinsrw $5,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movl (%rdi,%rcx,4),%edx
- xorl %r11d,%r12d
- movl %eax,(%rdi,%rcx,4)
- orl %r9d,%r12d
- addl 16(%r15),%r8d
- addb %dl,%al
- movl 116(%rsi),%ebx
- addl $4149444226,%r8d
- movzbl %al,%eax
- xorl %r10d,%r12d
- movl %edx,112(%rsi)
- addl %r12d,%r8d
- addb %bl,%cl
- roll $6,%r8d
- movl $-1,%r12d
- pinsrw $6,(%rdi,%rax,4),%xmm0
-
- addl %r9d,%r8d
- movl (%rdi,%rcx,4),%edx
- xorl %r10d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- orl %r8d,%r12d
- addl 44(%r15),%r11d
- addb %dl,%bl
- movl 120(%rsi),%eax
- addl $3174756917,%r11d
- movzbl %bl,%ebx
- xorl %r9d,%r12d
- movl %edx,116(%rsi)
- addl %r12d,%r11d
- addb %al,%cl
- roll $10,%r11d
- movl $-1,%r12d
- pinsrw $6,(%rdi,%rbx,4),%xmm1
-
- addl %r8d,%r11d
- movl (%rdi,%rcx,4),%edx
- xorl %r9d,%r12d
- movl %eax,(%rdi,%rcx,4)
- orl %r11d,%r12d
- addl 8(%r15),%r10d
- addb %dl,%al
- movl 124(%rsi),%ebx
- addl $718787259,%r10d
- movzbl %al,%eax
- xorl %r8d,%r12d
- movl %edx,120(%rsi)
- addl %r12d,%r10d
- addb %bl,%cl
- roll $15,%r10d
- movl $-1,%r12d
- pinsrw $7,(%rdi,%rax,4),%xmm0
-
- addl %r11d,%r10d
- movdqu 48(%r13),%xmm5
- addb $32,%bpl
- movl (%rdi,%rcx,4),%edx
- xorl %r8d,%r12d
- movl %ebx,(%rdi,%rcx,4)
- orl %r10d,%r12d
- addl 36(%r15),%r9d
- addb %dl,%bl
- movl 0(%rdi,%rbp,4),%eax
- addl $3951481745,%r9d
- movzbl %bl,%ebx
- xorl %r11d,%r12d
- movl %edx,124(%rsi)
- addl %r12d,%r9d
- addb %al,%cl
- roll $21,%r9d
- movl $-1,%r12d
- pinsrw $7,(%rdi,%rbx,4),%xmm1
-
- addl %r10d,%r9d
- movq %rbp,%rsi
- xorq %rbp,%rbp
- movb %sil,%bpl
- movq %rcx,%rsi
- xorq %rcx,%rcx
- movb %sil,%cl
- leaq (%rdi,%rbp,4),%rsi
- psllq $8,%xmm1
- pxor %xmm0,%xmm5
- pxor %xmm1,%xmm5
- addl 0(%rsp),%r8d
- addl 4(%rsp),%r9d
- addl 8(%rsp),%r10d
- addl 12(%rsp),%r11d
-
- movdqu %xmm2,(%r14,%r13,1)
- movdqu %xmm3,16(%r14,%r13,1)
- movdqu %xmm4,32(%r14,%r13,1)
- movdqu %xmm5,48(%r14,%r13,1)
- leaq 64(%r15),%r15
- leaq 64(%r13),%r13
- cmpq 16(%rsp),%r15
- jb L$oop
-
- movq 24(%rsp),%r12
- subb %al,%cl
- movl %r8d,0(%r12)
- movl %r9d,4(%r12)
- movl %r10d,8(%r12)
- movl %r11d,12(%r12)
- subb $1,%bpl
- movl %ebp,-8(%rdi)
- movl %ecx,-4(%rdi)
-
- movq 40(%rsp),%r15
- movq 48(%rsp),%r14
- movq 56(%rsp),%r13
- movq 64(%rsp),%r12
- movq 72(%rsp),%rbp
- movq 80(%rsp),%rbx
- leaq 88(%rsp),%rsp
-L$epilogue:
-L$abort:
- .byte 0xf3,0xc3
-
-#endif
diff --git a/mac-x86_64/crypto/sha/sha1-x86_64.S b/mac-x86_64/crypto/sha/sha1-x86_64.S
index 044dc5b7..0509d451 100644
--- a/mac-x86_64/crypto/sha/sha1-x86_64.S
+++ b/mac-x86_64/crypto/sha/sha1-x86_64.S
@@ -12,6 +12,11 @@ _sha1_block_data_order:
movl _OPENSSL_ia32cap_P+8(%rip),%r10d
testl $512,%r8d
jz L$ialu
+ andl $268435456,%r8d
+ andl $1073741824,%r9d
+ orl %r9d,%r8d
+ cmpl $1342177280,%r8d
+ je _avx_shortcut
jmp _ssse3_shortcut
.p2align 4
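This first sha1-x86_64.S hunk comes from upstream ce7ae6f ("Enable AVX code for SHA-*"): the entry point now masks two OPENSSL_ia32cap_P bits (0x10000000 and 0x40000000) and branches to the new _avx_shortcut when both are set, otherwise it keeps taking the existing SSSE3 shortcut. A rough C analogue of that dispatch pattern — a sketch using the GCC/Clang __builtin_cpu_supports() helper instead of BoringSSL's capability vector, with hypothetical function names standing in for the assembly entry points:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical per-ISA implementations (stand-ins for the AVX, SSSE3 and
 * plain integer paths in the assembly). */
void sha1_blocks_avx(uint32_t state[5], const uint8_t *data, size_t nblocks);
void sha1_blocks_ssse3(uint32_t state[5], const uint8_t *data, size_t nblocks);
void sha1_blocks_c(uint32_t state[5], const uint8_t *data, size_t nblocks);

void sha1_blocks(uint32_t state[5], const uint8_t *data, size_t nblocks) {
  if (__builtin_cpu_supports("avx")) {          /* roughly the cmpl/je _avx_shortcut added above */
    sha1_blocks_avx(state, data, nblocks);
  } else if (__builtin_cpu_supports("ssse3")) { /* the existing testl $512 / jmp _ssse3_shortcut */
    sha1_blocks_ssse3(state, data, nblocks);
  } else {
    sha1_blocks_c(state, data, nblocks);        /* L$ialu fallback */
  }
}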
@@ -2407,6 +2412,1122 @@ L$done_ssse3:
L$epilogue_ssse3:
.byte 0xf3,0xc3
+
+.p2align 4
+sha1_block_data_order_avx:
+_avx_shortcut:
+ movq %rsp,%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ leaq -64(%rsp),%rsp
+ vzeroupper
+ movq %rax,%r14
+ andq $-64,%rsp
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ shlq $6,%r10
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r11
+
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl %ebx,%esi
+ movl 16(%r8),%ebp
+ movl %ecx,%edi
+ xorl %edx,%edi
+ andl %edi,%esi
+
+ vmovdqa 64(%r11),%xmm6
+ vmovdqa -64(%r11),%xmm11
+ vmovdqu 0(%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r9
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm11,%xmm0,%xmm4
+ vpaddd %xmm11,%xmm1,%xmm5
+ vpaddd %xmm11,%xmm2,%xmm6
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ jmp L$oop_avx
+.p2align 4
+L$oop_avx:
+ shrdl $2,%ebx,%ebx
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%edi
+ addl 0(%rsp),%ebp
+ vpaddd %xmm3,%xmm11,%xmm9
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm8
+ addl %esi,%ebp
+ andl %ebx,%edi
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm2,%xmm8,%xmm8
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 4(%rsp),%edx
+ vpxor %xmm8,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%edx
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm8
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm10
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%edi
+ addl 8(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm4,%xmm4
+ addl %esi,%ecx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm4,%xmm4
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 12(%rsp),%ebx
+ vpxor %xmm10,%xmm4,%xmm4
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%edi
+ addl 16(%rsp),%eax
+ vpaddd %xmm4,%xmm11,%xmm9
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm8
+ addl %esi,%eax
+ andl %ecx,%edi
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm8,%xmm8
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 20(%rsp),%ebp
+ vpxor %xmm8,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ebp
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm8
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm10
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %ebp,%edi
+ addl 24(%rsp),%edx
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm5,%xmm5
+ addl %esi,%edx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm5,%xmm5
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 28(%rsp),%ecx
+ vpxor %xmm10,%xmm5,%xmm5
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vmovdqa -32(%r11),%xmm11
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ vpaddd %xmm5,%xmm11,%xmm9
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm8
+ addl %esi,%ebx
+ andl %edx,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm8,%xmm8
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ addl 36(%rsp),%eax
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm8
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm10
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%edi
+ addl 40(%rsp),%ebp
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm6,%xmm6
+ addl %esi,%ebp
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 44(%rsp),%edx
+ vpxor %xmm10,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%edi
+ addl 48(%rsp),%ecx
+ vpaddd %xmm6,%xmm11,%xmm9
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm8
+ addl %esi,%ecx
+ andl %ebp,%edi
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm8,%xmm8
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 52(%rsp),%ebx
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm8
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpslldq $12,%xmm7,%xmm10
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%edi
+ addl 56(%rsp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm7,%xmm7
+ addl %esi,%eax
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 60(%rsp),%ebp
+ vpxor %xmm10,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ addl 0(%rsp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm7,%xmm11,%xmm9
+ addl %esi,%edx
+ andl %eax,%edi
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ addl 4(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ addl 8(%rsp),%ebx
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm0,%xmm11,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm1,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm1,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ addl %esi,%eax
+ xorl %edx,%edi
+ vpaddd %xmm1,%xmm11,%xmm9
+ vmovdqa 0(%r11),%xmm11
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm8,%xmm2,%xmm2
+ addl 36(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpor %xmm8,%xmm2,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ vpaddd %xmm2,%xmm11,%xmm9
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ vpxor %xmm0,%xmm4,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %esi,%ecx
+ xorl %eax,%edi
+ vpaddd %xmm3,%xmm11,%xmm9
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpxor %xmm8,%xmm4,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm8,%xmm4,%xmm4
+ addl 12(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm6,%xmm5,%xmm5
+ addl %esi,%edx
+ xorl %ebx,%edi
+ vpaddd %xmm4,%xmm11,%xmm9
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpxor %xmm8,%xmm5,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm8,%xmm5,%xmm5
+ addl 28(%rsp),%eax
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %eax,%edi
+ xorl %ecx,%esi
+ vpaddd %xmm5,%xmm11,%xmm9
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 36(%rsp),%edx
+ vpsrld $30,%xmm6,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 40(%rsp),%ecx
+ andl %eax,%esi
+ vpor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%edi
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 44(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ movl %ebx,%edi
+ xorl %edx,%esi
+ vpaddd %xmm6,%xmm11,%xmm9
+ vmovdqa 32(%r11),%xmm11
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%rsp),%ebp
+ vpsrld $30,%xmm7,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 56(%rsp),%edx
+ andl %ebx,%esi
+ vpor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 60(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ addl 0(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ vpaddd %xmm7,%xmm11,%xmm9
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 4(%rsp),%eax
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%rsp),%ebp
+ andl %ecx,%esi
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%edi
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 12(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ vpxor %xmm2,%xmm1,%xmm1
+ movl %edx,%edi
+ xorl %eax,%esi
+ vpaddd %xmm0,%xmm11,%xmm9
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 20(%rsp),%ebx
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 24(%rsp),%eax
+ andl %edx,%esi
+ vpor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%edi
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ vpaddd %xmm1,%xmm11,%xmm9
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ vpxor %xmm8,%xmm2,%xmm2
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 36(%rsp),%ecx
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 40(%rsp),%ebx
+ andl %ebp,%esi
+ vpor %xmm8,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 44(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm2,%xmm11,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ vpaddd %xmm3,%xmm11,%xmm9
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm9,48(%rsp)
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ cmpq %r10,%r9
+ je L$done_avx
+ vmovdqa 64(%r11),%xmm6
+ vmovdqa -64(%r11),%xmm11
+ vmovdqu 0(%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r9
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm11,%xmm0,%xmm4
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm4,0(%rsp)
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpaddd %xmm11,%xmm1,%xmm5
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vmovdqa %xmm5,16(%rsp)
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm11,%xmm2,%xmm6
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vmovdqa %xmm6,32(%rsp)
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ addl 12(%r8),%edx
+ movl %eax,0(%r8)
+ addl 16(%r8),%ebp
+ movl %esi,4(%r8)
+ movl %esi,%ebx
+ movl %ecx,8(%r8)
+ movl %ecx,%edi
+ movl %edx,12(%r8)
+ xorl %edx,%edi
+ movl %ebp,16(%r8)
+ andl %edi,%esi
+ jmp L$oop_avx
+
+.p2align 4
+L$done_avx:
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vzeroupper
+
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ movl %eax,0(%r8)
+ addl 12(%r8),%edx
+ movl %esi,4(%r8)
+ addl 16(%r8),%ebp
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ leaq (%r14),%rsi
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
+L$epilogue_avx:
+ .byte 0xf3,0xc3
+
.p2align 6
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
diff --git a/mac-x86_64/crypto/sha/sha256-x86_64.S b/mac-x86_64/crypto/sha/sha256-x86_64.S
index da02d4c2..0146ff5c 100644
--- a/mac-x86_64/crypto/sha/sha256-x86_64.S
+++ b/mac-x86_64/crypto/sha/sha256-x86_64.S
@@ -11,6 +11,11 @@ _sha256_block_data_order:
movl 0(%r11),%r9d
movl 4(%r11),%r10d
movl 8(%r11),%r11d
+ andl $1073741824,%r9d
+ andl $268435968,%r10d
+ orl %r9d,%r10d
+ cmpl $1342177792,%r10d
+ je L$avx_shortcut
testl $512,%r10d
jnz L$ssse3_shortcut
pushq %rbx
@@ -2840,4 +2845,1061 @@ L$ssse3_00_47:
L$epilogue_ssse3:
.byte 0xf3,0xc3
+
+.p2align 6
+sha256_block_data_order_avx:
+L$avx_shortcut:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%r11
+ shlq $4,%rdx
+ subq $96,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %r11,64+24(%rsp)
+L$prologue_avx:
+
+ vzeroupper
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ vmovdqa K256+512+32(%rip),%xmm8
+ vmovdqa K256+512+64(%rip),%xmm9
+ jmp L$loop_avx
+.p2align 4
+L$loop_avx:
+ vmovdqa K256+512(%rip),%xmm7
+ vmovdqu 0(%rsi),%xmm0
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm7,%xmm0,%xmm0
+ leaq K256(%rip),%rbp
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd 0(%rbp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 32(%rbp),%xmm1,%xmm5
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ vpaddd 96(%rbp),%xmm3,%xmm7
+ vmovdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ vmovdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ vmovdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi
+ vmovdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp L$avx_00_47
+
+.p2align 4
+L$avx_00_47:
+ subq $-128,%rbp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm3,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm0,%xmm0
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm0,%xmm0
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ vpshufd $80,%xmm0,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm0,%xmm0
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 0(%rbp),%xmm0,%xmm6
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,0(%rsp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm1,%xmm1
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm1,%xmm1
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ vpshufd $80,%xmm1,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm1,%xmm1
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 32(%rbp),%xmm1,%xmm6
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,16(%rsp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm2,%xmm2
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm2,%xmm2
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ vpshufd $80,%xmm2,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm2,%xmm2
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,32(%rsp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm3,%xmm3
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm3,%xmm3
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ vpshufd $80,%xmm3,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm3,%xmm3
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 96(%rbp),%xmm3,%xmm6
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)
+ jne L$avx_00_47
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%rdi
+ movl %r14d,%eax
+
+ addl 0(%rdi),%eax
+ leaq 64(%rsi),%rsi
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb L$loop_avx
+
+ movq 64+24(%rsp),%rsi
+ vzeroupper
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+L$epilogue_avx:
+ .byte 0xf3,0xc3
+
#endif
diff --git a/mac-x86_64/crypto/sha/sha512-x86_64.S b/mac-x86_64/crypto/sha/sha512-x86_64.S
index 2f5d9124..aeabd3f4 100644
--- a/mac-x86_64/crypto/sha/sha512-x86_64.S
+++ b/mac-x86_64/crypto/sha/sha512-x86_64.S
@@ -7,6 +7,17 @@
.p2align 4
_sha512_block_data_order:
+ leaq _OPENSSL_ia32cap_P(%rip),%r11
+ movl 0(%r11),%r9d
+ movl 4(%r11),%r10d
+ movl 8(%r11),%r11d
+ testl $2048,%r10d
+ jnz L$xop_shortcut
+ andl $1073741824,%r9d
+ andl $268435968,%r10d
+ orl %r9d,%r10d
+ cmpl $1342177792,%r10d
+ je L$avx_shortcut
pushq %rbx
pushq %rbp
pushq %r12
@@ -1783,4 +1794,2234 @@ K512:
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+
+.p2align 6
+sha512_block_data_order_xop:
+L$xop_shortcut:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%r11
+ shlq $4,%rdx
+ subq $160,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %r11,128+24(%rsp)
+L$prologue_xop:
+
+ vzeroupper
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp L$loop_xop
+.p2align 4
+L$loop_xop:
+ vmovdqa K512+1280(%rip),%xmm11
+ vmovdqu 0(%rsi),%xmm0
+ leaq K512+128(%rip),%rbp
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vpshufb %xmm11,%xmm0,%xmm0
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm11,%xmm1,%xmm1
+ vmovdqu 64(%rsi),%xmm4
+ vpshufb %xmm11,%xmm2,%xmm2
+ vmovdqu 80(%rsi),%xmm5
+ vpshufb %xmm11,%xmm3,%xmm3
+ vmovdqu 96(%rsi),%xmm6
+ vpshufb %xmm11,%xmm4,%xmm4
+ vmovdqu 112(%rsi),%xmm7
+ vpshufb %xmm11,%xmm5,%xmm5
+ vpaddq -128(%rbp),%xmm0,%xmm8
+ vpshufb %xmm11,%xmm6,%xmm6
+ vpaddq -96(%rbp),%xmm1,%xmm9
+ vpshufb %xmm11,%xmm7,%xmm7
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ vpaddq -32(%rbp),%xmm3,%xmm11
+ vmovdqa %xmm8,0(%rsp)
+ vpaddq 0(%rbp),%xmm4,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ vpaddq 32(%rbp),%xmm5,%xmm9
+ vmovdqa %xmm10,32(%rsp)
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ vmovdqa %xmm11,48(%rsp)
+ vpaddq 96(%rbp),%xmm7,%xmm11
+ vmovdqa %xmm8,64(%rsp)
+ movq %rax,%r14
+ vmovdqa %xmm9,80(%rsp)
+ movq %rbx,%rdi
+ vmovdqa %xmm10,96(%rsp)
+ xorq %rcx,%rdi
+ vmovdqa %xmm11,112(%rsp)
+ movq %r8,%r13
+ jmp L$xop_00_47
+
+.p2align 4
+L$xop_00_47:
+ addq $256,%rbp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ rorq $23,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm4,%xmm5,%xmm11
+ movq %r9,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %rax,%r14
+ vpaddq %xmm11,%xmm0,%xmm0
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+.byte 143,72,120,195,209,7
+ xorq %r10,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %rbx,%r15
+ addq %r12,%r11
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,223,3
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm7,%xmm10
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpaddq %xmm8,%xmm0,%xmm0
+ movq %rdx,%r13
+ addq %r11,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%r11
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %r8,%r12
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpaddq %xmm11,%xmm0,%xmm0
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ rorq $6,%r14
+ vpaddq -128(%rbp),%xmm0,%xmm10
+ xorq %rax,%rdi
+ addq %r12,%r10
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ rorq $28,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,0(%rsp)
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ rorq $23,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm5,%xmm6,%xmm11
+ movq %rdx,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %r10,%r14
+ vpaddq %xmm11,%xmm1,%xmm1
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+.byte 143,72,120,195,209,7
+ xorq %r8,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %r11,%r15
+ addq %r12,%r9
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,216,3
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm0,%xmm10
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpaddq %xmm8,%xmm1,%xmm1
+ movq %rbx,%r13
+ addq %r9,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%r9
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %rcx,%r12
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpaddq %xmm11,%xmm1,%xmm1
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ rorq $6,%r14
+ vpaddq -96(%rbp),%xmm1,%xmm10
+ xorq %r10,%rdi
+ addq %r12,%r8
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ rorq $28,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,16(%rsp)
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ rorq $23,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm6,%xmm7,%xmm11
+ movq %rbx,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %r8,%r14
+ vpaddq %xmm11,%xmm2,%xmm2
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+.byte 143,72,120,195,209,7
+ xorq %rcx,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %r9,%r15
+ addq %r12,%rdx
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,217,3
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm1,%xmm10
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpaddq %xmm8,%xmm2,%xmm2
+ movq %r11,%r13
+ addq %rdx,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%rdx
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %rax,%r12
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpaddq %xmm11,%xmm2,%xmm2
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ rorq $6,%r14
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ rorq $28,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,32(%rsp)
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ rorq $23,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm7,%xmm0,%xmm11
+ movq %r11,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %rcx,%r14
+ vpaddq %xmm11,%xmm3,%xmm3
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+.byte 143,72,120,195,209,7
+ xorq %rax,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,218,3
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm2,%xmm10
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpaddq %xmm8,%xmm3,%xmm3
+ movq %r9,%r13
+ addq %rbx,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%rbx
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %r10,%r12
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpaddq %xmm11,%xmm3,%xmm3
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ rorq $6,%r14
+ vpaddq -32(%rbp),%xmm3,%xmm10
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ rorq $28,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,48(%rsp)
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ rorq $23,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm0,%xmm1,%xmm11
+ movq %r9,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %rax,%r14
+ vpaddq %xmm11,%xmm4,%xmm4
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+.byte 143,72,120,195,209,7
+ xorq %r10,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %rbx,%r15
+ addq %r12,%r11
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,219,3
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm3,%xmm10
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpaddq %xmm8,%xmm4,%xmm4
+ movq %rdx,%r13
+ addq %r11,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%r11
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %r8,%r12
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpaddq %xmm11,%xmm4,%xmm4
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ rorq $6,%r14
+ vpaddq 0(%rbp),%xmm4,%xmm10
+ xorq %rax,%rdi
+ addq %r12,%r10
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ rorq $28,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,64(%rsp)
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ rorq $23,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm1,%xmm2,%xmm11
+ movq %rdx,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %r10,%r14
+ vpaddq %xmm11,%xmm5,%xmm5
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+.byte 143,72,120,195,209,7
+ xorq %r8,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %r11,%r15
+ addq %r12,%r9
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,220,3
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm4,%xmm10
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpaddq %xmm8,%xmm5,%xmm5
+ movq %rbx,%r13
+ addq %r9,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%r9
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %rcx,%r12
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpaddq %xmm11,%xmm5,%xmm5
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ rorq $6,%r14
+ vpaddq 32(%rbp),%xmm5,%xmm10
+ xorq %r10,%rdi
+ addq %r12,%r8
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ rorq $28,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,80(%rsp)
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ rorq $23,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm2,%xmm3,%xmm11
+ movq %rbx,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %r8,%r14
+ vpaddq %xmm11,%xmm6,%xmm6
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+.byte 143,72,120,195,209,7
+ xorq %rcx,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %r9,%r15
+ addq %r12,%rdx
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,221,3
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm5,%xmm10
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpaddq %xmm8,%xmm6,%xmm6
+ movq %r11,%r13
+ addq %rdx,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%rdx
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %rax,%r12
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpaddq %xmm11,%xmm6,%xmm6
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ rorq $6,%r14
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ rorq $28,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,96(%rsp)
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ rorq $23,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm3,%xmm4,%xmm11
+ movq %r11,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %rcx,%r14
+ vpaddq %xmm11,%xmm7,%xmm7
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+.byte 143,72,120,195,209,7
+ xorq %rax,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,222,3
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm6,%xmm10
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpaddq %xmm8,%xmm7,%xmm7
+ movq %r9,%r13
+ addq %rbx,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%rbx
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %r10,%r12
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpaddq %xmm11,%xmm7,%xmm7
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ rorq $6,%r14
+ vpaddq 96(%rbp),%xmm7,%xmm10
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ rorq $28,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,112(%rsp)
+ cmpb $0,135(%rbp)
+ jne L$xop_00_47
+ rorq $23,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ rorq $5,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ rorq $4,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ rorq $6,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ rorq $28,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ rorq $23,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ rorq $4,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ rorq $6,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ rorq $28,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ rorq $23,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ rorq $5,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ rorq $4,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ rorq $6,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ rorq $28,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ rorq $23,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ rorq $4,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ rorq $6,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ rorq $28,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ rorq $23,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ rorq $5,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ rorq $4,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ rorq $6,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ rorq $28,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ rorq $23,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ rorq $4,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ rorq $6,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ rorq $28,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ rorq $23,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ rorq $5,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ rorq $4,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ rorq $6,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ rorq $28,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ rorq $23,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ rorq $4,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ rorq $6,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ rorq $28,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ rorq $23,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ rorq $5,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ rorq $4,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ rorq $6,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ rorq $28,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ rorq $23,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ rorq $4,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ rorq $6,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ rorq $28,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ rorq $23,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ rorq $5,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ rorq $4,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ rorq $6,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ rorq $28,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ rorq $23,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ rorq $4,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ rorq $6,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ rorq $28,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ rorq $23,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ rorq $5,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ rorq $4,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ rorq $6,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ rorq $28,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ rorq $23,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ rorq $4,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ rorq $6,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ rorq $28,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ rorq $23,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ rorq $5,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ rorq $4,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ rorq $6,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ rorq $28,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ rorq $23,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ rorq $4,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ rorq $6,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ rorq $28,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ movq 128+0(%rsp),%rdi
+ movq %r14,%rax
+
+ addq 0(%rdi),%rax
+ leaq 128(%rsi),%rsi
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb L$loop_xop
+
+ movq 128+24(%rsp),%rsi
+ vzeroupper
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+L$epilogue_xop:
+ .byte 0xf3,0xc3
+
+
+.p2align 6
+sha512_block_data_order_avx:
+L$avx_shortcut:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movq %rsp,%r11
+ shlq $4,%rdx
+ subq $160,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %r11,128+24(%rsp)
+L$prologue_avx:
+
+ vzeroupper
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp L$loop_avx
+.p2align 4
+L$loop_avx:
+ vmovdqa K512+1280(%rip),%xmm11
+ vmovdqu 0(%rsi),%xmm0
+ leaq K512+128(%rip),%rbp
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vpshufb %xmm11,%xmm0,%xmm0
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm11,%xmm1,%xmm1
+ vmovdqu 64(%rsi),%xmm4
+ vpshufb %xmm11,%xmm2,%xmm2
+ vmovdqu 80(%rsi),%xmm5
+ vpshufb %xmm11,%xmm3,%xmm3
+ vmovdqu 96(%rsi),%xmm6
+ vpshufb %xmm11,%xmm4,%xmm4
+ vmovdqu 112(%rsi),%xmm7
+ vpshufb %xmm11,%xmm5,%xmm5
+ vpaddq -128(%rbp),%xmm0,%xmm8
+ vpshufb %xmm11,%xmm6,%xmm6
+ vpaddq -96(%rbp),%xmm1,%xmm9
+ vpshufb %xmm11,%xmm7,%xmm7
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ vpaddq -32(%rbp),%xmm3,%xmm11
+ vmovdqa %xmm8,0(%rsp)
+ vpaddq 0(%rbp),%xmm4,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ vpaddq 32(%rbp),%xmm5,%xmm9
+ vmovdqa %xmm10,32(%rsp)
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ vmovdqa %xmm11,48(%rsp)
+ vpaddq 96(%rbp),%xmm7,%xmm11
+ vmovdqa %xmm8,64(%rsp)
+ movq %rax,%r14
+ vmovdqa %xmm9,80(%rsp)
+ movq %rbx,%rdi
+ vmovdqa %xmm10,96(%rsp)
+ xorq %rcx,%rdi
+ vmovdqa %xmm11,112(%rsp)
+ movq %r8,%r13
+ jmp L$avx_00_47
+
+.p2align 4
+L$avx_00_47:
+ addq $256,%rbp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm4,%xmm5,%xmm11
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpaddq %xmm11,%xmm0,%xmm0
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r8,%r12
+ xorq %r8,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rbx,%r15
+ addq %r12,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm7,%xmm11
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rdx,%r13
+ addq %r11,%r14
+ vpsllq $3,%xmm7,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ vpaddq %xmm8,%xmm0,%xmm0
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm7,%xmm9
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rax,%rdi
+ addq %r12,%r10
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm0,%xmm0
+ xorq %r11,%r14
+ addq %r13,%r10
+ vpaddq -128(%rbp),%xmm0,%xmm10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,0(%rsp)
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm5,%xmm6,%xmm11
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpaddq %xmm11,%xmm1,%xmm1
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r11,%r15
+ addq %r12,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm0,%xmm11
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rbx,%r13
+ addq %r9,%r14
+ vpsllq $3,%xmm0,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ vpaddq %xmm8,%xmm1,%xmm1
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm0,%xmm9
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r10,%rdi
+ addq %r12,%r8
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm1,%xmm1
+ xorq %r9,%r14
+ addq %r13,%r8
+ vpaddq -96(%rbp),%xmm1,%xmm10
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,16(%rsp)
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm6,%xmm7,%xmm11
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpaddq %xmm11,%xmm2,%xmm2
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rax,%r12
+ xorq %rax,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r9,%r15
+ addq %r12,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm1,%xmm11
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r11,%r13
+ addq %rdx,%r14
+ vpsllq $3,%xmm1,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ vpaddq %xmm8,%xmm2,%xmm2
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm1,%xmm9
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm2,%xmm2
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,32(%rsp)
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm7,%xmm0,%xmm11
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpaddq %xmm11,%xmm3,%xmm3
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r10,%r12
+ xorq %r10,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm2,%xmm11
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r9,%r13
+ addq %rbx,%r14
+ vpsllq $3,%xmm2,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ vpaddq %xmm8,%xmm3,%xmm3
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm2,%xmm9
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm3,%xmm3
+ xorq %rbx,%r14
+ addq %r13,%rax
+ vpaddq -32(%rbp),%xmm3,%xmm10
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,48(%rsp)
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm0,%xmm1,%xmm11
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpaddq %xmm11,%xmm4,%xmm4
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r8,%r12
+ xorq %r8,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rbx,%r15
+ addq %r12,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm3,%xmm11
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rdx,%r13
+ addq %r11,%r14
+ vpsllq $3,%xmm3,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ vpaddq %xmm8,%xmm4,%xmm4
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm3,%xmm9
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rax,%rdi
+ addq %r12,%r10
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm4,%xmm4
+ xorq %r11,%r14
+ addq %r13,%r10
+ vpaddq 0(%rbp),%xmm4,%xmm10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,64(%rsp)
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm1,%xmm2,%xmm11
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpaddq %xmm11,%xmm5,%xmm5
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r11,%r15
+ addq %r12,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm4,%xmm11
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rbx,%r13
+ addq %r9,%r14
+ vpsllq $3,%xmm4,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ vpaddq %xmm8,%xmm5,%xmm5
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm4,%xmm9
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r10,%rdi
+ addq %r12,%r8
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm5,%xmm5
+ xorq %r9,%r14
+ addq %r13,%r8
+ vpaddq 32(%rbp),%xmm5,%xmm10
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,80(%rsp)
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm2,%xmm3,%xmm11
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpaddq %xmm11,%xmm6,%xmm6
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rax,%r12
+ xorq %rax,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r9,%r15
+ addq %r12,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm5,%xmm11
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r11,%r13
+ addq %rdx,%r14
+ vpsllq $3,%xmm5,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ vpaddq %xmm8,%xmm6,%xmm6
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm5,%xmm9
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm6,%xmm6
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,96(%rsp)
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm3,%xmm4,%xmm11
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpaddq %xmm11,%xmm7,%xmm7
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r10,%r12
+ xorq %r10,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm6,%xmm11
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r9,%r13
+ addq %rbx,%r14
+ vpsllq $3,%xmm6,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ vpaddq %xmm8,%xmm7,%xmm7
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm6,%xmm9
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm7,%xmm7
+ xorq %rbx,%r14
+ addq %r13,%rax
+ vpaddq 96(%rbp),%xmm7,%xmm10
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,112(%rsp)
+ cmpb $0,135(%rbp)
+ jne L$avx_00_47
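+ /* Annotation: the branch above repeats the combined schedule+compression pass     */
+ /* while the byte tested in the constant table is nonzero; once it reads zero,     */
+ /* fall through to the final 16 rounds, which only consume the W[t]+K[t] values    */
+ /* already stored on the stack, with no further vector work.                       */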
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
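+ /* Annotation: end of the 80 rounds for this block. Reload the state pointer saved */
+ /* at 128(%rsp), add the working variables back into the hash state at 0-56(%rdi), */
+ /* advance the input pointer by one 128-byte block, and loop while input remains.  */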
+ movq 128+0(%rsp),%rdi
+ movq %r14,%rax
+
+ addq 0(%rdi),%rax
+ leaq 128(%rsi),%rsi
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb L$loop_avx
+
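+ /* Annotation: epilogue. Restore the caller's stack pointer and the callee-saved   */
+ /* registers saved in the prologue; vzeroupper avoids AVX/SSE transition penalties */
+ /* and .byte 0xf3,0xc3 encodes the closing "rep ret".                              */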
+ movq 128+24(%rsp),%rsi
+ vzeroupper
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+L$epilogue_avx:
+ .byte 0xf3,0xc3
+
#endif