summaryrefslogtreecommitdiff
path: root/linux-x86_64
diff options
context:
space:
mode:
author Daulet Zhanguzin <dauletz@google.com> 2020-08-12 12:46:27 +0100
committer Daulet Zhanguzin <dauletz@google.com> 2020-08-24 12:46:36 +0100
commit c960c43412e0632abb712fc465e70b6dfa2e9657 (patch)
tree c48455b0f878c45496c3390cf2e88b413dc85a34 /linux-x86_64
parent ba9db8781ec11859132f543aebff5d6093214751 (diff)
download boringssl-c960c43412e0632abb712fc465e70b6dfa2e9657.tar.gz
external/boringssl: Sync to a0b49d63fdc33e54eac93674c86891d15d181d87.
This includes the following changes:
https://boringssl.googlesource.com/boringssl/+log/2fb729d4f36beaf263ad85e24a790b571652679c..a0b49d63fdc33e54eac93674c86891d15d181d87

Test: atest CtsLibcoreTestCases
Change-Id: Ida4794d56d237422351b9ddcc7d0bc9295e816e1
Diffstat (limited to 'linux-x86_64')
-rw-r--r-- linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S 2
-rw-r--r-- linux-x86_64/crypto/fipsmodule/ghash-x86_64.S 2
-rw-r--r-- linux-x86_64/crypto/fipsmodule/sha1-x86_64.S 1867
3 files changed, 1868 insertions, 3 deletions
diff --git a/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S b/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S
index e3133488..aefa5432 100644
--- a/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S
+++ b/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S
@@ -3935,7 +3935,7 @@ do_length_block:
popq %rbp
.cfi_adjust_cfa_offset -8
.byte 0xf3,0xc3
-.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
+.cfi_adjust_cfa_offset (8 * 7) + 288 + 32
seal_sse_128:
movdqu .chacha20_consts(%rip),%xmm0
diff --git a/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S b/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S
index 8fff7eb0..3eb1af43 100644
--- a/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S
+++ b/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S
@@ -1119,8 +1119,6 @@ gcm_ghash_avx:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
.long 7,0,7,0
-.L7_mask_poly:
-.long 7,0,450,0
.align 64
.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S b/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S
index a4ce81ff..964687dc 100644
--- a/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S
+++ b/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S
@@ -27,6 +27,11 @@ sha1_block_data_order:
movl 8(%r10),%r10d
testl $512,%r8d
jz .Lialu
+ testl $536870912,%r10d
+ jnz _shaext_shortcut
+ andl $296,%r10d
+ cmpl $296,%r10d
+ je _avx2_shortcut
andl $268435456,%r8d
andl $1073741824,%r9d
orl %r9d,%r8d
@@ -1266,6 +1271,175 @@ sha1_block_data_order:
.byte 0xf3,0xc3
.cfi_endproc
.size sha1_block_data_order,.-sha1_block_data_order
+.type sha1_block_data_order_shaext,@function  # SHA-1 block routine using the SHA extension opcodes, hand-encoded as .byte below
+.align 32
+sha1_block_data_order_shaext:  # in: rdi = 20-byte state, rsi = input data, rdx = number of 64-byte blocks
+_shaext_shortcut:  # alias for the same address; target of the jnz in the sha1_block_data_order dispatch code
+.cfi_startproc
+ movdqu (%rdi),%xmm0  # xmm0 = first four 32-bit state words
+ movd 16(%rdi),%xmm1  # xmm1 low dword = fifth state word (E)
+ movdqa K_XX_XX+160(%rip),%xmm3  # xmm3 = pshufb control mask; presumably the byte-swap mask (table is defined elsewhere - confirm)
+  # Uses rsi, rdx, r8, xmm0-xmm9 and flags; no stack and no callee-saved GPRs. There is no check for rdx == 0 in this routine.
+ movdqu (%rsi),%xmm4  # message bytes 0..15 of the first block
+ pshufd $27,%xmm0,%xmm0  # imm 27 = 0b00011011: reverse the dword order of the state
+ movdqu 16(%rsi),%xmm5  # message bytes 16..31
+ pshufd $27,%xmm1,%xmm1  # same reversal moves E from the low dword to the high dword
+ movdqu 32(%rsi),%xmm6  # message bytes 32..47
+.byte 102,15,56,0,227  # pshufb %xmm3,%xmm4
+ movdqu 48(%rsi),%xmm7  # message bytes 48..63
+.byte 102,15,56,0,235  # pshufb %xmm3,%xmm5
+.byte 102,15,56,0,243  # pshufb %xmm3,%xmm6
+ movdqa %xmm1,%xmm9  # xmm9 = copy of E as it is at the start of this block
+.byte 102,15,56,0,251  # pshufb %xmm3,%xmm7
+ jmp .Loop_shaext
+  # At loop top: xmm0 = state words, xmm1 = xmm9 = E, xmm4..xmm7 = shuffled message of the current block.
+.align 16
+.Loop_shaext:
+ decq %rdx  # one block consumed; ZF set here is used by cmovneq below and by the jnz at the loop bottom
+ leaq 64(%rsi),%r8  # r8 = address of the following block (lea leaves flags intact)
+ paddd %xmm4,%xmm1  # add message words 0..3 into the E vector
+ cmovneq %r8,%rsi  # advance rsi only if blocks remain; on the last block rsi stays put so the reloads below stay in bounds
+ movdqa %xmm0,%xmm8  # xmm8 = state at block start, added back at the loop bottom
+.byte 15,56,201,229  # sha1msg1 %xmm5,%xmm4
+ movdqa %xmm0,%xmm2  # keep pre-round state for the next sha1nexte
+.byte 15,58,204,193,0  # sha1rnds4 $0,%xmm1,%xmm0 (rounds 0-3)
+.byte 15,56,200,213  # sha1nexte %xmm5,%xmm2
+ pxor %xmm6,%xmm4  # xor step of the message schedule
+.byte 15,56,201,238  # sha1msg1 %xmm6,%xmm5
+.byte 15,56,202,231  # sha1msg2 %xmm7,%xmm4
+  # xmm1 and xmm2 alternate below as the E operand; xmm4..xmm7 rotate as the message schedule window.
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0  # sha1rnds4 $0,%xmm2,%xmm0 (rounds 4-7)
+.byte 15,56,200,206  # sha1nexte %xmm6,%xmm1
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236  # sha1msg2 %xmm4,%xmm5
+.byte 15,56,201,247  # sha1msg1 %xmm7,%xmm6
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0  # sha1rnds4 $0,%xmm1,%xmm0 (rounds 8-11)
+.byte 15,56,200,215  # sha1nexte %xmm7,%xmm2
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252  # sha1msg1 %xmm4,%xmm7
+.byte 15,56,202,245  # sha1msg2 %xmm5,%xmm6
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0  # sha1rnds4 $0,%xmm2,%xmm0 (rounds 12-15)
+.byte 15,56,200,204  # sha1nexte %xmm4,%xmm1
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254  # sha1msg2 %xmm6,%xmm7
+.byte 15,56,201,229  # sha1msg1 %xmm5,%xmm4
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0  # sha1rnds4 $0,%xmm1,%xmm0 (rounds 16-19, last use of immediate 0)
+.byte 15,56,200,213  # sha1nexte %xmm5,%xmm2
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238  # sha1msg1 %xmm6,%xmm5
+.byte 15,56,202,231  # sha1msg2 %xmm7,%xmm4
+  # Immediate 1 group: rounds 20-39.
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1  # sha1rnds4 $1,%xmm2,%xmm0 (rounds 20-23)
+.byte 15,56,200,206  # sha1nexte %xmm6,%xmm1
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236  # sha1msg2 %xmm4,%xmm5
+.byte 15,56,201,247  # sha1msg1 %xmm7,%xmm6
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1  # sha1rnds4 $1,%xmm1,%xmm0 (rounds 24-27)
+.byte 15,56,200,215  # sha1nexte %xmm7,%xmm2
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252  # sha1msg1 %xmm4,%xmm7
+.byte 15,56,202,245  # sha1msg2 %xmm5,%xmm6
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1  # sha1rnds4 $1,%xmm2,%xmm0 (rounds 28-31)
+.byte 15,56,200,204  # sha1nexte %xmm4,%xmm1
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254  # sha1msg2 %xmm6,%xmm7
+.byte 15,56,201,229  # sha1msg1 %xmm5,%xmm4
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1  # sha1rnds4 $1,%xmm1,%xmm0 (rounds 32-35)
+.byte 15,56,200,213  # sha1nexte %xmm5,%xmm2
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238  # sha1msg1 %xmm6,%xmm5
+.byte 15,56,202,231  # sha1msg2 %xmm7,%xmm4
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1  # sha1rnds4 $1,%xmm2,%xmm0 (rounds 36-39, last use of immediate 1)
+.byte 15,56,200,206  # sha1nexte %xmm6,%xmm1
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236  # sha1msg2 %xmm4,%xmm5
+.byte 15,56,201,247  # sha1msg1 %xmm7,%xmm6
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2  # sha1rnds4 $2,%xmm1,%xmm0 (rounds 40-43, first use of immediate 2)
+.byte 15,56,200,215  # sha1nexte %xmm7,%xmm2
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252  # sha1msg1 %xmm4,%xmm7
+.byte 15,56,202,245  # sha1msg2 %xmm5,%xmm6
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2  # sha1rnds4 $2,%xmm2,%xmm0 (rounds 44-47)
+.byte 15,56,200,204  # sha1nexte %xmm4,%xmm1
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254  # sha1msg2 %xmm6,%xmm7
+.byte 15,56,201,229  # sha1msg1 %xmm5,%xmm4
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2  # sha1rnds4 $2,%xmm1,%xmm0 (rounds 48-51)
+.byte 15,56,200,213  # sha1nexte %xmm5,%xmm2
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238  # sha1msg1 %xmm6,%xmm5
+.byte 15,56,202,231  # sha1msg2 %xmm7,%xmm4
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2  # sha1rnds4 $2,%xmm2,%xmm0 (rounds 52-55)
+.byte 15,56,200,206  # sha1nexte %xmm6,%xmm1
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236  # sha1msg2 %xmm4,%xmm5
+.byte 15,56,201,247  # sha1msg1 %xmm7,%xmm6 (last sha1msg1 for this block)
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2  # sha1rnds4 $2,%xmm1,%xmm0 (rounds 56-59, last use of immediate 2)
+.byte 15,56,200,215  # sha1nexte %xmm7,%xmm2
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252  # sha1msg1 %xmm4,%xmm7
+.byte 15,56,202,245  # sha1msg2 %xmm5,%xmm6
+  # Immediate 3 group: rounds 60-79, interleaved with loading and shuffling the next block into xmm4..xmm7.
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3  # sha1rnds4 $3,%xmm2,%xmm0 (rounds 60-63)
+.byte 15,56,200,204  # sha1nexte %xmm4,%xmm1
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254  # sha1msg2 %xmm6,%xmm7
+ movdqu (%rsi),%xmm4  # xmm4 is free now: preload bytes 0..15 from rsi (next block, or the same block again on the final pass)
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3  # sha1rnds4 $3,%xmm1,%xmm0 (rounds 64-67)
+.byte 15,56,200,213  # sha1nexte %xmm5,%xmm2
+ movdqu 16(%rsi),%xmm5  # preload bytes 16..31
+.byte 102,15,56,0,227  # pshufb %xmm3,%xmm4
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3  # sha1rnds4 $3,%xmm2,%xmm0 (rounds 68-71)
+.byte 15,56,200,206  # sha1nexte %xmm6,%xmm1
+ movdqu 32(%rsi),%xmm6  # preload bytes 32..47
+.byte 102,15,56,0,235  # pshufb %xmm3,%xmm5
+
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3  # sha1rnds4 $3,%xmm1,%xmm0 (rounds 72-75)
+.byte 15,56,200,215  # sha1nexte %xmm7,%xmm2
+ movdqu 48(%rsi),%xmm7  # preload bytes 48..63
+.byte 102,15,56,0,243  # pshufb %xmm3,%xmm6
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3  # sha1rnds4 $3,%xmm2,%xmm0 (rounds 76-79)
+.byte 65,15,56,200,201  # sha1nexte %xmm9,%xmm1 (REX.B prefix selects xmm9): combine the block-start E with the new E
+.byte 102,15,56,0,251  # pshufb %xmm3,%xmm7
+
+ paddd %xmm8,%xmm0  # add the block-start state words back in
+ movdqa %xmm1,%xmm9  # updated E becomes the saved E for the next block
+
+ jnz .Loop_shaext  # ZF is still the one from decq at the loop top; nothing in between writes flags
+  # All blocks done: undo the dword reversal and write the state back.
+ pshufd $27,%xmm0,%xmm0  # restore memory dword order
+ pshufd $27,%xmm1,%xmm1  # bring E back to the low dword for the movd store
+ movdqu %xmm0,(%rdi)  # store first four state words
+ movd %xmm1,16(%rdi)  # store fifth state word
+ .byte 0xf3,0xc3  # rep ret
+.cfi_endproc
+.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
.type sha1_block_data_order_ssse3,@function
.align 16
sha1_block_data_order_ssse3:
@@ -3582,6 +3756,1699 @@ _avx_shortcut:
.byte 0xf3,0xc3
.cfi_endproc
.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
+.type sha1_block_data_order_avx2,@function
+.align 16
+sha1_block_data_order_avx2:
+_avx2_shortcut:
+.cfi_startproc
+ movq %rsp,%r11
+.cfi_def_cfa_register %r11
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ vzeroupper
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ leaq -640(%rsp),%rsp
+ shlq $6,%r10
+ leaq 64(%r9),%r13
+ andq $-128,%rsp
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r14
+
+ movl 0(%r8),%eax
+ cmpq %r10,%r13
+ cmovaeq %r9,%r13
+ movl 4(%r8),%ebp
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl 16(%r8),%esi
+ vmovdqu 64(%r14),%ymm6
+
+ vmovdqu (%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ leaq 64(%r9),%r9
+ vinserti128 $1,(%r13),%ymm0,%ymm0
+ vinserti128 $1,16(%r13),%ymm1,%ymm1
+ vpshufb %ymm6,%ymm0,%ymm0
+ vinserti128 $1,32(%r13),%ymm2,%ymm2
+ vpshufb %ymm6,%ymm1,%ymm1
+ vinserti128 $1,48(%r13),%ymm3,%ymm3
+ vpshufb %ymm6,%ymm2,%ymm2
+ vmovdqu -64(%r14),%ymm11
+ vpshufb %ymm6,%ymm3,%ymm3
+
+ vpaddd %ymm11,%ymm0,%ymm4
+ vpaddd %ymm11,%ymm1,%ymm5
+ vmovdqu %ymm4,0(%rsp)
+ vpaddd %ymm11,%ymm2,%ymm6
+ vmovdqu %ymm5,32(%rsp)
+ vpaddd %ymm11,%ymm3,%ymm7
+ vmovdqu %ymm6,64(%rsp)
+ vmovdqu %ymm7,96(%rsp)
+ vpalignr $8,%ymm0,%ymm1,%ymm4
+ vpsrldq $4,%ymm3,%ymm8
+ vpxor %ymm0,%ymm4,%ymm4
+ vpxor %ymm2,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $31,%ymm4,%ymm8
+ vpslldq $12,%ymm4,%ymm10
+ vpaddd %ymm4,%ymm4,%ymm4
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm4,%ymm4
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm4,%ymm4
+ vpxor %ymm10,%ymm4,%ymm4
+ vpaddd %ymm11,%ymm4,%ymm9
+ vmovdqu %ymm9,128(%rsp)
+ vpalignr $8,%ymm1,%ymm2,%ymm5
+ vpsrldq $4,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm3,%ymm8,%ymm8
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $31,%ymm5,%ymm8
+ vmovdqu -32(%r14),%ymm11
+ vpslldq $12,%ymm5,%ymm10
+ vpaddd %ymm5,%ymm5,%ymm5
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm5,%ymm5
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm5,%ymm9
+ vmovdqu %ymm9,160(%rsp)
+ vpalignr $8,%ymm2,%ymm3,%ymm6
+ vpsrldq $4,%ymm5,%ymm8
+ vpxor %ymm2,%ymm6,%ymm6
+ vpxor %ymm4,%ymm8,%ymm8
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $31,%ymm6,%ymm8
+ vpslldq $12,%ymm6,%ymm10
+ vpaddd %ymm6,%ymm6,%ymm6
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm6,%ymm6
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm6,%ymm6
+ vpxor %ymm10,%ymm6,%ymm6
+ vpaddd %ymm11,%ymm6,%ymm9
+ vmovdqu %ymm9,192(%rsp)
+ vpalignr $8,%ymm3,%ymm4,%ymm7
+ vpsrldq $4,%ymm6,%ymm8
+ vpxor %ymm3,%ymm7,%ymm7
+ vpxor %ymm5,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm7,%ymm8
+ vpslldq $12,%ymm7,%ymm10
+ vpaddd %ymm7,%ymm7,%ymm7
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm7,%ymm7
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm7,%ymm7
+ vpxor %ymm10,%ymm7,%ymm7
+ vpaddd %ymm11,%ymm7,%ymm9
+ vmovdqu %ymm9,224(%rsp)
+ leaq 128(%rsp),%r13
+ jmp .Loop_avx2
+.align 32
+.Loop_avx2:
+ rorxl $2,%ebp,%ebx
+ andnl %edx,%ebp,%edi
+ andl %ecx,%ebp
+ xorl %edi,%ebp
+ jmp .Lalign32_1
+.align 32
+.Lalign32_1:
+ vpalignr $8,%ymm6,%ymm7,%ymm8
+ vpxor %ymm4,%ymm0,%ymm0
+ addl -128(%r13),%esi
+ andnl %ecx,%eax,%edi
+ vpxor %ymm1,%ymm0,%ymm0
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpxor %ymm8,%ymm0,%ymm0
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ vpsrld $30,%ymm0,%ymm8
+ vpslld $2,%ymm0,%ymm0
+ addl -124(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ vpor %ymm8,%ymm0,%ymm0
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -120(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ vpaddd %ymm11,%ymm0,%ymm9
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ vmovdqu %ymm9,256(%rsp)
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -116(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -96(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ vpalignr $8,%ymm7,%ymm0,%ymm8
+ vpxor %ymm5,%ymm1,%ymm1
+ addl -92(%r13),%eax
+ andnl %edx,%ebp,%edi
+ vpxor %ymm2,%ymm1,%ymm1
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ vpxor %ymm8,%ymm1,%ymm1
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ vpsrld $30,%ymm1,%ymm8
+ vpslld $2,%ymm1,%ymm1
+ addl -88(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ vpor %ymm8,%ymm1,%ymm1
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -84(%r13),%edx
+ andnl %ebx,%esi,%edi
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ vmovdqu %ymm9,288(%rsp)
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -64(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -60(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ vpalignr $8,%ymm0,%ymm1,%ymm8
+ vpxor %ymm6,%ymm2,%ymm2
+ addl -56(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ vpxor %ymm3,%ymm2,%ymm2
+ vmovdqu 0(%r14),%ymm11
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpxor %ymm8,%ymm2,%ymm2
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ vpsrld $30,%ymm2,%ymm8
+ vpslld $2,%ymm2,%ymm2
+ addl -52(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ vpor %ymm8,%ymm2,%ymm2
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -32(%r13),%esi
+ andnl %ecx,%eax,%edi
+ vpaddd %ymm11,%ymm2,%ymm9
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ vmovdqu %ymm9,320(%rsp)
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -28(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -24(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ vpalignr $8,%ymm1,%ymm2,%ymm8
+ vpxor %ymm7,%ymm3,%ymm3
+ addl -20(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ vpxor %ymm4,%ymm3,%ymm3
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpxor %ymm8,%ymm3,%ymm3
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ vpsrld $30,%ymm3,%ymm8
+ vpslld $2,%ymm3,%ymm3
+ addl 0(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ vpor %ymm8,%ymm3,%ymm3
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl 4(%r13),%eax
+ andnl %edx,%ebp,%edi
+ vpaddd %ymm11,%ymm3,%ymm9
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ vmovdqu %ymm9,352(%rsp)
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl 8(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl 12(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vpalignr $8,%ymm2,%ymm3,%ymm8
+ vpxor %ymm0,%ymm4,%ymm4
+ addl 32(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpxor %ymm8,%ymm4,%ymm4
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 36(%r13),%ebx
+ vpsrld $30,%ymm4,%ymm8
+ vpslld $2,%ymm4,%ymm4
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vpor %ymm8,%ymm4,%ymm4
+ addl 40(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpaddd %ymm11,%ymm4,%ymm9
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 44(%r13),%eax
+ vmovdqu %ymm9,384(%rsp)
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpalignr $8,%ymm3,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ addl 68(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm6,%ymm5,%ymm5
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ vpxor %ymm8,%ymm5,%ymm5
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 72(%r13),%ecx
+ vpsrld $30,%ymm5,%ymm8
+ vpslld $2,%ymm5,%ymm5
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ vpor %ymm8,%ymm5,%ymm5
+ addl 76(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpaddd %ymm11,%ymm5,%ymm9
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 96(%r13),%ebp
+ vmovdqu %ymm9,416(%rsp)
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 100(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpalignr $8,%ymm4,%ymm5,%ymm8
+ vpxor %ymm2,%ymm6,%ymm6
+ addl 104(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ vpxor %ymm8,%ymm6,%ymm6
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 108(%r13),%edx
+ leaq 256(%r13),%r13
+ vpsrld $30,%ymm6,%ymm8
+ vpslld $2,%ymm6,%ymm6
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vpor %ymm8,%ymm6,%ymm6
+ addl -128(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpaddd %ymm11,%ymm6,%ymm9
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -124(%r13),%ebx
+ vmovdqu %ymm9,448(%rsp)
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -120(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpalignr $8,%ymm5,%ymm6,%ymm8
+ vpxor %ymm3,%ymm7,%ymm7
+ addl -116(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ vpxor %ymm0,%ymm7,%ymm7
+ vmovdqu 32(%r14),%ymm11
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ vpxor %ymm8,%ymm7,%ymm7
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -96(%r13),%esi
+ vpsrld $30,%ymm7,%ymm8
+ vpslld $2,%ymm7,%ymm7
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpor %ymm8,%ymm7,%ymm7
+ addl -92(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpaddd %ymm11,%ymm7,%ymm9
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -88(%r13),%ecx
+ vmovdqu %ymm9,480(%rsp)
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -84(%r13),%ebx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ jmp .Lalign32_2
+.align 32
+.Lalign32_2:
+ vpalignr $8,%ymm6,%ymm7,%ymm8
+ vpxor %ymm4,%ymm0,%ymm0
+ addl -64(%r13),%ebp
+ xorl %esi,%ecx
+ vpxor %ymm1,%ymm0,%ymm0
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ vpxor %ymm8,%ymm0,%ymm0
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ vpsrld $30,%ymm0,%ymm8
+ vpslld $2,%ymm0,%ymm0
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -60(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ vpor %ymm8,%ymm0,%ymm0
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ vpaddd %ymm11,%ymm0,%ymm9
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl -56(%r13),%esi
+ xorl %ecx,%ebp
+ vmovdqu %ymm9,512(%rsp)
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl -52(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl -32(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ vpalignr $8,%ymm7,%ymm0,%ymm8
+ vpxor %ymm5,%ymm1,%ymm1
+ addl -28(%r13),%ebx
+ xorl %eax,%edx
+ vpxor %ymm2,%ymm1,%ymm1
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ vpxor %ymm8,%ymm1,%ymm1
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vpsrld $30,%ymm1,%ymm8
+ vpslld $2,%ymm1,%ymm1
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl -24(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ vpor %ymm8,%ymm1,%ymm1
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -20(%r13),%eax
+ xorl %edx,%ebx
+ vmovdqu %ymm9,544(%rsp)
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 0(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl 4(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ vpalignr $8,%ymm0,%ymm1,%ymm8
+ vpxor %ymm6,%ymm2,%ymm2
+ addl 8(%r13),%ecx
+ xorl %ebp,%esi
+ vpxor %ymm3,%ymm2,%ymm2
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ vpxor %ymm8,%ymm2,%ymm2
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpsrld $30,%ymm2,%ymm8
+ vpslld $2,%ymm2,%ymm2
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 12(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ vpor %ymm8,%ymm2,%ymm2
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vpaddd %ymm11,%ymm2,%ymm9
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 32(%r13),%ebp
+ xorl %esi,%ecx
+ vmovdqu %ymm9,576(%rsp)
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 36(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 40(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ vpalignr $8,%ymm1,%ymm2,%ymm8
+ vpxor %ymm7,%ymm3,%ymm3
+ addl 44(%r13),%edx
+ xorl %ebx,%eax
+ vpxor %ymm4,%ymm3,%ymm3
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm8,%ymm3,%ymm3
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ vpsrld $30,%ymm3,%ymm8
+ vpslld $2,%ymm3,%ymm3
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl 64(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ vpor %ymm8,%ymm3,%ymm3
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpaddd %ymm11,%ymm3,%ymm9
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 68(%r13),%ebx
+ xorl %eax,%edx
+ vmovdqu %ymm9,608(%rsp)
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 72(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 76(%r13),%eax
+ xorl %edx,%ebx
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 100(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 104(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 108(%r13),%ebx
+ leaq 256(%r13),%r13
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -128(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -124(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -120(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -116(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -96(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -92(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -88(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -84(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -60(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -56(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -52(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -32(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -28(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -24(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -20(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ addl %r12d,%edx
+ leaq 128(%r9),%r13
+ leaq 128(%r9),%rdi
+ cmpq %r10,%r13
+ cmovaeq %r9,%r13
+
+
+ addl 0(%r8),%edx
+ addl 4(%r8),%esi
+ addl 8(%r8),%ebp
+ movl %edx,0(%r8)
+ addl 12(%r8),%ebx
+ movl %esi,4(%r8)
+ movl %edx,%eax
+ addl 16(%r8),%ecx
+ movl %ebp,%r12d
+ movl %ebp,8(%r8)
+ movl %ebx,%edx
+
+ movl %ebx,12(%r8)
+ movl %esi,%ebp
+ movl %ecx,16(%r8)
+
+ movl %ecx,%esi
+ movl %r12d,%ecx
+
+
+ cmpq %r10,%r9
+ je .Ldone_avx2
+ vmovdqu 64(%r14),%ymm6
+ cmpq %r10,%rdi
+ ja .Last_avx2
+
+ vmovdqu -64(%rdi),%xmm0
+ vmovdqu -48(%rdi),%xmm1
+ vmovdqu -32(%rdi),%xmm2
+ vmovdqu -16(%rdi),%xmm3
+ vinserti128 $1,0(%r13),%ymm0,%ymm0
+ vinserti128 $1,16(%r13),%ymm1,%ymm1
+ vinserti128 $1,32(%r13),%ymm2,%ymm2
+ vinserti128 $1,48(%r13),%ymm3,%ymm3
+ jmp .Last_avx2
+
+.align 32
+.Last_avx2:
+ leaq 128+16(%rsp),%r13
+ rorxl $2,%ebp,%ebx
+ andnl %edx,%ebp,%edi
+ andl %ecx,%ebp
+ xorl %edi,%ebp
+ subq $-128,%r9
+ addl -128(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -124(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -120(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -116(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -96(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl -92(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -88(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -84(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -64(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -60(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -56(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl -52(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -32(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -28(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -24(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -20(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl 0(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl 4(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl 8(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl 12(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 32(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 36(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 40(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 44(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vmovdqu -64(%r14),%ymm11
+ vpshufb %ymm6,%ymm0,%ymm0
+ addl 68(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 72(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 76(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 96(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 100(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpshufb %ymm6,%ymm1,%ymm1
+ vpaddd %ymm11,%ymm0,%ymm8
+ addl 104(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 108(%r13),%edx
+ leaq 256(%r13),%r13
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -128(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -124(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -120(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vmovdqu %ymm8,0(%rsp)
+ vpshufb %ymm6,%ymm2,%ymm2
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl -116(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -92(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -88(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -84(%r13),%ebx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ vmovdqu %ymm9,32(%rsp)
+ vpshufb %ymm6,%ymm3,%ymm3
+ vpaddd %ymm11,%ymm2,%ymm6
+ addl -64(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -60(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl -56(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl -52(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl -32(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ jmp .Lalign32_3
+.align 32
+.Lalign32_3:
+ vmovdqu %ymm6,64(%rsp)
+ vpaddd %ymm11,%ymm3,%ymm7
+ addl -28(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl -24(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -20(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 0(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl 4(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ vmovdqu %ymm7,96(%rsp)
+ addl 8(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 12(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 32(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 36(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 40(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ vpalignr $8,%ymm0,%ymm1,%ymm4
+ addl 44(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ vpsrldq $4,%ymm3,%ymm8
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpxor %ymm0,%ymm4,%ymm4
+ vpxor %ymm2,%ymm8,%ymm8
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpxor %ymm8,%ymm4,%ymm4
+ andl %edi,%esi
+ addl 64(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ vpsrld $31,%ymm4,%ymm8
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ vpslldq $12,%ymm4,%ymm10
+ vpaddd %ymm4,%ymm4,%ymm4
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm4,%ymm4
+ addl %r12d,%ecx
+ andl %edi,%edx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm4,%ymm4
+ addl 68(%r13),%ebx
+ xorl %eax,%edx
+ vpxor %ymm10,%ymm4,%ymm4
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ vpaddd %ymm11,%ymm4,%ymm9
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vmovdqu %ymm9,128(%rsp)
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 72(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 76(%r13),%eax
+ xorl %edx,%ebx
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpalignr $8,%ymm1,%ymm2,%ymm5
+ addl 96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpsrldq $4,%ymm4,%ymm8
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm3,%ymm8,%ymm8
+ addl 100(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm8,%ymm5,%ymm5
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpsrld $31,%ymm5,%ymm8
+ vmovdqu -32(%r14),%ymm11
+ xorl %ebx,%esi
+ addl 104(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ vpslldq $12,%ymm5,%ymm10
+ vpaddd %ymm5,%ymm5,%ymm5
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm5,%ymm5
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm5,%ymm5
+ xorl %ebp,%edx
+ addl 108(%r13),%ebx
+ leaq 256(%r13),%r13
+ vpxor %ymm10,%ymm5,%ymm5
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpaddd %ymm11,%ymm5,%ymm9
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vmovdqu %ymm9,160(%rsp)
+ addl -128(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpalignr $8,%ymm2,%ymm3,%ymm6
+ addl -124(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ vpsrldq $4,%ymm5,%ymm8
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpxor %ymm2,%ymm6,%ymm6
+ vpxor %ymm4,%ymm8,%ymm8
+ addl -120(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpxor %ymm8,%ymm6,%ymm6
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ vpsrld $31,%ymm6,%ymm8
+ xorl %ecx,%eax
+ addl -116(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpslldq $12,%ymm6,%ymm10
+ vpaddd %ymm6,%ymm6,%ymm6
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm6,%ymm6
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm6,%ymm6
+ xorl %ebx,%esi
+ addl -96(%r13),%ecx
+ vpxor %ymm10,%ymm6,%ymm6
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpaddd %ymm11,%ymm6,%ymm9
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ vmovdqu %ymm9,192(%rsp)
+ addl -92(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vpalignr $8,%ymm3,%ymm4,%ymm7
+ addl -88(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpsrldq $4,%ymm6,%ymm8
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpxor %ymm3,%ymm7,%ymm7
+ vpxor %ymm5,%ymm8,%ymm8
+ addl -84(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ vpxor %ymm8,%ymm7,%ymm7
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ vpsrld $31,%ymm7,%ymm8
+ xorl %edx,%ebp
+ addl -64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpslldq $12,%ymm7,%ymm10
+ vpaddd %ymm7,%ymm7,%ymm7
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm7,%ymm7
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm7,%ymm7
+ xorl %ecx,%eax
+ addl -60(%r13),%edx
+ vpxor %ymm10,%ymm7,%ymm7
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpaddd %ymm11,%ymm7,%ymm9
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vmovdqu %ymm9,224(%rsp)
+ addl -56(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -52(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -32(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -28(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -24(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -20(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ addl %r12d,%edx
+ leaq 128(%rsp),%r13
+
+
+ addl 0(%r8),%edx
+ addl 4(%r8),%esi
+ addl 8(%r8),%ebp
+ movl %edx,0(%r8)
+ addl 12(%r8),%ebx
+ movl %esi,4(%r8)
+ movl %edx,%eax
+ addl 16(%r8),%ecx
+ movl %ebp,%r12d
+ movl %ebp,8(%r8)
+ movl %ebx,%edx
+
+ movl %ebx,12(%r8)
+ movl %esi,%ebp
+ movl %ecx,16(%r8)
+
+ movl %ecx,%esi
+ movl %r12d,%ecx
+
+
+ cmpq %r10,%r9
+ jbe .Loop_avx2
+
+.Ldone_avx2:
+ vzeroupper
+ movq -40(%r11),%r14
+.cfi_restore %r14
+ movq -32(%r11),%r13
+.cfi_restore %r13
+ movq -24(%r11),%r12
+.cfi_restore %r12
+ movq -16(%r11),%rbp
+.cfi_restore %rbp
+ movq -8(%r11),%rbx
+.cfi_restore %rbx
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
.align 64
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999