author    David Benjamin <davidben@google.com>    2017-01-25 14:08:15 -0500
committer David Benjamin <davidben@google.com>    2017-01-25 17:04:27 -0500
commit    f31229be918beb36153746ca75f900569b57e30f (patch)
tree      395fcdd018362039a8ecf63cac6f4723cc1e9964 /mac-x86_64/crypto
parent    b0b45c63bbbf16b7f5ff3cbe3f1d0905108038aa (diff)
download  boringssl-f31229be918beb36153746ca75f900569b57e30f.tar.gz
external/boringssl: Sync to 6d50f475e319de153a43e1dba5a1beca95948c63.
This includes the following changes:
https://boringssl.googlesource.com/boringssl/+log/0726fb76ebe7f422e3c4fb2e25a0064926975770..6d50f475e319de153a43e1dba5a1beca95948c63

This also updates the build files to add the new GTest-based targets and work
with the C++ file in libssl.

Test: cts-tradefed run cts -m CtsLibcoreOkHttpTestCases -a arm64-v8a
Test: cts-tradefed run cts -m CtsLibcoreTestCases -a arm64-v8a
Change-Id: I99718d51c901fe2e2e1e0398fc61fe1e76ccdb3f
Diffstat (limited to 'mac-x86_64/crypto')
-rw-r--r--    mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S    8786
1 file changed, 8786 insertions, 0 deletions
diff --git a/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S b/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S
new file mode 100644
index 00000000..20a78386
--- /dev/null
+++ b/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S
@@ -0,0 +1,8786 @@
+#if defined(__x86_64__)
+.text
+
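+/* Constant pool: the ChaCha20 "expand 32-byte k" words, pshufb masks that
+   implement 32-bit rotations by 8 and 16, block-counter increment vectors for
+   the SSE and AVX2 paths, the Poly1305 clamp mask, and byte masks used when
+   handling partial 16-byte blocks. */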
+.p2align 6
+.chacha20_consts:
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.rol8:
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.rol16:
+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.avx2_init:
+.long 0,0,0,0
+.sse_inc:
+.long 1,0,0,0
+.avx2_inc:
+.long 2,0,0,0,2,0,0,0
+.clamp:
+.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
+.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
+.p2align 4
+.and_masks:
+.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
+
+
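+/* poly_hash_ad_internal: absorb the additional data (%rcx = AD pointer,
+   %r8 = AD length) into the Poly1305 accumulator kept in %r10:%r11:%r12,
+   with a fast path for the 13-byte TLS AAD case. The clamped Poly1305 key r
+   lives at 0(%rbp). */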
+.p2align 6
+poly_hash_ad_internal:
+.cfi_startproc
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r12,%r12
+ cmpq $13,%r8
+ jne hash_ad_loop
+poly_fast_tls_ad:
+
+ movq (%rcx),%r10
+ movq 5(%rcx),%r11
+ shrq $24,%r11
+ movq $1,%r12
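+/* Poly1305 step: multiply the 130-bit accumulator in %r10:%r11:%r12 by the
+   clamped key r at 0(%rbp) and reduce modulo 2^130-5. This schoolbook
+   multiply/reduce sequence is repeated inline throughout the file. */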
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ .byte 0xf3,0xc3
+hash_ad_loop:
+
+ cmpq $16,%r8
+ jb hash_ad_tail
+ addq 0(%rcx),%r10
+ adcq 8+0(%rcx),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq (1*16)(%rcx),%rcx
+ subq $16,%r8
+ jmp hash_ad_loop
+hash_ad_tail:
+ cmpq $0,%r8
+ je 1f
+
+ xorq %r13,%r13
+ xorq %r14,%r14
+ xorq %r15,%r15
+ addq %r8,%rcx
+hash_ad_tail_loop:
+ shldq $8,%r13,%r14
+ shlq $8,%r13
+ movzbq -1(%rcx),%r15
+ xorq %r15,%r13
+ decq %rcx
+ decq %r8
+ jne hash_ad_tail_loop
+
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+1:
+ .byte 0xf3,0xc3
+.cfi_endproc
+
+
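+/* chacha20_poly1305_open: AEAD open (decrypt and compute the Poly1305 tag over
+   the ciphertext). SysV arguments: %rdi = plaintext out, %rsi = ciphertext in,
+   %rdx = length, %rcx = AD, %r8 = AD length, %r9 = key/nonce state; the
+   computed tag is written back through the pointer saved from %r9 on return. */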
+.globl _chacha20_poly1305_open
+.private_extern _chacha20_poly1305_open
+
+.p2align 6
+_chacha20_poly1305_open:
+.cfi_startproc
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+
+
+ pushq %r9
+.cfi_adjust_cfa_offset 8
+ subq $288 + 32,%rsp
+.cfi_adjust_cfa_offset 288 + 32
+.cfi_offset rbp, -16
+.cfi_offset rbx, -24
+.cfi_offset r12, -32
+.cfi_offset r13, -40
+.cfi_offset r14, -48
+.cfi_offset r15, -56
+.cfi_offset %r9, -64
+ leaq 32(%rsp),%rbp
+ andq $-32,%rbp
+ movq %rdx,8+32(%rbp)
+ movq %r8,0+32(%rbp)
+ movq %rdx,%rbx
+
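+/* Take the AVX2 path only when OPENSSL_ia32cap_P reports both AVX2 and BMI2
+   (bits 5 and 8 of the leaf-7 EBX word at offset 8). */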
+ movl _OPENSSL_ia32cap_P+8(%rip),%eax
+ andl $288,%eax
+ xorl $288,%eax
+ jz chacha20_poly1305_open_avx2
+
+1:
+ cmpq $128,%rbx
+ jbe open_sse_128
+
+ movdqa .chacha20_consts(%rip),%xmm0
+ movdqu 0(%r9),%xmm4
+ movdqu 16(%r9),%xmm8
+ movdqu 32(%r9),%xmm12
+ movdqa %xmm12,%xmm7
+
+ movdqa %xmm4,48(%rbp)
+ movdqa %xmm8,64(%rbp)
+ movdqa %xmm12,96(%rbp)
+ movq $10,%r10
+1:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ decq %r10
+ jne 1b
+
+ paddd .chacha20_consts(%rip),%xmm0
+ paddd 48(%rbp),%xmm4
+
+ pand .clamp(%rip),%xmm0
+ movdqa %xmm0,0(%rbp)
+ movdqa %xmm4,16(%rbp)
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
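+/* Main open loop: generate four 64-byte ChaCha20 blocks (256 bytes of
+   keystream) per iteration, interleaving Poly1305 updates over the ciphertext
+   with the ChaCha20 rounds to hide multiplier latency. */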
+open_sse_main_loop:
+ cmpq $256,%rbx
+ jb 2f
+
+ movdqa .chacha20_consts(%rip),%xmm0
+ movdqa 48(%rbp),%xmm4
+ movdqa 64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa %xmm0,%xmm3
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa 96(%rbp),%xmm15
+ paddd .sse_inc(%rip),%xmm15
+ movdqa %xmm15,%xmm14
+ paddd .sse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd .sse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .sse_inc(%rip),%xmm12
+ movdqa %xmm12,96(%rbp)
+ movdqa %xmm13,112(%rbp)
+ movdqa %xmm14,128(%rbp)
+ movdqa %xmm15,144(%rbp)
+
+
+
+ movq $4,%rcx
+ movq %rsi,%r8
+1:
+ movdqa %xmm8,80(%rbp)
+ movdqa .rol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ addq 0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+
+ leaq 16(%r8),%r8
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movdqa .rol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 80(%rbp),%xmm8
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+.byte 102,15,58,15,255,4
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,12
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ movdqa %xmm8,80(%rbp)
+ movdqa .rol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa .rol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 80(%rbp),%xmm8
+.byte 102,15,58,15,255,12
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,4
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ decq %rcx
+ jge 1b
+ addq 0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+ cmpq $-6,%rcx
+ jg 1b
+ paddd .chacha20_consts(%rip),%xmm3
+ paddd 48(%rbp),%xmm7
+ paddd 64(%rbp),%xmm11
+ paddd 144(%rbp),%xmm15
+ paddd .chacha20_consts(%rip),%xmm2
+ paddd 48(%rbp),%xmm6
+ paddd 64(%rbp),%xmm10
+ paddd 128(%rbp),%xmm14
+ paddd .chacha20_consts(%rip),%xmm1
+ paddd 48(%rbp),%xmm5
+ paddd 64(%rbp),%xmm9
+ paddd 112(%rbp),%xmm13
+ paddd .chacha20_consts(%rip),%xmm0
+ paddd 48(%rbp),%xmm4
+ paddd 64(%rbp),%xmm8
+ paddd 96(%rbp),%xmm12
+ movdqa %xmm12,80(%rbp)
+ movdqu 0 + 0(%rsi),%xmm12
+ pxor %xmm3,%xmm12
+ movdqu %xmm12,0 + 0(%rdi)
+ movdqu 16 + 0(%rsi),%xmm12
+ pxor %xmm7,%xmm12
+ movdqu %xmm12,16 + 0(%rdi)
+ movdqu 32 + 0(%rsi),%xmm12
+ pxor %xmm11,%xmm12
+ movdqu %xmm12,32 + 0(%rdi)
+ movdqu 48 + 0(%rsi),%xmm12
+ pxor %xmm15,%xmm12
+ movdqu %xmm12,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 64(%rdi)
+ movdqu %xmm6,16 + 64(%rdi)
+ movdqu %xmm10,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+ movdqu 0 + 128(%rsi),%xmm3
+ movdqu 16 + 128(%rsi),%xmm7
+ movdqu 32 + 128(%rsi),%xmm11
+ movdqu 48 + 128(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 128(%rdi)
+ movdqu %xmm5,16 + 128(%rdi)
+ movdqu %xmm9,32 + 128(%rdi)
+ movdqu %xmm15,48 + 128(%rdi)
+ movdqu 0 + 192(%rsi),%xmm3
+ movdqu 16 + 192(%rsi),%xmm7
+ movdqu 32 + 192(%rsi),%xmm11
+ movdqu 48 + 192(%rsi),%xmm15
+ pxor %xmm3,%xmm0
+ pxor %xmm7,%xmm4
+ pxor %xmm11,%xmm8
+ pxor 80(%rbp),%xmm15
+ movdqu %xmm0,0 + 192(%rdi)
+ movdqu %xmm4,16 + 192(%rdi)
+ movdqu %xmm8,32 + 192(%rdi)
+ movdqu %xmm15,48 + 192(%rdi)
+
+ leaq 256(%rsi),%rsi
+ leaq 256(%rdi),%rdi
+ subq $256,%rbx
+ jmp open_sse_main_loop
+2:
+
+ testq %rbx,%rbx
+ jz open_sse_finalize
+ cmpq $64,%rbx
+ ja 3f
+ movdqa .chacha20_consts(%rip),%xmm0
+ movdqa 48(%rbp),%xmm4
+ movdqa 64(%rbp),%xmm8
+ movdqa 96(%rbp),%xmm12
+ paddd .sse_inc(%rip),%xmm12
+ movdqa %xmm12,96(%rbp)
+
+ xorq %r8,%r8
+ movq %rbx,%rcx
+ cmpq $16,%rcx
+ jb 2f
+1:
+ addq 0(%rsi,%r8), %r10
+ adcq 8+0(%rsi,%r8), %r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ subq $16,%rcx
+2:
+ addq $16,%r8
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ cmpq $16,%rcx
+ jae 1b
+ cmpq $160,%r8
+ jne 2b
+ paddd .chacha20_consts(%rip),%xmm0
+ paddd 48(%rbp),%xmm4
+ paddd 64(%rbp),%xmm8
+ paddd 96(%rbp),%xmm12
+
+ jmp open_sse_tail_64_dec_loop
+3:
+ cmpq $128,%rbx
+ ja 3f
+ movdqa .chacha20_consts(%rip),%xmm0
+ movdqa 48(%rbp),%xmm4
+ movdqa 64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa 96(%rbp),%xmm13
+ paddd .sse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .sse_inc(%rip),%xmm12
+ movdqa %xmm12,96(%rbp)
+ movdqa %xmm13,112(%rbp)
+
+ movq %rbx,%rcx
+ andq $-16,%rcx
+ xorq %r8,%r8
+1:
+ addq 0(%rsi,%r8), %r10
+ adcq 8+0(%rsi,%r8), %r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+2:
+ addq $16,%r8
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+
+ cmpq %rcx,%r8
+ jb 1b
+ cmpq $160,%r8
+ jne 2b
+ paddd .chacha20_consts(%rip),%xmm1
+ paddd 48(%rbp),%xmm5
+ paddd 64(%rbp),%xmm9
+ paddd 112(%rbp),%xmm13
+ paddd .chacha20_consts(%rip),%xmm0
+ paddd 48(%rbp),%xmm4
+ paddd 64(%rbp),%xmm8
+ paddd 96(%rbp),%xmm12
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 0(%rdi)
+ movdqu %xmm5,16 + 0(%rdi)
+ movdqu %xmm9,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+
+ subq $64,%rbx
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+ jmp open_sse_tail_64_dec_loop
+3:
+ cmpq $192,%rbx
+ ja 3f
+ movdqa .chacha20_consts(%rip),%xmm0
+ movdqa 48(%rbp),%xmm4
+ movdqa 64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa 96(%rbp),%xmm14
+ paddd .sse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd .sse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .sse_inc(%rip),%xmm12
+ movdqa %xmm12,96(%rbp)
+ movdqa %xmm13,112(%rbp)
+ movdqa %xmm14,128(%rbp)
+
+ movq %rbx,%rcx
+ movq $160,%r8
+ cmpq $160,%rcx
+ cmovgq %r8,%rcx
+ andq $-16,%rcx
+ xorq %r8,%r8
+1:
+ addq 0(%rsi,%r8), %r10
+ adcq 8+0(%rsi,%r8), %r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+2:
+ addq $16,%r8
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+
+ cmpq %rcx,%r8
+ jb 1b
+ cmpq $160,%r8
+ jne 2b
+ cmpq $176,%rbx
+ jb 1f
+ addq 160(%rsi),%r10
+ adcq 8+160(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ cmpq $192,%rbx
+ jb 1f
+ addq 176(%rsi),%r10
+ adcq 8+176(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+1:
+ paddd .chacha20_consts(%rip),%xmm2
+ paddd 48(%rbp),%xmm6
+ paddd 64(%rbp),%xmm10
+ paddd 128(%rbp),%xmm14
+ paddd .chacha20_consts(%rip),%xmm1
+ paddd 48(%rbp),%xmm5
+ paddd 64(%rbp),%xmm9
+ paddd 112(%rbp),%xmm13
+ paddd .chacha20_consts(%rip),%xmm0
+ paddd 48(%rbp),%xmm4
+ paddd 64(%rbp),%xmm8
+ paddd 96(%rbp),%xmm12
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 0(%rdi)
+ movdqu %xmm6,16 + 0(%rdi)
+ movdqu %xmm10,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 64(%rdi)
+ movdqu %xmm5,16 + 64(%rdi)
+ movdqu %xmm9,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+
+ subq $128,%rbx
+ leaq 128(%rsi),%rsi
+ leaq 128(%rdi),%rdi
+ jmp open_sse_tail_64_dec_loop
+3:
+
+ movdqa .chacha20_consts(%rip),%xmm0
+ movdqa 48(%rbp),%xmm4
+ movdqa 64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa %xmm0,%xmm3
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa 96(%rbp),%xmm15
+ paddd .sse_inc(%rip),%xmm15
+ movdqa %xmm15,%xmm14
+ paddd .sse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd .sse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .sse_inc(%rip),%xmm12
+ movdqa %xmm12,96(%rbp)
+ movdqa %xmm13,112(%rbp)
+ movdqa %xmm14,128(%rbp)
+ movdqa %xmm15,144(%rbp)
+
+ xorq %r8,%r8
+1:
+ addq 0(%rsi,%r8), %r10
+ adcq 8+0(%rsi,%r8), %r11
+ adcq $1,%r12
+ movdqa %xmm11,80(%rbp)
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm4
+ pxor %xmm11,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm4
+ pxor %xmm11,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm5
+ pxor %xmm11,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm5
+ pxor %xmm11,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm6
+ pxor %xmm11,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm6
+ pxor %xmm11,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ movdqa 80(%rbp),%xmm11
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movdqa %xmm9,80(%rbp)
+ paddd %xmm7,%xmm3
+ pxor %xmm3,%xmm15
+ pshufb .rol16(%rip),%xmm15
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm9
+ pslld $12,%xmm9
+ psrld $20,%xmm7
+ pxor %xmm9,%xmm7
+ paddd %xmm7,%xmm3
+ pxor %xmm3,%xmm15
+ pshufb .rol8(%rip),%xmm15
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm9
+ pslld $7,%xmm9
+ psrld $25,%xmm7
+ pxor %xmm9,%xmm7
+.byte 102,15,58,15,255,4
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,12
+ movdqa 80(%rbp),%xmm9
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ movdqa %xmm11,80(%rbp)
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm4
+ pxor %xmm11,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm4
+ pxor %xmm11,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm5
+ pxor %xmm11,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm5
+ pxor %xmm11,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm6
+ pxor %xmm11,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm6
+ pxor %xmm11,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+ movdqa 80(%rbp),%xmm11
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ movdqa %xmm9,80(%rbp)
+ paddd %xmm7,%xmm3
+ pxor %xmm3,%xmm15
+ pshufb .rol16(%rip),%xmm15
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm9
+ pslld $12,%xmm9
+ psrld $20,%xmm7
+ pxor %xmm9,%xmm7
+ paddd %xmm7,%xmm3
+ pxor %xmm3,%xmm15
+ pshufb .rol8(%rip),%xmm15
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm9
+ pslld $7,%xmm9
+ psrld $25,%xmm7
+ pxor %xmm9,%xmm7
+.byte 102,15,58,15,255,12
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,4
+ movdqa 80(%rbp),%xmm9
+
+ addq $16,%r8
+ cmpq $160,%r8
+ jb 1b
+ movq %rbx,%rcx
+ andq $-16,%rcx
+1:
+ addq 0(%rsi,%r8), %r10
+ adcq 8+0(%rsi,%r8), %r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ addq $16,%r8
+ cmpq %rcx,%r8
+ jb 1b
+ paddd .chacha20_consts(%rip),%xmm3
+ paddd 48(%rbp),%xmm7
+ paddd 64(%rbp),%xmm11
+ paddd 144(%rbp),%xmm15
+ paddd .chacha20_consts(%rip),%xmm2
+ paddd 48(%rbp),%xmm6
+ paddd 64(%rbp),%xmm10
+ paddd 128(%rbp),%xmm14
+ paddd .chacha20_consts(%rip),%xmm1
+ paddd 48(%rbp),%xmm5
+ paddd 64(%rbp),%xmm9
+ paddd 112(%rbp),%xmm13
+ paddd .chacha20_consts(%rip),%xmm0
+ paddd 48(%rbp),%xmm4
+ paddd 64(%rbp),%xmm8
+ paddd 96(%rbp),%xmm12
+ movdqa %xmm12,80(%rbp)
+ movdqu 0 + 0(%rsi),%xmm12
+ pxor %xmm3,%xmm12
+ movdqu %xmm12,0 + 0(%rdi)
+ movdqu 16 + 0(%rsi),%xmm12
+ pxor %xmm7,%xmm12
+ movdqu %xmm12,16 + 0(%rdi)
+ movdqu 32 + 0(%rsi),%xmm12
+ pxor %xmm11,%xmm12
+ movdqu %xmm12,32 + 0(%rdi)
+ movdqu 48 + 0(%rsi),%xmm12
+ pxor %xmm15,%xmm12
+ movdqu %xmm12,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 64(%rdi)
+ movdqu %xmm6,16 + 64(%rdi)
+ movdqu %xmm10,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+ movdqu 0 + 128(%rsi),%xmm3
+ movdqu 16 + 128(%rsi),%xmm7
+ movdqu 32 + 128(%rsi),%xmm11
+ movdqu 48 + 128(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 128(%rdi)
+ movdqu %xmm5,16 + 128(%rdi)
+ movdqu %xmm9,32 + 128(%rdi)
+ movdqu %xmm15,48 + 128(%rdi)
+
+ movdqa 80(%rbp),%xmm12
+ subq $192,%rbx
+ leaq 192(%rsi),%rsi
+ leaq 192(%rdi),%rdi
+
+
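+/* Decrypt the remaining data 16 bytes at a time with the leftover keystream
+   held in %xmm0/%xmm4/%xmm8/%xmm12. */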
+open_sse_tail_64_dec_loop:
+ cmpq $16,%rbx
+ jb 1f
+ subq $16,%rbx
+ movdqu (%rsi),%xmm3
+ pxor %xmm3,%xmm0
+ movdqu %xmm0,(%rdi)
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ movdqa %xmm4,%xmm0
+ movdqa %xmm8,%xmm4
+ movdqa %xmm12,%xmm8
+ jmp open_sse_tail_64_dec_loop
+1:
+ movdqa %xmm0,%xmm1
+
+
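+/* Final 1-15 bytes: gather them into an XMM register, capture the ciphertext
+   for the Poly1305 update, XOR with keystream and store byte by byte, then
+   absorb the ciphertext block into the accumulator. */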
+open_sse_tail_16:
+ testq %rbx,%rbx
+ jz open_sse_finalize
+
+
+
+ pxor %xmm3,%xmm3
+ leaq -1(%rsi,%rbx), %rsi
+ movq %rbx,%r8
+2:
+ pslldq $1,%xmm3
+ pinsrb $0,(%rsi),%xmm3
+ subq $1,%rsi
+ subq $1,%r8
+ jnz 2b
+
+3:
+.byte 102,73,15,126,221
+ pextrq $1,%xmm3,%r14
+
+ pxor %xmm1,%xmm3
+
+
+2:
+ pextrb $0,%xmm3,(%rdi)
+ psrldq $1,%xmm3
+ addq $1,%rdi
+ subq $1,%rbx
+ jne 2b
+
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+open_sse_finalize:
+ addq 32(%rbp),%r10
+ adcq 8+32(%rbp),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ movq %r10,%r13
+ movq %r11,%r14
+ movq %r12,%r15
+ subq $-5,%r10
+ sbbq $-1,%r11
+ sbbq $3,%r12
+ cmovcq %r13,%r10
+ cmovcq %r14,%r11
+ cmovcq %r15,%r12
+
+ addq 0+16(%rbp),%r10
+ adcq 8+16(%rbp),%r11
+
+ addq $288 + 32,%rsp
+.cfi_adjust_cfa_offset -(288 + 32)
+ popq %r9
+.cfi_adjust_cfa_offset -8
+ movq %r10,(%r9)
+ movq %r11,8(%r9)
+
+ popq %r15
+.cfi_adjust_cfa_offset -8
+ popq %r14
+.cfi_adjust_cfa_offset -8
+ popq %r13
+.cfi_adjust_cfa_offset -8
+ popq %r12
+.cfi_adjust_cfa_offset -8
+ popq %rbx
+.cfi_adjust_cfa_offset -8
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+ .byte 0xf3,0xc3
+.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
+
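+/* open_sse_128: short-input path (at most 128 bytes). Generates three ChaCha20
+   blocks with plain SSE, uses the first block for the Poly1305 key, then hashes
+   and decrypts 16 bytes per iteration. */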
+open_sse_128:
+ movdqu .chacha20_consts(%rip),%xmm0
+ movdqa %xmm0,%xmm1
+ movdqa %xmm0,%xmm2
+ movdqu 0(%r9),%xmm4
+ movdqa %xmm4,%xmm5
+ movdqa %xmm4,%xmm6
+ movdqu 16(%r9),%xmm8
+ movdqa %xmm8,%xmm9
+ movdqa %xmm8,%xmm10
+ movdqu 32(%r9),%xmm12
+ movdqa %xmm12,%xmm13
+ paddd .sse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm14
+ paddd .sse_inc(%rip),%xmm14
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa %xmm13,%xmm15
+ movq $10,%r10
+1:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+
+ decq %r10
+ jnz 1b
+ paddd .chacha20_consts(%rip),%xmm0
+ paddd .chacha20_consts(%rip),%xmm1
+ paddd .chacha20_consts(%rip),%xmm2
+ paddd %xmm7,%xmm4
+ paddd %xmm7,%xmm5
+ paddd %xmm7,%xmm6
+ paddd %xmm11,%xmm9
+ paddd %xmm11,%xmm10
+ paddd %xmm15,%xmm13
+ paddd .sse_inc(%rip),%xmm15
+ paddd %xmm15,%xmm14
+
+ pand .clamp(%rip),%xmm0
+ movdqa %xmm0,0(%rbp)
+ movdqa %xmm4,16(%rbp)
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+1:
+ cmpq $16,%rbx
+ jb open_sse_tail_16
+ subq $16,%rbx
+ addq 0(%rsi),%r10
+ adcq 8+0(%rsi),%r11
+ adcq $1,%r12
+
+
+ movdqu 0(%rsi),%xmm3
+ pxor %xmm3,%xmm1
+ movdqu %xmm1,0(%rdi)
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ movdqa %xmm5,%xmm1
+ movdqa %xmm9,%xmm5
+ movdqa %xmm13,%xmm9
+ movdqa %xmm2,%xmm13
+ movdqa %xmm6,%xmm2
+ movdqa %xmm10,%xmm6
+ movdqa %xmm14,%xmm10
+ jmp 1b
+ jmp open_sse_tail_16
+
+.cfi_endproc
+
+
+
+
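+/* chacha20_poly1305_seal: AEAD seal (encrypt, then compute the Poly1305 tag
+   over the ciphertext). Same argument layout and stack frame as
+   chacha20_poly1305_open above. */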
+.globl _chacha20_poly1305_seal
+.private_extern _chacha20_poly1305_seal
+
+.p2align 6
+_chacha20_poly1305_seal:
+.cfi_startproc
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+
+
+ pushq %r9
+.cfi_adjust_cfa_offset 8
+ subq $288 + 32,%rsp
+.cfi_adjust_cfa_offset 288 + 32
+.cfi_offset rbp, -16
+.cfi_offset rbx, -24
+.cfi_offset r12, -32
+.cfi_offset r13, -40
+.cfi_offset r14, -48
+.cfi_offset r15, -56
+.cfi_offset %r9, -64
+ leaq 32(%rsp),%rbp
+ andq $-32,%rbp
+ movq %rdx,8+32(%rbp)
+ movq %r8,0+32(%rbp)
+ movq %rdx,%rbx
+
+ movl _OPENSSL_ia32cap_P+8(%rip),%eax
+ andl $288,%eax
+ xorl $288,%eax
+ jz chacha20_poly1305_seal_avx2
+
+ cmpq $128,%rbx
+ jbe seal_sse_128
+
+ movdqa .chacha20_consts(%rip),%xmm0
+ movdqu 0(%r9),%xmm4
+ movdqu 16(%r9),%xmm8
+ movdqu 32(%r9),%xmm12
+ movdqa %xmm0,%xmm1
+ movdqa %xmm0,%xmm2
+ movdqa %xmm0,%xmm3
+ movdqa %xmm4,%xmm5
+ movdqa %xmm4,%xmm6
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm9
+ movdqa %xmm8,%xmm10
+ movdqa %xmm8,%xmm11
+ movdqa %xmm12,%xmm15
+ paddd .sse_inc(%rip),%xmm12
+ movdqa %xmm12,%xmm14
+ paddd .sse_inc(%rip),%xmm12
+ movdqa %xmm12,%xmm13
+ paddd .sse_inc(%rip),%xmm12
+
+ movdqa %xmm4,48(%rbp)
+ movdqa %xmm8,64(%rbp)
+ movdqa %xmm12,96(%rbp)
+ movdqa %xmm13,112(%rbp)
+ movdqa %xmm14,128(%rbp)
+ movdqa %xmm15,144(%rbp)
+ movq $10,%r10
+1:
+ movdqa %xmm8,80(%rbp)
+ movdqa .rol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa .rol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 80(%rbp),%xmm8
+.byte 102,15,58,15,255,4
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,12
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ movdqa %xmm8,80(%rbp)
+ movdqa .rol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa .rol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 80(%rbp),%xmm8
+.byte 102,15,58,15,255,12
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,4
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ decq %r10
+ jnz 1b
+ paddd .chacha20_consts(%rip),%xmm3
+ paddd 48(%rbp),%xmm7
+ paddd 64(%rbp),%xmm11
+ paddd 144(%rbp),%xmm15
+ paddd .chacha20_consts(%rip),%xmm2
+ paddd 48(%rbp),%xmm6
+ paddd 64(%rbp),%xmm10
+ paddd 128(%rbp),%xmm14
+ paddd .chacha20_consts(%rip),%xmm1
+ paddd 48(%rbp),%xmm5
+ paddd 64(%rbp),%xmm9
+ paddd 112(%rbp),%xmm13
+ paddd .chacha20_consts(%rip),%xmm0
+ paddd 48(%rbp),%xmm4
+ paddd 64(%rbp),%xmm8
+ paddd 96(%rbp),%xmm12
+
+
+ pand .clamp(%rip),%xmm3
+ movdqa %xmm3,0(%rbp)
+ movdqa %xmm7,16(%rbp)
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 0(%rdi)
+ movdqu %xmm6,16 + 0(%rdi)
+ movdqu %xmm10,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 64(%rdi)
+ movdqu %xmm5,16 + 64(%rdi)
+ movdqu %xmm9,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+
+ cmpq $192,%rbx
+ ja 1f
+ movq $128,%rcx
+ subq $128,%rbx
+ leaq 128(%rsi),%rsi
+ jmp seal_sse_128_seal_hash
+1:
+ movdqu 0 + 128(%rsi),%xmm3
+ movdqu 16 + 128(%rsi),%xmm7
+ movdqu 32 + 128(%rsi),%xmm11
+ movdqu 48 + 128(%rsi),%xmm15
+ pxor %xmm3,%xmm0
+ pxor %xmm7,%xmm4
+ pxor %xmm11,%xmm8
+ pxor %xmm12,%xmm15
+ movdqu %xmm0,0 + 128(%rdi)
+ movdqu %xmm4,16 + 128(%rdi)
+ movdqu %xmm8,32 + 128(%rdi)
+ movdqu %xmm15,48 + 128(%rdi)
+
+ movq $192,%rcx
+ subq $192,%rbx
+ leaq 192(%rsi),%rsi
+ movq $2,%rcx
+ movq $8,%r8
+ cmpq $64,%rbx
+ jbe seal_sse_tail_64
+ cmpq $128,%rbx
+ jbe seal_sse_tail_128
+ cmpq $192,%rbx
+ jbe seal_sse_tail_192
+
+1:
+ movdqa .chacha20_consts(%rip),%xmm0
+ movdqa 48(%rbp),%xmm4
+ movdqa 64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa %xmm0,%xmm3
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa 96(%rbp),%xmm15
+ paddd .sse_inc(%rip),%xmm15
+ movdqa %xmm15,%xmm14
+ paddd .sse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd .sse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .sse_inc(%rip),%xmm12
+ movdqa %xmm12,96(%rbp)
+ movdqa %xmm13,112(%rbp)
+ movdqa %xmm14,128(%rbp)
+ movdqa %xmm15,144(%rbp)
+
+2:
+ movdqa %xmm8,80(%rbp)
+ movdqa .rol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movdqa .rol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 80(%rbp),%xmm8
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+.byte 102,15,58,15,255,4
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,12
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ movdqa %xmm8,80(%rbp)
+ movdqa .rol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa .rol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 80(%rbp),%xmm8
+.byte 102,15,58,15,255,12
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,4
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ leaq 16(%rdi),%rdi
+ decq %r8
+ jge 2b
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+ decq %rcx
+ jg 2b
+ paddd .chacha20_consts(%rip),%xmm3
+ paddd 48(%rbp),%xmm7
+ paddd 64(%rbp),%xmm11
+ paddd 144(%rbp),%xmm15
+ paddd .chacha20_consts(%rip),%xmm2
+ paddd 48(%rbp),%xmm6
+ paddd 64(%rbp),%xmm10
+ paddd 128(%rbp),%xmm14
+ paddd .chacha20_consts(%rip),%xmm1
+ paddd 48(%rbp),%xmm5
+ paddd 64(%rbp),%xmm9
+ paddd 112(%rbp),%xmm13
+ paddd .chacha20_consts(%rip),%xmm0
+ paddd 48(%rbp),%xmm4
+ paddd 64(%rbp),%xmm8
+ paddd 96(%rbp),%xmm12
+
+ movdqa %xmm14,80(%rbp)
+ movdqa %xmm14,80(%rbp)
+ movdqu 0 + 0(%rsi),%xmm14
+ pxor %xmm3,%xmm14
+ movdqu %xmm14,0 + 0(%rdi)
+ movdqu 16 + 0(%rsi),%xmm14
+ pxor %xmm7,%xmm14
+ movdqu %xmm14,16 + 0(%rdi)
+ movdqu 32 + 0(%rsi),%xmm14
+ pxor %xmm11,%xmm14
+ movdqu %xmm14,32 + 0(%rdi)
+ movdqu 48 + 0(%rsi),%xmm14
+ pxor %xmm15,%xmm14
+ movdqu %xmm14,48 + 0(%rdi)
+
+ movdqa 80(%rbp),%xmm14
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 64(%rdi)
+ movdqu %xmm6,16 + 64(%rdi)
+ movdqu %xmm10,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+ movdqu 0 + 128(%rsi),%xmm3
+ movdqu 16 + 128(%rsi),%xmm7
+ movdqu 32 + 128(%rsi),%xmm11
+ movdqu 48 + 128(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 128(%rdi)
+ movdqu %xmm5,16 + 128(%rdi)
+ movdqu %xmm9,32 + 128(%rdi)
+ movdqu %xmm15,48 + 128(%rdi)
+
+ cmpq $256,%rbx
+ ja 3f
+
+ movq $192,%rcx
+ subq $192,%rbx
+ leaq 192(%rsi),%rsi
+ jmp seal_sse_128_seal_hash
+3:
+ movdqu 0 + 192(%rsi),%xmm3
+ movdqu 16 + 192(%rsi),%xmm7
+ movdqu 32 + 192(%rsi),%xmm11
+ movdqu 48 + 192(%rsi),%xmm15
+ pxor %xmm3,%xmm0
+ pxor %xmm7,%xmm4
+ pxor %xmm11,%xmm8
+ pxor %xmm12,%xmm15
+ movdqu %xmm0,0 + 192(%rdi)
+ movdqu %xmm4,16 + 192(%rdi)
+ movdqu %xmm8,32 + 192(%rdi)
+ movdqu %xmm15,48 + 192(%rdi)
+
+ leaq 256(%rsi),%rsi
+ subq $256,%rbx
+ movq $6,%rcx
+ movq $4,%r8
+ cmpq $192,%rbx
+ jg 1b
+ movq %rbx,%rcx
+ testq %rbx,%rbx
+ je seal_sse_128_seal_hash
+ movq $6,%rcx
+ cmpq $64,%rbx
+ jg 3f
+
+seal_sse_tail_64:
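+# Tail of at most 64 bytes: generate one more ChaCha20 block while hashing
+# previously written ciphertext with Poly1305 between the rounds.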
+ movdqa .chacha20_consts(%rip),%xmm0
+ movdqa 48(%rbp),%xmm4
+ movdqa 64(%rbp),%xmm8
+ movdqa 96(%rbp),%xmm12
+ paddd .sse_inc(%rip),%xmm12
+ movdqa %xmm12,96(%rbp)
+
+1:
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+2:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+ decq %rcx
+ jg 1b
+ decq %r8
+ jge 2b
+ paddd .chacha20_consts(%rip),%xmm0
+ paddd 48(%rbp),%xmm4
+ paddd 64(%rbp),%xmm8
+ paddd 96(%rbp),%xmm12
+
+ jmp seal_sse_128_seal
+3:
+ cmpq $128,%rbx
+ jg 3f
+
+seal_sse_tail_128:
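+# Tail of 65-128 bytes: generate two ChaCha20 blocks; 64 bytes are encrypted
+# here and the remaining keystream stays in registers for seal_sse_128_seal.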
+ movdqa .chacha20_consts(%rip),%xmm0
+ movdqa 48(%rbp),%xmm4
+ movdqa 64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa 96(%rbp),%xmm13
+ paddd .sse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .sse_inc(%rip),%xmm12
+ movdqa %xmm12,96(%rbp)
+ movdqa %xmm13,112(%rbp)
+
+1:
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+2:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+
+ leaq 16(%rdi),%rdi
+ decq %rcx
+ jg 1b
+ decq %r8
+ jge 2b
+ paddd .chacha20_consts(%rip),%xmm1
+ paddd 48(%rbp),%xmm5
+ paddd 64(%rbp),%xmm9
+ paddd 112(%rbp),%xmm13
+ paddd .chacha20_consts(%rip),%xmm0
+ paddd 48(%rbp),%xmm4
+ paddd 64(%rbp),%xmm8
+ paddd 96(%rbp),%xmm12
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 0(%rdi)
+ movdqu %xmm5,16 + 0(%rdi)
+ movdqu %xmm9,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+
+ movq $64,%rcx
+ subq $64,%rbx
+ leaq 64(%rsi),%rsi
+ jmp seal_sse_128_seal_hash
+3:
+
+seal_sse_tail_192:
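+# Tail of 129-192 bytes: generate three ChaCha20 blocks; 128 bytes are
+# encrypted here, the last block's keystream feeds seal_sse_128_seal.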
+ movdqa .chacha20_consts(%rip),%xmm0
+ movdqa 48(%rbp),%xmm4
+ movdqa 64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa 96(%rbp),%xmm14
+ paddd .sse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd .sse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .sse_inc(%rip),%xmm12
+ movdqa %xmm12,96(%rbp)
+ movdqa %xmm13,112(%rbp)
+ movdqa %xmm14,128(%rbp)
+
+1:
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+2:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+
+ leaq 16(%rdi),%rdi
+ decq %rcx
+ jg 1b
+ decq %r8
+ jge 2b
+ paddd .chacha20_consts(%rip),%xmm2
+ paddd 48(%rbp),%xmm6
+ paddd 64(%rbp),%xmm10
+ paddd 128(%rbp),%xmm14
+ paddd .chacha20_consts(%rip),%xmm1
+ paddd 48(%rbp),%xmm5
+ paddd 64(%rbp),%xmm9
+ paddd 112(%rbp),%xmm13
+ paddd .chacha20_consts(%rip),%xmm0
+ paddd 48(%rbp),%xmm4
+ paddd 64(%rbp),%xmm8
+ paddd 96(%rbp),%xmm12
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 0(%rdi)
+ movdqu %xmm6,16 + 0(%rdi)
+ movdqu %xmm10,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 64(%rdi)
+ movdqu %xmm5,16 + 64(%rdi)
+ movdqu %xmm9,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+
+ movq $128,%rcx
+ subq $128,%rbx
+ leaq 128(%rsi),%rsi
+
+seal_sse_128_seal_hash:
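+# Hash any ciphertext still pending (%rcx bytes) in 16-byte Poly1305 blocks
+# before emitting the final output.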
+ cmpq $16,%rcx
+ jb seal_sse_128_seal
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ subq $16,%rcx
+ leaq 16(%rdi),%rdi
+ jmp seal_sse_128_seal_hash
+
+seal_sse_128_seal:
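+# Encrypt 16 bytes at a time: XOR with the keystream in %xmm0, hash the
+# resulting ciphertext, then rotate the next keystream words into %xmm0.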
+ cmpq $16,%rbx
+ jb seal_sse_tail_16
+ subq $16,%rbx
+
+ movdqu 0(%rsi),%xmm3
+ pxor %xmm3,%xmm0
+ movdqu %xmm0,0(%rdi)
+
+ addq 0(%rdi),%r10
+ adcq 8(%rdi),%r11
+ adcq $1,%r12
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm8,%xmm4
+ movdqa %xmm12,%xmm8
+ movdqa %xmm1,%xmm12
+ movdqa %xmm5,%xmm1
+ movdqa %xmm9,%xmm5
+ movdqa %xmm13,%xmm9
+ jmp seal_sse_128_seal
+
+seal_sse_tail_16:
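+# Final 1-15 bytes: XOR them with keystream byte by byte, then zero-pad the
+# last ciphertext block to 16 bytes (via .and_masks) and feed it to Poly1305.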
+ testq %rbx,%rbx
+ jz seal_sse_finalize
+
+ movq %rbx,%r8
+ shlq $4,%r8
+ leaq .and_masks(%rip),%r13
+ movq %rbx,%rcx
+ leaq -1(%rsi,%rbx), %rsi
+ pxor %xmm15,%xmm15
+1:
+ pslldq $1,%xmm15
+ pinsrb $0,(%rsi),%xmm15
+ leaq -1(%rsi),%rsi
+ decq %rcx
+ jne 1b
+
+
+ pxor %xmm0,%xmm15
+
+
+ movq %rbx,%rcx
+ movdqu %xmm15,%xmm0
+2:
+ pextrb $0,%xmm0,(%rdi)
+ psrldq $1,%xmm0
+ addq $1,%rdi
+ subq $1,%rcx
+ jnz 2b
+
+ pand -16(%r13,%r8), %xmm15
+.byte 102,77,15,126,253
+ pextrq $1,%xmm15,%r14
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+seal_sse_finalize:
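+# Absorb the length block (AD length || ciphertext length) kept at 32(%rbp).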
+ addq 32(%rbp),%r10
+ adcq 8+32(%rbp),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
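+# Final reduction: conditionally subtract p = 2^130 - 5, then add the second
+# half of the one-time key ("s") and write the 16-byte tag.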
+ movq %r10,%r13
+ movq %r11,%r14
+ movq %r12,%r15
+ subq $-5,%r10
+ sbbq $-1,%r11
+ sbbq $3,%r12
+ cmovcq %r13,%r10
+ cmovcq %r14,%r11
+ cmovcq %r15,%r12
+
+ addq 0+16(%rbp),%r10
+ adcq 8+16(%rbp),%r11
+
+ addq $288 + 32,%rsp
+.cfi_adjust_cfa_offset -(288 + 32)
+ popq %r9
+.cfi_adjust_cfa_offset -8
+ movq %r10,0(%r9)
+ movq %r11,8(%r9)
+
+ popq %r15
+.cfi_adjust_cfa_offset -8
+ popq %r14
+.cfi_adjust_cfa_offset -8
+ popq %r13
+.cfi_adjust_cfa_offset -8
+ popq %r12
+.cfi_adjust_cfa_offset -8
+ popq %rbx
+.cfi_adjust_cfa_offset -8
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+ .byte 0xf3,0xc3
+.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
+
+seal_sse_128:
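+# Short-input seal path: three ChaCha20 blocks in 128-bit registers. The block
+# at the initial counter is clamped into the Poly1305 key at 0(%rbp)/16(%rbp);
+# the other two supply up to 128 bytes of keystream for seal_sse_128_seal.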
+ movdqu .chacha20_consts(%rip),%xmm0
+ movdqa %xmm0,%xmm1
+ movdqa %xmm0,%xmm2
+ movdqu 0(%r9),%xmm4
+ movdqa %xmm4,%xmm5
+ movdqa %xmm4,%xmm6
+ movdqu 16(%r9),%xmm8
+ movdqa %xmm8,%xmm9
+ movdqa %xmm8,%xmm10
+ movdqu 32(%r9),%xmm14
+ movdqa %xmm14,%xmm12
+ paddd .sse_inc(%rip),%xmm12
+ movdqa %xmm12,%xmm13
+ paddd .sse_inc(%rip),%xmm13
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa %xmm12,%xmm15
+ movq $10,%r10
+1:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .rol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .rol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .rol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+
+ decq %r10
+ jnz 1b
+ paddd .chacha20_consts(%rip),%xmm0
+ paddd .chacha20_consts(%rip),%xmm1
+ paddd .chacha20_consts(%rip),%xmm2
+ paddd %xmm7,%xmm4
+ paddd %xmm7,%xmm5
+ paddd %xmm7,%xmm6
+ paddd %xmm11,%xmm8
+ paddd %xmm11,%xmm9
+ paddd %xmm15,%xmm12
+ paddd .sse_inc(%rip),%xmm15
+ paddd %xmm15,%xmm13
+
+ pand .clamp(%rip),%xmm2
+ movdqa %xmm2,0(%rbp)
+ movdqa %xmm6,16(%rbp)
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+ jmp seal_sse_128_seal
+
+
+
+
+.p2align 6
+chacha20_poly1305_open_avx2:
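+# AVX2 open (decrypt) path: each group of four ymm registers carries two
+# ChaCha20 blocks. The first block supplies the clamped Poly1305 key, the AD
+# is hashed, and ciphertext is always hashed before it is decrypted.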
+ vzeroupper
+ vmovdqa .chacha20_consts(%rip),%ymm0
+ vbroadcasti128 0(%r9),%ymm4
+ vbroadcasti128 16(%r9),%ymm8
+ vbroadcasti128 32(%r9),%ymm12
+ vpaddd .avx2_init(%rip),%ymm12,%ymm12
+ cmpq $192,%rbx
+ jbe open_avx2_192
+ cmpq $320,%rbx
+ jbe open_avx2_320
+
+ vmovdqa %ymm4,64(%rbp)
+ vmovdqa %ymm8,96(%rbp)
+ vmovdqa %ymm12,160(%rbp)
+ movq $10,%r10
+1:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+
+ decq %r10
+ jne 1b
+ vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 64(%rbp),%ymm4,%ymm4
+ vpaddd 96(%rbp),%ymm8,%ymm8
+ vpaddd 160(%rbp),%ymm12,%ymm12
+
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand .clamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+ xorq %rcx,%rcx
+
+1:
+ addq 0(%rsi,%rcx), %r10
+ adcq 8+0(%rsi,%rcx), %r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ addq $16,%rcx
+ cmpq $64,%rcx
+ jne 1b
+
+ vpxor 0(%rsi),%ymm0,%ymm0
+ vpxor 32(%rsi),%ymm4,%ymm4
+ vmovdqu %ymm0,0(%rdi)
+ vmovdqu %ymm4,32(%rdi)
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+ subq $64,%rbx
+1:
+
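+# Main open loop: while at least 512 bytes remain, run four double-block ymm
+# states (eight 64-byte ChaCha20 blocks), hashing three 16-byte Poly1305
+# blocks per double round, then decrypt the 512-byte chunk.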
+ cmpq $512,%rbx
+ jb 3f
+ vmovdqa .chacha20_consts(%rip),%ymm0
+ vmovdqa 64(%rbp),%ymm4
+ vmovdqa 96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa .avx2_inc(%rip),%ymm12
+ vpaddd 160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,256(%rbp)
+ vmovdqa %ymm14,224(%rbp)
+ vmovdqa %ymm13,192(%rbp)
+ vmovdqa %ymm12,160(%rbp)
+
+ xorq %rcx,%rcx
+2:
+ addq 0*8(%rsi,%rcx), %r10
+ adcq 8+0*8(%rsi,%rcx), %r11
+ adcq $1,%r12
+ vmovdqa %ymm8,128(%rbp)
+ vmovdqa .rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ addq 2*8(%rsi,%rcx), %r10
+ adcq 8+2*8(%rsi,%rcx), %r11
+ adcq $1,%r12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,128(%rbp)
+ vmovdqa .rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ addq 4*8(%rsi,%rcx), %r10
+ adcq 8+4*8(%rsi,%rcx), %r11
+ adcq $1,%r12
+
+ leaq 48(%rcx),%rcx
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+ cmpq $60*8,%rcx
+ jne 2b
+ vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 64(%rbp),%ymm7,%ymm7
+ vpaddd 96(%rbp),%ymm11,%ymm11
+ vpaddd 256(%rbp),%ymm15,%ymm15
+ vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 64(%rbp),%ymm6,%ymm6
+ vpaddd 96(%rbp),%ymm10,%ymm10
+ vpaddd 224(%rbp),%ymm14,%ymm14
+ vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpaddd 96(%rbp),%ymm9,%ymm9
+ vpaddd 192(%rbp),%ymm13,%ymm13
+ vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 64(%rbp),%ymm4,%ymm4
+ vpaddd 96(%rbp),%ymm8,%ymm8
+ vpaddd 160(%rbp),%ymm12,%ymm12
+
+ vmovdqa %ymm0,128(%rbp)
+ addq 60*8(%rsi),%r10
+ adcq 8+60*8(%rsi),%r11
+ adcq $1,%r12
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
+ vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vpxor 0+0(%rsi),%ymm0,%ymm0
+ vpxor 32+0(%rsi),%ymm3,%ymm3
+ vpxor 64+0(%rsi),%ymm7,%ymm7
+ vpxor 96+0(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm0,0+0(%rdi)
+ vmovdqu %ymm3,32+0(%rdi)
+ vmovdqu %ymm7,64+0(%rdi)
+ vmovdqu %ymm11,96+0(%rdi)
+
+ vmovdqa 128(%rbp),%ymm0
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm2,%ymm2
+ vpxor 64+128(%rsi),%ymm6,%ymm6
+ vpxor 96+128(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm2,32+128(%rdi)
+ vmovdqu %ymm6,64+128(%rdi)
+ vmovdqu %ymm10,96+128(%rdi)
+ addq 60*8+16(%rsi),%r10
+ adcq 8+60*8+16(%rsi),%r11
+ adcq $1,%r12
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+256(%rsi),%ymm3,%ymm3
+ vpxor 32+256(%rsi),%ymm1,%ymm1
+ vpxor 64+256(%rsi),%ymm5,%ymm5
+ vpxor 96+256(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+256(%rdi)
+ vmovdqu %ymm1,32+256(%rdi)
+ vmovdqu %ymm5,64+256(%rdi)
+ vmovdqu %ymm9,96+256(%rdi)
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm4
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm8
+ vpxor 0+384(%rsi),%ymm3,%ymm3
+ vpxor 32+384(%rsi),%ymm0,%ymm0
+ vpxor 64+384(%rsi),%ymm4,%ymm4
+ vpxor 96+384(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm3,0+384(%rdi)
+ vmovdqu %ymm0,32+384(%rdi)
+ vmovdqu %ymm4,64+384(%rdi)
+ vmovdqu %ymm8,96+384(%rdi)
+
+ leaq 512(%rsi),%rsi
+ leaq 512(%rdi),%rdi
+ subq $512,%rbx
+ jmp 1b
+3:
+ testq %rbx,%rbx
+ vzeroupper
+ je open_sse_finalize
+3:
+ cmpq $128,%rbx
+ ja 3f
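+# Open tail of at most 128 bytes: one double-block ymm state; the remaining
+# full 16-byte ciphertext blocks are hashed while the 10 double rounds run,
+# and open_avx2_tail_loop writes the final bytes.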
+ vmovdqa .chacha20_consts(%rip),%ymm0
+ vmovdqa 64(%rbp),%ymm4
+ vmovdqa 96(%rbp),%ymm8
+ vmovdqa .avx2_inc(%rip),%ymm12
+ vpaddd 160(%rbp),%ymm12,%ymm12
+ vmovdqa %ymm12,160(%rbp)
+
+ xorq %r8,%r8
+ movq %rbx,%rcx
+ andq $-16,%rcx
+ testq %rcx,%rcx
+ je 2f
+1:
+ addq 0*8(%rsi,%r8), %r10
+ adcq 8+0*8(%rsi,%r8), %r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+2:
+ addq $16,%r8
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+
+ cmpq %rcx,%r8
+ jb 1b
+ cmpq $160,%r8
+ jne 2b
+ vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 64(%rbp),%ymm4,%ymm4
+ vpaddd 96(%rbp),%ymm8,%ymm8
+ vpaddd 160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ jmp open_avx2_tail_loop
+3:
+ cmpq $256,%rbx
+ ja 3f
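+# Open tail of 129-256 bytes: two double-block ymm states; Poly1305 keeps
+# pace with the remaining ciphertext, 128 bytes are decrypted here and the
+# rest is left to open_avx2_tail_loop.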
+ vmovdqa .chacha20_consts(%rip),%ymm0
+ vmovdqa 64(%rbp),%ymm4
+ vmovdqa 96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa .avx2_inc(%rip),%ymm12
+ vpaddd 160(%rbp),%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm12,160(%rbp)
+ vmovdqa %ymm13,192(%rbp)
+
+ movq %rbx,128(%rbp)
+ movq %rbx,%rcx
+ subq $128,%rcx
+ shrq $4,%rcx
+ movq $10,%r8
+ cmpq $10,%rcx
+ cmovgq %r8,%rcx
+ movq %rsi,%rbx
+ xorq %r8,%r8
+1:
+ addq 0(%rbx),%r10
+ adcq 8+0(%rbx),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rbx),%rbx
+2:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+
+ incq %r8
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+
+ cmpq %rcx,%r8
+ jb 1b
+ cmpq $10,%r8
+ jne 2b
+ movq %rbx,%r8
+ subq %rsi,%rbx
+ movq %rbx,%rcx
+ movq 128(%rbp),%rbx
+1:
+ addq $16,%rcx
+ cmpq %rbx,%rcx
+ jg 1f
+ addq 0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+ jmp 1b
+1:
+ vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpaddd 96(%rbp),%ymm9,%ymm9
+ vpaddd 192(%rbp),%ymm13,%ymm13
+ vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 64(%rbp),%ymm4,%ymm4
+ vpaddd 96(%rbp),%ymm8,%ymm8
+ vpaddd 160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+0(%rsi),%ymm3,%ymm3
+ vpxor 32+0(%rsi),%ymm1,%ymm1
+ vpxor 64+0(%rsi),%ymm5,%ymm5
+ vpxor 96+0(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+0(%rdi)
+ vmovdqu %ymm1,32+0(%rdi)
+ vmovdqu %ymm5,64+0(%rdi)
+ vmovdqu %ymm9,96+0(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ leaq 128(%rsi),%rsi
+ leaq 128(%rdi),%rdi
+ subq $128,%rbx
+ jmp open_avx2_tail_loop
+3:
+ cmpq $384,%rbx
+ ja 3f
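+# Open tail of 257-384 bytes: three double-block ymm states; 256 bytes are
+# decrypted here, the rest is left to open_avx2_tail_loop.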
+ vmovdqa .chacha20_consts(%rip),%ymm0
+ vmovdqa 64(%rbp),%ymm4
+ vmovdqa 96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa .avx2_inc(%rip),%ymm12
+ vpaddd 160(%rbp),%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm12,160(%rbp)
+ vmovdqa %ymm13,192(%rbp)
+ vmovdqa %ymm14,224(%rbp)
+
+ movq %rbx,128(%rbp)
+ movq %rbx,%rcx
+ subq $256,%rcx
+ shrq $4,%rcx
+ addq $6,%rcx
+ movq $10,%r8
+ cmpq $10,%rcx
+ cmovgq %r8,%rcx
+ movq %rsi,%rbx
+ xorq %r8,%r8
+1:
+ addq 0(%rbx),%r10
+ adcq 8+0(%rbx),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rbx),%rbx
+2:
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ addq 0(%rbx),%r10
+ adcq 8+0(%rbx),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rbx),%rbx
+ incq %r8
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+
+ cmpq %rcx,%r8
+ jb 1b
+ cmpq $10,%r8
+ jne 2b
+ movq %rbx,%r8
+ subq %rsi,%rbx
+ movq %rbx,%rcx
+ movq 128(%rbp),%rbx
+1:
+ addq $16,%rcx
+ cmpq %rbx,%rcx
+ jg 1f
+ addq 0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+ jmp 1b
+1:
+ vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 64(%rbp),%ymm6,%ymm6
+ vpaddd 96(%rbp),%ymm10,%ymm10
+ vpaddd 224(%rbp),%ymm14,%ymm14
+ vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpaddd 96(%rbp),%ymm9,%ymm9
+ vpaddd 192(%rbp),%ymm13,%ymm13
+ vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 64(%rbp),%ymm4,%ymm4
+ vpaddd 96(%rbp),%ymm8,%ymm8
+ vpaddd 160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+0(%rsi),%ymm3,%ymm3
+ vpxor 32+0(%rsi),%ymm2,%ymm2
+ vpxor 64+0(%rsi),%ymm6,%ymm6
+ vpxor 96+0(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+0(%rdi)
+ vmovdqu %ymm2,32+0(%rdi)
+ vmovdqu %ymm6,64+0(%rdi)
+ vmovdqu %ymm10,96+0(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm1,%ymm1
+ vpxor 64+128(%rsi),%ymm5,%ymm5
+ vpxor 96+128(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm1,32+128(%rdi)
+ vmovdqu %ymm5,64+128(%rdi)
+ vmovdqu %ymm9,96+128(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ leaq 256(%rsi),%rsi
+ leaq 256(%rdi),%rdi
+ subq $256,%rbx
+ jmp open_avx2_tail_loop
+3:
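+# Open tail of 385-511 bytes: all four double-block ymm states, with Poly1305
+# interleaved as in the main loop.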
+ vmovdqa .chacha20_consts(%rip),%ymm0
+ vmovdqa 64(%rbp),%ymm4
+ vmovdqa 96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa .avx2_inc(%rip),%ymm12
+ vpaddd 160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,256(%rbp)
+ vmovdqa %ymm14,224(%rbp)
+ vmovdqa %ymm13,192(%rbp)
+ vmovdqa %ymm12,160(%rbp)
+
+ xorq %rcx,%rcx
+ movq %rsi,%r8
+1:
+ addq 0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+2:
+ vmovdqa %ymm8,128(%rbp)
+ vmovdqa .rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .rol8(%rip),%ymm8
+ addq 0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,128(%rbp)
+ addq 16(%r8),%r10
+ adcq 8+16(%r8),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%r8),%r8
+ vmovdqa .rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+ incq %rcx
+ cmpq $4,%rcx
+ jl 1b
+ cmpq $10,%rcx
+ jne 2b
+ movq %rbx,%rcx
+ subq $384,%rcx
+ andq $-16,%rcx
+1:
+ testq %rcx,%rcx
+ je 1f
+ addq 0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+ subq $16,%rcx
+ jmp 1b
+1:
+ vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 64(%rbp),%ymm7,%ymm7
+ vpaddd 96(%rbp),%ymm11,%ymm11
+ vpaddd 256(%rbp),%ymm15,%ymm15
+ vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 64(%rbp),%ymm6,%ymm6
+ vpaddd 96(%rbp),%ymm10,%ymm10
+ vpaddd 224(%rbp),%ymm14,%ymm14
+ vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpaddd 96(%rbp),%ymm9,%ymm9
+ vpaddd 192(%rbp),%ymm13,%ymm13
+ vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 64(%rbp),%ymm4,%ymm4
+ vpaddd 96(%rbp),%ymm8,%ymm8
+ vpaddd 160(%rbp),%ymm12,%ymm12
+
+ vmovdqa %ymm0,128(%rbp)
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
+ vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vpxor 0+0(%rsi),%ymm0,%ymm0
+ vpxor 32+0(%rsi),%ymm3,%ymm3
+ vpxor 64+0(%rsi),%ymm7,%ymm7
+ vpxor 96+0(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm0,0+0(%rdi)
+ vmovdqu %ymm3,32+0(%rdi)
+ vmovdqu %ymm7,64+0(%rdi)
+ vmovdqu %ymm11,96+0(%rdi)
+
+ vmovdqa 128(%rbp),%ymm0
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm2,%ymm2
+ vpxor 64+128(%rsi),%ymm6,%ymm6
+ vpxor 96+128(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm2,32+128(%rdi)
+ vmovdqu %ymm6,64+128(%rdi)
+ vmovdqu %ymm10,96+128(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+256(%rsi),%ymm3,%ymm3
+ vpxor 32+256(%rsi),%ymm1,%ymm1
+ vpxor 64+256(%rsi),%ymm5,%ymm5
+ vpxor 96+256(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+256(%rdi)
+ vmovdqu %ymm1,32+256(%rdi)
+ vmovdqu %ymm5,64+256(%rdi)
+ vmovdqu %ymm9,96+256(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ leaq 384(%rsi),%rsi
+ leaq 384(%rdi),%rdi
+ subq $384,%rbx
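+/* Drain leftover key stream 32 bytes at a time (rotating ymm4/ymm8/ymm12 into ymm0), then fall through to the sub-32-byte tail handling. */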
+open_avx2_tail_loop:
+ cmpq $32,%rbx
+ jb open_avx2_tail
+ subq $32,%rbx
+ vpxor (%rsi),%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi)
+ leaq 32(%rsi),%rsi
+ leaq 32(%rdi),%rdi
+ vmovdqa %ymm4,%ymm0
+ vmovdqa %ymm8,%ymm4
+ vmovdqa %ymm12,%ymm8
+ jmp open_avx2_tail_loop
+open_avx2_tail:
+ cmpq $16,%rbx
+ vmovdqa %xmm0,%xmm1
+ jb 1f
+ subq $16,%rbx
+
+ vpxor (%rsi),%xmm0,%xmm1
+ vmovdqu %xmm1,(%rdi)
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ vperm2i128 $0x11,%ymm0,%ymm0,%ymm0
+ vmovdqa %xmm0,%xmm1
+1:
+ vzeroupper
+ jmp open_sse_tail_16
+
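+/* Short open path: two interleaved ChaCha20 state pairs cover inputs of up to 192 bytes, with the leading 64 bytes of key stream reserved for the Poly1305 key. */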
+open_avx2_192:
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd .avx2_inc(%rip),%ymm12,%ymm13
+ vmovdqa %ymm12,%ymm11
+ vmovdqa %ymm13,%ymm15
+ movq $10,%r10
+1:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+
+ decq %r10
+ jne 1b
+ vpaddd %ymm2,%ymm0,%ymm0
+ vpaddd %ymm2,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpaddd %ymm6,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm8,%ymm8
+ vpaddd %ymm10,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm13,%ymm13
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand .clamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
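+/* Common short-input finish: hash the additional data, then hash and decrypt 32 bytes per iteration using the key stream prepared above. */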
+open_avx2_short:
+ movq %r8,%r8
+ call poly_hash_ad_internal
+open_avx2_hash_and_xor_loop:
+ cmpq $32,%rbx
+ jb open_avx2_short_tail_32
+ subq $32,%rbx
+ addq 0(%rsi),%r10
+ adcq 8+0(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ addq 16(%rsi),%r10
+ adcq 8+16(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ vpxor (%rsi),%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi)
+ leaq 32(%rsi),%rsi
+ leaq 32(%rdi),%rdi
+
+ vmovdqa %ymm4,%ymm0
+ vmovdqa %ymm8,%ymm4
+ vmovdqa %ymm12,%ymm8
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm5,%ymm1
+ vmovdqa %ymm9,%ymm5
+ vmovdqa %ymm13,%ymm9
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm6,%ymm2
+ jmp open_avx2_hash_and_xor_loop
+open_avx2_short_tail_32:
+ cmpq $16,%rbx
+ vmovdqa %xmm0,%xmm1
+ jb 1f
+ subq $16,%rbx
+ addq 0(%rsi),%r10
+ adcq 8+0(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ vpxor (%rsi),%xmm0,%xmm3
+ vmovdqu %xmm3,(%rdi)
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ vextracti128 $1,%ymm0,%xmm1
+1:
+ vzeroupper
+ jmp open_sse_tail_16
+
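+/* Short open path: three interleaved ChaCha20 states cover inputs of up to 320 bytes before jumping to open_avx2_short. */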
+open_avx2_320:
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd .avx2_inc(%rip),%ymm12,%ymm13
+ vpaddd .avx2_inc(%rip),%ymm13,%ymm14
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa %ymm12,160(%rbp)
+ vmovdqa %ymm13,192(%rbp)
+ vmovdqa %ymm14,224(%rbp)
+ movq $10,%r10
+1:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+
+ decq %r10
+ jne 1b
+ vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpaddd %ymm11,%ymm8,%ymm8
+ vpaddd %ymm11,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm10,%ymm10
+ vpaddd 160(%rbp),%ymm12,%ymm12
+ vpaddd 192(%rbp),%ymm13,%ymm13
+ vpaddd 224(%rbp),%ymm14,%ymm14
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand .clamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm9
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm13
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm6
+ jmp open_avx2_short
+
+
+
+
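+/* AVX2 seal path: expand the key and nonce into the ChaCha20 state, dispatch on the plaintext length (192/320-byte short paths, otherwise the bulk loop), and clamp the leading key-stream bytes as the Poly1305 key. */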
+.p2align 6
+chacha20_poly1305_seal_avx2:
+ vzeroupper
+ vmovdqa .chacha20_consts(%rip),%ymm0
+ vbroadcasti128 0(%r9),%ymm4
+ vbroadcasti128 16(%r9),%ymm8
+ vbroadcasti128 32(%r9),%ymm12
+ vpaddd .avx2_init(%rip),%ymm12,%ymm12
+ cmpq $192,%rbx
+ jbe seal_avx2_192
+ cmpq $320,%rbx
+ jbe seal_avx2_320
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm4,64(%rbp)
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm8,%ymm11
+ vmovdqa %ymm8,96(%rbp)
+ vmovdqa %ymm12,%ymm15
+ vpaddd .avx2_inc(%rip),%ymm15,%ymm14
+ vpaddd .avx2_inc(%rip),%ymm14,%ymm13
+ vpaddd .avx2_inc(%rip),%ymm13,%ymm12
+ vmovdqa %ymm12,160(%rbp)
+ vmovdqa %ymm13,192(%rbp)
+ vmovdqa %ymm14,224(%rbp)
+ vmovdqa %ymm15,256(%rbp)
+ movq $10,%r10
+1:
+ vmovdqa %ymm8,128(%rbp)
+ vmovdqa .rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,128(%rbp)
+ vmovdqa .rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+ decq %r10
+ jnz 1b
+ vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 64(%rbp),%ymm7,%ymm7
+ vpaddd 96(%rbp),%ymm11,%ymm11
+ vpaddd 256(%rbp),%ymm15,%ymm15
+ vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 64(%rbp),%ymm6,%ymm6
+ vpaddd 96(%rbp),%ymm10,%ymm10
+ vpaddd 224(%rbp),%ymm14,%ymm14
+ vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpaddd 96(%rbp),%ymm9,%ymm9
+ vpaddd 192(%rbp),%ymm13,%ymm13
+ vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 64(%rbp),%ymm4,%ymm4
+ vpaddd 96(%rbp),%ymm8,%ymm8
+ vpaddd 160(%rbp),%ymm12,%ymm12
+
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm15
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm3
+ vpand .clamp(%rip),%ymm15,%ymm15
+ vmovdqa %ymm15,0(%rbp)
+ movq %r8,%r8
+ call poly_hash_ad_internal
+
+ vpxor 0(%rsi),%ymm3,%ymm3
+ vpxor 32(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm3,0(%rdi)
+ vmovdqu %ymm11,32(%rdi)
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm15
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+64(%rsi),%ymm15,%ymm15
+ vpxor 32+64(%rsi),%ymm2,%ymm2
+ vpxor 64+64(%rsi),%ymm6,%ymm6
+ vpxor 96+64(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm15,0+64(%rdi)
+ vmovdqu %ymm2,32+64(%rdi)
+ vmovdqu %ymm6,64+64(%rdi)
+ vmovdqu %ymm10,96+64(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm15
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+192(%rsi),%ymm15,%ymm15
+ vpxor 32+192(%rsi),%ymm1,%ymm1
+ vpxor 64+192(%rsi),%ymm5,%ymm5
+ vpxor 96+192(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm15,0+192(%rdi)
+ vmovdqu %ymm1,32+192(%rdi)
+ vmovdqu %ymm5,64+192(%rdi)
+ vmovdqu %ymm9,96+192(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm15
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm15,%ymm8
+
+ leaq 320(%rsi),%rsi
+ subq $320,%rbx
+ movq $320,%rcx
+ cmpq $128,%rbx
+ jbe seal_avx2_hash
+ vpxor 0(%rsi),%ymm0,%ymm0
+ vpxor 32(%rsi),%ymm4,%ymm4
+ vpxor 64(%rsi),%ymm8,%ymm8
+ vpxor 96(%rsi),%ymm12,%ymm12
+ vmovdqu %ymm0,320(%rdi)
+ vmovdqu %ymm4,352(%rdi)
+ vmovdqu %ymm8,384(%rdi)
+ vmovdqu %ymm12,416(%rdi)
+ leaq 128(%rsi),%rsi
+ subq $128,%rbx
+ movq $8,%rcx
+ movq $2,%r8
+ cmpq $128,%rbx
+ jbe seal_avx2_tail_128
+ cmpq $256,%rbx
+ jbe seal_avx2_tail_256
+ cmpq $384,%rbx
+ jbe seal_avx2_tail_384
+ cmpq $512,%rbx
+ jbe seal_avx2_tail_512
+ vmovdqa .chacha20_consts(%rip),%ymm0
+ vmovdqa 64(%rbp),%ymm4
+ vmovdqa 96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa .avx2_inc(%rip),%ymm12
+ vpaddd 160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,256(%rbp)
+ vmovdqa %ymm14,224(%rbp)
+ vmovdqa %ymm13,192(%rbp)
+ vmovdqa %ymm12,160(%rbp)
+ vmovdqa %ymm8,128(%rbp)
+ vmovdqa .rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,128(%rbp)
+ vmovdqa .rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,128(%rbp)
+ vmovdqa .rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+
+ subq $16,%rdi
+ movq $9,%rcx
+ jmp 4f
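+/* Bulk seal loop: eight ChaCha20 blocks (512 bytes) per iteration across four ymm state sets, with the Poly1305 update over previously written ciphertext interleaved into the rounds. */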
+1:
+ vmovdqa .chacha20_consts(%rip),%ymm0
+ vmovdqa 64(%rbp),%ymm4
+ vmovdqa 96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa .avx2_inc(%rip),%ymm12
+ vpaddd 160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,256(%rbp)
+ vmovdqa %ymm14,224(%rbp)
+ vmovdqa %ymm13,192(%rbp)
+ vmovdqa %ymm12,160(%rbp)
+
+ movq $10,%rcx
+2:
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ vmovdqa %ymm8,128(%rbp)
+ vmovdqa .rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+4:
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ addq 16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,128(%rbp)
+ vmovdqa .rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ addq 32(%rdi),%r10
+ adcq 8+32(%rdi),%r11
+ adcq $1,%r12
+
+ leaq 48(%rdi),%rdi
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+ decq %rcx
+ jne 2b
+ vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 64(%rbp),%ymm7,%ymm7
+ vpaddd 96(%rbp),%ymm11,%ymm11
+ vpaddd 256(%rbp),%ymm15,%ymm15
+ vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 64(%rbp),%ymm6,%ymm6
+ vpaddd 96(%rbp),%ymm10,%ymm10
+ vpaddd 224(%rbp),%ymm14,%ymm14
+ vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpaddd 96(%rbp),%ymm9,%ymm9
+ vpaddd 192(%rbp),%ymm13,%ymm13
+ vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 64(%rbp),%ymm4,%ymm4
+ vpaddd 96(%rbp),%ymm8,%ymm8
+ vpaddd 160(%rbp),%ymm12,%ymm12
+
+ leaq 32(%rdi),%rdi
+ vmovdqa %ymm0,128(%rbp)
+ addq -32(%rdi),%r10
+ adcq 8+-32(%rdi),%r11
+ adcq $1,%r12
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
+ vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vpxor 0+0(%rsi),%ymm0,%ymm0
+ vpxor 32+0(%rsi),%ymm3,%ymm3
+ vpxor 64+0(%rsi),%ymm7,%ymm7
+ vpxor 96+0(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm0,0+0(%rdi)
+ vmovdqu %ymm3,32+0(%rdi)
+ vmovdqu %ymm7,64+0(%rdi)
+ vmovdqu %ymm11,96+0(%rdi)
+
+ vmovdqa 128(%rbp),%ymm0
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm2,%ymm2
+ vpxor 64+128(%rsi),%ymm6,%ymm6
+ vpxor 96+128(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm2,32+128(%rdi)
+ vmovdqu %ymm6,64+128(%rdi)
+ vmovdqu %ymm10,96+128(%rdi)
+ addq -16(%rdi),%r10
+ adcq 8+-16(%rdi),%r11
+ adcq $1,%r12
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+256(%rsi),%ymm3,%ymm3
+ vpxor 32+256(%rsi),%ymm1,%ymm1
+ vpxor 64+256(%rsi),%ymm5,%ymm5
+ vpxor 96+256(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+256(%rdi)
+ vmovdqu %ymm1,32+256(%rdi)
+ vmovdqu %ymm5,64+256(%rdi)
+ vmovdqu %ymm9,96+256(%rdi)
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm4
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm8
+ vpxor 0+384(%rsi),%ymm3,%ymm3
+ vpxor 32+384(%rsi),%ymm0,%ymm0
+ vpxor 64+384(%rsi),%ymm4,%ymm4
+ vpxor 96+384(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm3,0+384(%rdi)
+ vmovdqu %ymm0,32+384(%rdi)
+ vmovdqu %ymm4,64+384(%rdi)
+ vmovdqu %ymm8,96+384(%rdi)
+
+ leaq 512(%rsi),%rsi
+ subq $512,%rbx
+ cmpq $512,%rbx
+ jg 1b
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ addq 16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ movq $10,%rcx
+ xorq %r8,%r8
+ cmpq $128,%rbx
+ ja 3f
+
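+/* Seal tail, at most 128 bytes remain: one ChaCha20 state set, still hashing the ciphertext already written while the rounds run. */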
+seal_avx2_tail_128:
+ vmovdqa .chacha20_consts(%rip),%ymm0
+ vmovdqa 64(%rbp),%ymm4
+ vmovdqa 96(%rbp),%ymm8
+ vmovdqa .avx2_inc(%rip),%ymm12
+ vpaddd 160(%rbp),%ymm12,%ymm12
+ vmovdqa %ymm12,160(%rbp)
+
+1:
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+2:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ addq 16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ decq %rcx
+ jg 1b
+ decq %r8
+ jge 2b
+ vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 64(%rbp),%ymm4,%ymm4
+ vpaddd 96(%rbp),%ymm8,%ymm8
+ vpaddd 160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ jmp seal_avx2_short_loop
+3:
+ cmpq $256,%rbx
+ ja 3f
+
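+/* Seal tail, at most 256 bytes remain: two ChaCha20 state sets, Poly1305 interleaved as above. */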
+seal_avx2_tail_256:
+ vmovdqa .chacha20_consts(%rip),%ymm0
+ vmovdqa 64(%rbp),%ymm4
+ vmovdqa 96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa .avx2_inc(%rip),%ymm12
+ vpaddd 160(%rbp),%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm12,160(%rbp)
+ vmovdqa %ymm13,192(%rbp)
+
+1:
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+2:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ addq 16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ decq %rcx
+ jg 1b
+ decq %r8
+ jge 2b
+ vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpaddd 96(%rbp),%ymm9,%ymm9
+ vpaddd 192(%rbp),%ymm13,%ymm13
+ vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 64(%rbp),%ymm4,%ymm4
+ vpaddd 96(%rbp),%ymm8,%ymm8
+ vpaddd 160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+0(%rsi),%ymm3,%ymm3
+ vpxor 32+0(%rsi),%ymm1,%ymm1
+ vpxor 64+0(%rsi),%ymm5,%ymm5
+ vpxor 96+0(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+0(%rdi)
+ vmovdqu %ymm1,32+0(%rdi)
+ vmovdqu %ymm5,64+0(%rdi)
+ vmovdqu %ymm9,96+0(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ movq $128,%rcx
+ leaq 128(%rsi),%rsi
+ subq $128,%rbx
+ jmp seal_avx2_hash
+3:
+ cmpq $384,%rbx
+ ja seal_avx2_tail_512
+
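+/* Seal tail, at most 384 bytes remain: three ChaCha20 state sets. */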
+seal_avx2_tail_384:
+ vmovdqa .chacha20_consts(%rip),%ymm0
+ vmovdqa 64(%rbp),%ymm4
+ vmovdqa 96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa .avx2_inc(%rip),%ymm12
+ vpaddd 160(%rbp),%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm12,160(%rbp)
+ vmovdqa %ymm13,192(%rbp)
+ vmovdqa %ymm14,224(%rbp)
+
+1:
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+2:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ addq 16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+
+ leaq 32(%rdi),%rdi
+ decq %rcx
+ jg 1b
+ decq %r8
+ jge 2b
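+# Rounds finished: feed the saved initial state back in (key rows at 64/96(%rbp),
+# per-state counters at 160/192/224(%rbp)), interleave the lanes with vperm2i128,
+# and XOR/write 256 bytes; the last state's keystream stays queued in
+# %ymm0/%ymm4/%ymm8/%ymm12 for seal_avx2_hash and the short loop.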
+ vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 64(%rbp),%ymm6,%ymm6
+ vpaddd 96(%rbp),%ymm10,%ymm10
+ vpaddd 224(%rbp),%ymm14,%ymm14
+ vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpaddd 96(%rbp),%ymm9,%ymm9
+ vpaddd 192(%rbp),%ymm13,%ymm13
+ vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 64(%rbp),%ymm4,%ymm4
+ vpaddd 96(%rbp),%ymm8,%ymm8
+ vpaddd 160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+0(%rsi),%ymm3,%ymm3
+ vpxor 32+0(%rsi),%ymm2,%ymm2
+ vpxor 64+0(%rsi),%ymm6,%ymm6
+ vpxor 96+0(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+0(%rdi)
+ vmovdqu %ymm2,32+0(%rdi)
+ vmovdqu %ymm6,64+0(%rdi)
+ vmovdqu %ymm10,96+0(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm1,%ymm1
+ vpxor 64+128(%rsi),%ymm5,%ymm5
+ vpxor 96+128(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm1,32+128(%rdi)
+ vmovdqu %ymm5,64+128(%rdi)
+ vmovdqu %ymm9,96+128(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ movq $256,%rcx
+ leaq 256(%rsi),%rsi
+ subq $256,%rbx
+ jmp seal_avx2_hash
+
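+# seal_avx2_tail_512: last chunk of at most 512 bytes.  Rebuild four states from the
+# saved key rows at 64/96(%rbp), bump the block counters by two per state with
+# .avx2_inc, and stash each state's counter row at 160-256(%rbp) for the final add.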
+seal_avx2_tail_512:
+ vmovdqa .chacha20_consts(%rip),%ymm0
+ vmovdqa 64(%rbp),%ymm4
+ vmovdqa 96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa .avx2_inc(%rip),%ymm12
+ vpaddd 160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,256(%rbp)
+ vmovdqa %ymm14,224(%rbp)
+ vmovdqa %ymm13,192(%rbp)
+ vmovdqa %ymm12,160(%rbp)
+
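+# 1: absorbs one extra 16-byte Poly1305 block (BMI2 mulxq form, multiplier in %rdx)
+# before falling through to the round body at 2:; the decq/jg/jge pair at the bottom
+# runs %rcx passes through 1: and the remaining passes through 2: only.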
+1:
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+2:
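+# Four-state double round.  %ymm8 is spilled to 128(%rbp) so its register can hold
+# the .rol16/.rol8 shuffle masks and rotate scratch; Poly1305 block updates on the
+# ciphertext at %rdi are threaded between the vector instructions.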
+ vmovdqa %ymm8,128(%rbp)
+ vmovdqa .rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,128(%rbp)
+ vmovdqa .rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ addq 16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ vmovdqa .rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+
+
+
+
+
+
+
+
+
+
+
+ addq %rax,%r15
+ adcq %rdx,%r9
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ decq %rcx
+ jg 1b
+ decq %r8
+ jge 2b
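+# Rounds finished for the 512-byte tail: feed all four states forward, write 384
+# bytes of ciphertext, and keep the last state's keystream in %ymm0/%ymm4/%ymm8/%ymm12;
+# %rcx = 384 bytes of fresh ciphertext are left for seal_avx2_hash to absorb.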
+ vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 64(%rbp),%ymm7,%ymm7
+ vpaddd 96(%rbp),%ymm11,%ymm11
+ vpaddd 256(%rbp),%ymm15,%ymm15
+ vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 64(%rbp),%ymm6,%ymm6
+ vpaddd 96(%rbp),%ymm10,%ymm10
+ vpaddd 224(%rbp),%ymm14,%ymm14
+ vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpaddd 96(%rbp),%ymm9,%ymm9
+ vpaddd 192(%rbp),%ymm13,%ymm13
+ vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 64(%rbp),%ymm4,%ymm4
+ vpaddd 96(%rbp),%ymm8,%ymm8
+ vpaddd 160(%rbp),%ymm12,%ymm12
+
+ vmovdqa %ymm0,128(%rbp)
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
+ vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vpxor 0+0(%rsi),%ymm0,%ymm0
+ vpxor 32+0(%rsi),%ymm3,%ymm3
+ vpxor 64+0(%rsi),%ymm7,%ymm7
+ vpxor 96+0(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm0,0+0(%rdi)
+ vmovdqu %ymm3,32+0(%rdi)
+ vmovdqu %ymm7,64+0(%rdi)
+ vmovdqu %ymm11,96+0(%rdi)
+
+ vmovdqa 128(%rbp),%ymm0
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm2,%ymm2
+ vpxor 64+128(%rsi),%ymm6,%ymm6
+ vpxor 96+128(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm2,32+128(%rdi)
+ vmovdqu %ymm6,64+128(%rdi)
+ vmovdqu %ymm10,96+128(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+256(%rsi),%ymm3,%ymm3
+ vpxor 32+256(%rsi),%ymm1,%ymm1
+ vpxor 64+256(%rsi),%ymm5,%ymm5
+ vpxor 96+256(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+256(%rdi)
+ vmovdqu %ymm1,32+256(%rdi)
+ vmovdqu %ymm5,64+256(%rdi)
+ vmovdqu %ymm9,96+256(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ movq $384,%rcx
+ leaq 384(%rsi),%rsi
+ subq $384,%rbx
+ jmp seal_avx2_hash
+
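+# seal_avx2_320: short-input path with three states (six 64-byte blocks of keystream).
+# Ten double-round iterations give the full 20 ChaCha20 rounds; the first 32 keystream
+# bytes are clamped into the Poly1305 key at 0(%rbp) and the rest is queued for
+# seal_avx2_short.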
+seal_avx2_320:
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd .avx2_inc(%rip),%ymm12,%ymm13
+ vpaddd .avx2_inc(%rip),%ymm13,%ymm14
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa %ymm12,160(%rbp)
+ vmovdqa %ymm13,192(%rbp)
+ vmovdqa %ymm14,224(%rbp)
+ movq $10,%r10
+1:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+
+ decq %r10
+ jne 1b
+ vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpaddd %ymm11,%ymm8,%ymm8
+ vpaddd %ymm11,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm10,%ymm10
+ vpaddd 160(%rbp),%ymm12,%ymm12
+ vpaddd 192(%rbp),%ymm13,%ymm13
+ vpaddd 224(%rbp),%ymm14,%ymm14
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand .clamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm9
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm13
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm6
+ jmp seal_avx2_short
+
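+# seal_avx2_192: same scheme with two states (four blocks of keystream); the original
+# counters are kept in %ymm11/%ymm15 for the feed-forward add after the rounds.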
+seal_avx2_192:
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd .avx2_inc(%rip),%ymm12,%ymm13
+ vmovdqa %ymm12,%ymm11
+ vmovdqa %ymm13,%ymm15
+ movq $10,%r10
+1:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+
+ decq %r10
+ jne 1b
+ vpaddd %ymm2,%ymm0,%ymm0
+ vpaddd %ymm2,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpaddd %ymm6,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm8,%ymm8
+ vpaddd %ymm10,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm13,%ymm13
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand .clamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
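+# seal_avx2_short: hash the additional data, then enter the encrypt-and-hash loop
+# with no ciphertext pending (%rcx = 0).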
+seal_avx2_short:
+ movq %r8,%r8
+ call poly_hash_ad_internal
+ xorq %rcx,%rcx
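+# seal_avx2_hash: absorb %rcx bytes of already-written ciphertext at %rdi into the
+# Poly1305 state, one 16-byte block per pass, before producing more output.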
+seal_avx2_hash:
+ cmpq $16,%rcx
+ jb seal_avx2_short_loop
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ subq $16,%rcx
+ addq $16,%rdi
+ jmp seal_avx2_hash
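+# seal_avx2_short_loop: while at least 32 bytes remain, XOR them with the queued
+# keystream in %ymm0, write the ciphertext, absorb both 16-byte halves, then rotate
+# the register queue (%ymm4 -> %ymm0, %ymm8 -> %ymm4, ...) to expose the next 32 bytes.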
+seal_avx2_short_loop:
+ cmpq $32,%rbx
+ jb seal_avx2_short_tail
+ subq $32,%rbx
+
+ vpxor (%rsi),%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi)
+ leaq 32(%rsi),%rsi
+
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ addq 16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+
+ vmovdqa %ymm4,%ymm0
+ vmovdqa %ymm8,%ymm4
+ vmovdqa %ymm12,%ymm8
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm5,%ymm1
+ vmovdqa %ymm9,%ymm5
+ vmovdqa %ymm13,%ymm9
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm6,%ymm2
+ jmp seal_avx2_short_loop
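+# seal_avx2_short_tail: if 16-31 bytes remain, encrypt 16 with the low lane (%xmm0),
+# absorb that block, and pull the high lane down with vextracti128 for whatever follows.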
+seal_avx2_short_tail:
+ cmpq $16,%rbx
+ jb 1f
+ subq $16,%rbx
+ vpxor (%rsi),%xmm0,%xmm3
+ vmovdqu %xmm3,(%rdi)
+ leaq 16(%rsi),%rsi
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+ vextracti128 $1,%ymm0,%xmm0
+1:
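+# Fewer than 16 bytes remain: drop the AVX upper state and let the SSE tail code
+# finish the final partial block and the authentication step.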
+ vzeroupper
+ jmp seal_sse_tail_16
+.cfi_endproc
+#endif