From f31229be918beb36153746ca75f900569b57e30f Mon Sep 17 00:00:00 2001
From: David Benjamin
Date: Wed, 25 Jan 2017 14:08:15 -0500
Subject: external/boringssl: Sync to 6d50f475e319de153a43e1dba5a1beca95948c63.

This includes the following changes:

https://boringssl.googlesource.com/boringssl/+log/0726fb76ebe7f422e3c4fb2e25a0064926975770..6d50f475e319de153a43e1dba5a1beca95948c63

This also updates the build files to add the new GTest-based targets and
work with the C++ file in libssl.

Test: cts-tradefed run cts -m CtsLibcoreOkHttpTestCases -a arm64-v8a
Test: cts-tradefed run cts -m CtsLibcoreTestCases -a arm64-v8a
Change-Id: I99718d51c901fe2e2e1e0398fc61fe1e76ccdb3f
---
 .../crypto/cipher/chacha20_poly1305_x86_64.S | 8786 ++++++++++++++++++++
 1 file changed, 8786 insertions(+)
 create mode 100644 mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S

(limited to 'mac-x86_64/crypto')

diff --git a/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S b/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S
new file mode 100644
index 00000000..20a78386
--- /dev/null
+++ b/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S
@@ -0,0 +1,8786 @@
+#if defined(__x86_64__)
+.text
+
+.p2align 6
+.chacha20_consts:
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.rol8:
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.rol16:
+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.avx2_init:
+.long 0,0,0,0
+.sse_inc:
+.long 1,0,0,0
+.avx2_inc:
+.long 2,0,0,0,2,0,0,0
+.clamp:
+.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
+.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
+.p2align 4
+.and_masks:
+.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
+
+
+.p2align 6
+poly_hash_ad_internal:
+.cfi_startproc
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r12,%r12
+ cmpq $13,%r8
+ jne hash_ad_loop
+poly_fast_tls_ad:
+
+ movq (%rcx),%r10
+ movq 5(%rcx),%r11
+ shrq $24,%r11
+ movq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq
$0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + .byte 0xf3,0xc3 +hash_ad_loop: + + cmpq $16,%r8 + jb hash_ad_tail + addq 0(%rcx),%r10 + adcq 8+0(%rcx),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq (1*16)(%rcx),%rcx + subq $16,%r8 + jmp hash_ad_loop +hash_ad_tail: + cmpq $0,%r8 + je 1f + + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + addq %r8,%rcx +hash_ad_tail_loop: + shldq $8,%r13,%r14 + shlq $8,%r13 + movzbq -1(%rcx),%r15 + xorq %r15,%r13 + decq %rcx + decq %r8 + jne hash_ad_tail_loop + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +1: + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _chacha20_poly1305_open +.private_extern _chacha20_poly1305_open + +.p2align 6 +_chacha20_poly1305_open: +.cfi_startproc + pushq %rbp +.cfi_adjust_cfa_offset 8 + pushq %rbx +.cfi_adjust_cfa_offset 8 + pushq %r12 +.cfi_adjust_cfa_offset 8 + pushq %r13 +.cfi_adjust_cfa_offset 8 + pushq %r14 +.cfi_adjust_cfa_offset 8 + pushq %r15 +.cfi_adjust_cfa_offset 8 + + + pushq %r9 +.cfi_adjust_cfa_offset 8 + subq $288 + 32,%rsp +.cfi_adjust_cfa_offset 288 + 32 +.cfi_offset rbp, -16 +.cfi_offset rbx, -24 +.cfi_offset r12, -32 +.cfi_offset r13, -40 +.cfi_offset r14, -48 +.cfi_offset r15, -56 +.cfi_offset %r9, -64 + leaq 32(%rsp),%rbp + andq $-32,%rbp + movq %rdx,8+32(%rbp) + movq %r8,0+32(%rbp) + movq %rdx,%rbx + + movl _OPENSSL_ia32cap_P+8(%rip),%eax + andl $288,%eax + xorl $288,%eax + jz chacha20_poly1305_open_avx2 + +1: + cmpq $128,%rbx + jbe open_sse_128 + + movdqa .chacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + movdqa %xmm12,%xmm7 + + movdqa %xmm4,48(%rbp) + movdqa %xmm8,64(%rbp) + movdqa %xmm12,96(%rbp) + movq $10,%r10 +1: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor 
%xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %r10 + jne 1b + + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + + pand .clamp(%rip),%xmm0 + movdqa %xmm0,0(%rbp) + movdqa %xmm4,16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +open_sse_main_loop: + cmpq $256,%rbx + jb 2f + + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 96(%rbp),%xmm15 + paddd .sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + movdqa %xmm15,144(%rbp) + + + + movq $4,%rcx + movq %rsi,%r8 +1: + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + + leaq 16(%r8),%r8 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + 
pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %rcx + jge 1b + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + cmpq $-6,%rcx + jg 1b + paddd .chacha20_consts(%rip),%xmm3 + paddd 48(%rbp),%xmm7 + paddd 
64(%rbp),%xmm11 + paddd 144(%rbp),%xmm15 + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + paddd 64(%rbp),%xmm10 + paddd 128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqa %xmm12,80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor 80(%rbp),%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp open_sse_main_loop +2: + + testq %rbx,%rbx + jz open_sse_finalize + cmpq $64,%rbx + ja 3f + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa 96(%rbp),%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + + xorq %r8,%r8 + movq %rbx,%rcx + cmpq $16,%rcx + jb 2f +1: + addq 0(%rsi,%r8), %r10 + adcq 8+0(%rsi,%r8), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx +2: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld 
$7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + cmpq $16,%rcx + jae 1b + cmpq $160,%r8 + jne 2b + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + + jmp open_sse_tail_64_dec_loop +3: + cmpq $128,%rbx + ja 3f + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 96(%rbp),%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + + movq %rbx,%rcx + andq $-16,%rcx + xorq %r8,%r8 +1: + addq 0(%rsi,%r8), %r10 + adcq 8+0(%rsi,%r8), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +2: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + + cmpq %rcx,%r8 + jb 1b + cmpq $160,%r8 + jne 2b + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 
0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + subq $64,%rbx + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + jmp open_sse_tail_64_dec_loop +3: + cmpq $192,%rbx + ja 3f + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 96(%rbp),%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + + movq %rbx,%rcx + movq $160,%r8 + cmpq $160,%rcx + cmovgq %r8,%rcx + andq $-16,%rcx + xorq %r8,%r8 +1: + addq 0(%rsi,%r8), %r10 + adcq 8+0(%rsi,%r8), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +2: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd 
%xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + cmpq %rcx,%r8 + jb 1b + cmpq $160,%r8 + jne 2b + cmpq $176,%rbx + jb 1f + addq 160(%rsi),%r10 + adcq 8+160(%rsi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + cmpq $192,%rbx + jb 1f + addq 176(%rsi),%r10 + adcq 8+176(%rsi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +1: + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + paddd 64(%rbp),%xmm10 + paddd 128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + subq $128,%rbx + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + jmp open_sse_tail_64_dec_loop +3: + + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa 
%xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 96(%rbp),%xmm15 + paddd .sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + movdqa %xmm15,144(%rbp) + + xorq %r8,%r8 +1: + addq 0(%rsi,%r8), %r10 + adcq 8+0(%rsi,%r8), %r11 + adcq $1,%r12 + movdqa %xmm11,80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + movdqa 80(%rbp),%xmm11 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa %xmm9,80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .rol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .rol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 + movdqa 80(%rbp),%xmm9 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + movdqa %xmm11,80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 
+ movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + movdqa 80(%rbp),%xmm11 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + movdqa %xmm9,80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .rol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .rol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 + movdqa 80(%rbp),%xmm9 + + addq $16,%r8 + cmpq $160,%r8 + jb 1b + movq %rbx,%rcx + andq $-16,%rcx +1: + addq 0(%rsi,%r8), %r10 + adcq 8+0(%rsi,%r8), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%r8 + cmpq %rcx,%r8 + jb 1b + paddd .chacha20_consts(%rip),%xmm3 + paddd 48(%rbp),%xmm7 + paddd 64(%rbp),%xmm11 + paddd 144(%rbp),%xmm15 + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + paddd 64(%rbp),%xmm10 + paddd 128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqa %xmm12,80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + 
movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movdqa 80(%rbp),%xmm12 + subq $192,%rbx + leaq 192(%rsi),%rsi + leaq 192(%rdi),%rdi + + +open_sse_tail_64_dec_loop: + cmpq $16,%rbx + jb 1f + subq $16,%rbx + movdqu (%rsi),%xmm3 + pxor %xmm3,%xmm0 + movdqu %xmm0,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + jmp open_sse_tail_64_dec_loop +1: + movdqa %xmm0,%xmm1 + + +open_sse_tail_16: + testq %rbx,%rbx + jz open_sse_finalize + + + + pxor %xmm3,%xmm3 + leaq -1(%rsi,%rbx), %rsi + movq %rbx,%r8 +2: + pslldq $1,%xmm3 + pinsrb $0,(%rsi),%xmm3 + subq $1,%rsi + subq $1,%r8 + jnz 2b + +3: +.byte 102,73,15,126,221 + pextrq $1,%xmm3,%r14 + + pxor %xmm1,%xmm3 + + +2: + pextrb $0,%xmm3,(%rdi) + psrldq $1,%xmm3 + addq $1,%rdi + subq $1,%rbx + jne 2b + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +open_sse_finalize: + addq 32(%rbp),%r10 + adcq 8+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+16(%rbp),%r10 + adcq 8+16(%rbp),%r11 + + addq $288 + 32,%rsp +.cfi_adjust_cfa_offset -(288 + 32) + popq %r9 +.cfi_adjust_cfa_offset -8 + movq %r10,(%r9) + movq %r11,8(%r9) + + popq %r15 +.cfi_adjust_cfa_offset -8 + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + popq %rbx +.cfi_adjust_cfa_offset -8 + popq %rbp +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_adjust_cfa_offset (8 * 6) + 288 + 32 + +open_sse_128: + movdqu .chacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm12 + movdqa %xmm12,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm13,%xmm15 + movq $10,%r10 +1: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd 
%xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + decq %r10 + jnz 1b + paddd .chacha20_consts(%rip),%xmm0 + paddd .chacha20_consts(%rip),%xmm1 + paddd .chacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm9 + paddd %xmm11,%xmm10 + paddd %xmm15,%xmm13 + paddd .sse_inc(%rip),%xmm15 + paddd %xmm15,%xmm14 + + pand .clamp(%rip),%xmm0 + movdqa %xmm0,0(%rbp) + movdqa %xmm4,16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +1: + cmpq $16,%rbx + jb open_sse_tail_16 + subq $16,%rbx + addq 0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + + + movdqu 0(%rsi),%xmm3 + pxor %xmm3,%xmm1 + movdqu %xmm1,0(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq 
%r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm5,%xmm1 + movdqa %xmm9,%xmm5 + movdqa %xmm13,%xmm9 + movdqa %xmm2,%xmm13 + movdqa %xmm6,%xmm2 + movdqa %xmm10,%xmm6 + movdqa %xmm14,%xmm10 + jmp 1b + jmp open_sse_tail_16 + +.cfi_endproc + + + + +.globl _chacha20_poly1305_seal +.private_extern _chacha20_poly1305_seal + +.p2align 6 +_chacha20_poly1305_seal: +.cfi_startproc + pushq %rbp +.cfi_adjust_cfa_offset 8 + pushq %rbx +.cfi_adjust_cfa_offset 8 + pushq %r12 +.cfi_adjust_cfa_offset 8 + pushq %r13 +.cfi_adjust_cfa_offset 8 + pushq %r14 +.cfi_adjust_cfa_offset 8 + pushq %r15 +.cfi_adjust_cfa_offset 8 + + + pushq %r9 +.cfi_adjust_cfa_offset 8 + subq $288 + 32,%rsp +.cfi_adjust_cfa_offset 288 + 32 +.cfi_offset rbp, -16 +.cfi_offset rbx, -24 +.cfi_offset r12, -32 +.cfi_offset r13, -40 +.cfi_offset r14, -48 +.cfi_offset r15, -56 +.cfi_offset %r9, -64 + leaq 32(%rsp),%rbp + andq $-32,%rbp + movq %rdx,8+32(%rbp) + movq %r8,0+32(%rbp) + movq %rdx,%rbx + + movl _OPENSSL_ia32cap_P+8(%rip),%eax + andl $288,%eax + xorl $288,%eax + jz chacha20_poly1305_seal_avx2 + + cmpq $128,%rbx + jbe seal_sse_128 + + movdqa .chacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm14 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd .sse_inc(%rip),%xmm12 + + movdqa %xmm4,48(%rbp) + movdqa %xmm8,64(%rbp) + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + movdqa %xmm15,144(%rbp) + movq $10,%r10 +1: + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor 
%xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %r10 + jnz 1b + paddd .chacha20_consts(%rip),%xmm3 + paddd 48(%rbp),%xmm7 + paddd 64(%rbp),%xmm11 + paddd 144(%rbp),%xmm15 + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + paddd 64(%rbp),%xmm10 + paddd 128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + + + pand .clamp(%rip),%xmm3 + movdqa %xmm3,0(%rbp) + movdqa %xmm7,16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor 
%xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + cmpq $192,%rbx + ja 1f + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + jmp seal_sse_128_seal_hash +1: + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + movdqu %xmm0,0 + 128(%rdi) + movdqu %xmm4,16 + 128(%rdi) + movdqu %xmm8,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + movq $2,%rcx + movq $8,%r8 + cmpq $64,%rbx + jbe seal_sse_tail_64 + cmpq $128,%rbx + jbe seal_sse_tail_128 + cmpq $192,%rbx + jbe seal_sse_tail_192 + +1: + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 96(%rbp),%xmm15 + paddd .sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + movdqa %xmm15,144(%rbp) + +2: + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 +.byte 102,15,58,15,255,4 +.byte 
102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + leaq 16(%rdi),%rdi + decq %r8 + jge 2b + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg 2b + paddd .chacha20_consts(%rip),%xmm3 + paddd 48(%rbp),%xmm7 + paddd 64(%rbp),%xmm11 + paddd 144(%rbp),%xmm15 + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + paddd 64(%rbp),%xmm10 + paddd 
128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + + movdqa %xmm14,80(%rbp) + movdqa %xmm14,80(%rbp) + movdqu 0 + 0(%rsi),%xmm14 + pxor %xmm3,%xmm14 + movdqu %xmm14,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm14 + pxor %xmm7,%xmm14 + movdqu %xmm14,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm14 + pxor %xmm11,%xmm14 + movdqu %xmm14,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm14 + pxor %xmm15,%xmm14 + movdqu %xmm14,48 + 0(%rdi) + + movdqa 80(%rbp),%xmm14 + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + cmpq $256,%rbx + ja 3f + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + jmp seal_sse_128_seal_hash +3: + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + subq $256,%rbx + movq $6,%rcx + movq $4,%r8 + cmpq $192,%rbx + jg 1b + movq %rbx,%rcx + testq %rbx,%rbx + je seal_sse_128_seal_hash + movq $6,%rcx + cmpq $64,%rbx + jg 3f + +seal_sse_tail_64: + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa 96(%rbp),%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa 
%xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + + jmp seal_sse_128_seal +3: + cmpq $128,%rbx + jg 3f + +seal_sse_tail_128: + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 96(%rbp),%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq 
$-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + + leaq 16(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + movq $64,%rcx + subq $64,%rbx + leaq 64(%rsi),%rsi + jmp seal_sse_128_seal_hash +3: + +seal_sse_tail_192: + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 96(%rbp),%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor 
%xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + leaq 16(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + paddd 64(%rbp),%xmm10 + paddd 128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 
64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + +seal_sse_128_seal_hash: + cmpq $16,%rcx + jb seal_sse_128_seal + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx + leaq 16(%rdi),%rdi + jmp seal_sse_128_seal_hash + +seal_sse_128_seal: + cmpq $16,%rbx + jb seal_sse_tail_16 + subq $16,%rbx + + movdqu 0(%rsi),%xmm3 + pxor %xmm3,%xmm0 + movdqu %xmm0,0(%rdi) + + addq 0(%rdi),%r10 + adcq 8(%rdi),%r11 + adcq $1,%r12 + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + movdqa %xmm1,%xmm12 + movdqa %xmm5,%xmm1 + movdqa %xmm9,%xmm5 + movdqa %xmm13,%xmm9 + jmp seal_sse_128_seal + +seal_sse_tail_16: + testq %rbx,%rbx + jz seal_sse_finalize + + movq %rbx,%r8 + shlq $4,%r8 + leaq .and_masks(%rip),%r13 + movq %rbx,%rcx + leaq -1(%rsi,%rbx), %rsi + pxor %xmm15,%xmm15 +1: + pslldq $1,%xmm15 + pinsrb $0,(%rsi),%xmm15 + leaq -1(%rsi),%rsi + decq %rcx + jne 1b + + + pxor %xmm0,%xmm15 + + + movq %rbx,%rcx + movdqu %xmm15,%xmm0 +2: + pextrb $0,%xmm0,(%rdi) + psrldq $1,%xmm0 + addq $1,%rdi + subq $1,%rcx + jnz 2b + + pand -16(%r13,%r8), %xmm15 +.byte 102,77,15,126,253 + pextrq $1,%xmm15,%r14 + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +seal_sse_finalize: + addq 32(%rbp),%r10 + adcq 8+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + 
mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+16(%rbp),%r10 + adcq 8+16(%rbp),%r11 + + addq $288 + 32,%rsp +.cfi_adjust_cfa_offset -(288 + 32) + popq %r9 +.cfi_adjust_cfa_offset -8 + movq %r10,0(%r9) + movq %r11,8(%r9) + + popq %r15 +.cfi_adjust_cfa_offset -8 + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + popq %rbx +.cfi_adjust_cfa_offset -8 + popq %rbp +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_adjust_cfa_offset (8 * 6) + 288 + 32 + +seal_sse_128: + movdqu .chacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm14 + movdqa %xmm14,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + movq $10,%r10 +1: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + 
pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + decq %r10 + jnz 1b + paddd .chacha20_consts(%rip),%xmm0 + paddd .chacha20_consts(%rip),%xmm1 + paddd .chacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm8 + paddd %xmm11,%xmm9 + paddd %xmm15,%xmm12 + paddd .sse_inc(%rip),%xmm15 + paddd %xmm15,%xmm13 + + pand .clamp(%rip),%xmm2 + movdqa %xmm2,0(%rbp) + movdqa %xmm6,16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + jmp seal_sse_128_seal + + + + +.p2align 6 +chacha20_poly1305_open_avx2: + vzeroupper + vmovdqa .chacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd .avx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe open_avx2_192 + cmpq $320,%rbx + jbe open_avx2_320 + + vmovdqa %ymm4,64(%rbp) + vmovdqa %ymm8,96(%rbp) + vmovdqa %ymm12,160(%rbp) + movq $10,%r10 +1: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + decq %r10 + jne 1b + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + + movq %r8,%r8 + call poly_hash_ad_internal + xorq %rcx,%rcx + +1: + addq 0(%rsi,%rcx), %r10 + adcq 8+0(%rsi,%rcx), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + 
addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%rcx + cmpq $64,%rcx + jne 1b + + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + vmovdqu %ymm0,0(%rdi) + vmovdqu %ymm4,32(%rdi) + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + subq $64,%rbx +1: + + cmpq $512,%rbx + jb 3f + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,256(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm12,160(%rbp) + + xorq %rcx,%rcx +2: + addq 0*8(%rsi,%rcx), %r10 + adcq 8+0*8(%rsi,%rcx), %r11 + adcq $1,%r12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imul %r12,%rdx + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + addq %rax,%r15 + adcq %rdx,%r9 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + addq 2*8(%rsi,%rcx), %r10 + adcq 8+2*8(%rsi,%rcx), %r11 + adcq $1,%r12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vmovdqa %ymm8,128(%rbp) + 
vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imul %r12,%rdx + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + addq 4*8(%rsi,%rcx), %r10 + adcq 8+4*8(%rsi,%rcx), %r11 + adcq $1,%r12 + + leaq 48(%rcx),%rcx + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imul %r12,%rdx + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr 
$12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + cmpq $60*8,%rcx + jne 2b + vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 64(%rbp),%ymm7,%ymm7 + vpaddd 96(%rbp),%ymm11,%ymm11 + vpaddd 256(%rbp),%ymm15,%ymm15 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,128(%rbp) + addq 60*8(%rsi),%r10 + adcq 8+60*8(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 128(%rbp),%ymm0 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + addq 60*8+16(%rsi),%r10 + adcq 8+60*8+16(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq 
$0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + leaq 512(%rdi),%rdi + subq $512,%rbx + jmp 1b +3: + testq %rbx,%rbx + vzeroupper + je open_sse_finalize +3: + cmpq $128,%rbx + ja 3f + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + + xorq %r8,%r8 + movq %rbx,%rcx + andq $-16,%rcx + testq %rcx,%rcx + je 2f +1: + addq 0*8(%rsi,%r8), %r10 + adcq 8+0*8(%rsi,%r8), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +2: + addq $16,%r8 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb 1b + cmpq $160,%r8 + jne 2b + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp open_avx2_tail_loop +3: + cmpq $256,%rbx + ja 3f + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 
160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + + movq %rbx,128(%rbp) + movq %rbx,%rcx + subq $128,%rcx + shrq $4,%rcx + movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +1: + addq 0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imul %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx +2: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + + incq %r8 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr 
$4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + cmpq %rcx,%r8 + jb 1b + cmpq $10,%r8 + jne 2b + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 128(%rbp),%rbx +1: + addq $16,%rcx + cmpq %rbx,%rcx + jg 1f + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imul %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp 1b +1: + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + subq $128,%rbx + jmp open_avx2_tail_loop +3: + cmpq $384,%rbx + ja 3f + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm14,224(%rbp) + + movq %rbx,128(%rbp) + movq %rbx,%rcx + subq $256,%rcx + shrq $4,%rcx + addq $6,%rcx + movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +1: + addq 0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imul %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx +2: + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr 
$12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx + incq %r8 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb 
1b + cmpq $10,%r8 + jne 2b + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 128(%rbp),%rbx +1: + addq $16,%rcx + cmpq %rbx,%rcx + jg 1f + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imul %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp 1b +1: + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp open_avx2_tail_loop +3: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,256(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm12,160(%rbp) + + xorq %rcx,%rcx + movq %rsi,%r8 +1: + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 +2: + vmovdqa 
%ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imul %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + addq 16(%r8),%r10 + adcq 8+16(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imul %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + 
addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%r8),%r8 + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + incq %rcx + cmpq $4,%rcx + jl 1b + cmpq $10,%rcx + jne 2b + movq %rbx,%rcx + subq $384,%rcx + andq $-16,%rcx +1: + testq %rcx,%rcx + je 1f + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imul %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + subq $16,%rcx + jmp 1b +1: + vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 64(%rbp),%ymm7,%ymm7 + vpaddd 96(%rbp),%ymm11,%ymm11 + vpaddd 256(%rbp),%ymm15,%ymm15 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 
64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,128(%rbp) + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 384(%rsi),%rsi + leaq 384(%rdi),%rdi + subq $384,%rbx +open_avx2_tail_loop: + cmpq $32,%rbx + jb open_avx2_tail + subq $32,%rbx + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + jmp open_avx2_tail_loop +open_avx2_tail: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb 1f + subq $16,%rbx + + vpxor (%rsi),%xmm0,%xmm1 + vmovdqu %xmm1,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + vperm2i128 $0x11,%ymm0,%ymm0,%ymm0 + vmovdqa %xmm0,%xmm1 +1: + vzeroupper + jmp open_sse_tail_16 + +open_avx2_192: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .avx2_inc(%rip),%ymm12,%ymm13 + vmovdqa %ymm12,%ymm11 + vmovdqa %ymm13,%ymm15 + movq $10,%r10 +1: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr 
$4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + + decq %r10 + jne 1b + vpaddd %ymm2,%ymm0,%ymm0 + vpaddd %ymm2,%ymm1,%ymm1 + vpaddd %ymm6,%ymm4,%ymm4 + vpaddd %ymm6,%ymm5,%ymm5 + vpaddd %ymm10,%ymm8,%ymm8 + vpaddd %ymm10,%ymm9,%ymm9 + vpaddd %ymm11,%ymm12,%ymm12 + vpaddd %ymm15,%ymm13,%ymm13 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 +open_avx2_short: + movq %r8,%r8 + call poly_hash_ad_internal +open_avx2_hash_and_xor_loop: + cmpq $32,%rbx + jb open_avx2_short_tail_32 + subq $32,%rbx + addq 0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 16(%rsi),%r10 + adcq 8+16(%rsi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm5,%ymm1 + vmovdqa %ymm9,%ymm5 + vmovdqa %ymm13,%ymm9 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm6,%ymm2 + jmp open_avx2_hash_and_xor_loop 
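+# open_avx2_short_tail_32: handles the last 31 or fewer bytes of the AVX2
+# short open path. If at least 16 bytes remain, it absorbs one more 16-byte
+# Poly1305 block from the ciphertext and decrypts one 16-byte lane of
+# keystream, then falls through to open_sse_tail_16 for the final partial
+# block.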
+open_avx2_short_tail_32: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb 1f + subq $16,%rbx + addq 0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + vpxor (%rsi),%xmm0,%xmm3 + vmovdqu %xmm3,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + vextracti128 $1,%ymm0,%xmm1 +1: + vzeroupper + jmp open_sse_tail_16 + +open_avx2_320: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .avx2_inc(%rip),%ymm12,%ymm13 + vpaddd .avx2_inc(%rip),%ymm13,%ymm14 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm14,224(%rbp) + movq $10,%r10 +1: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + 
vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + decq %r10 + jne 1b + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd %ymm7,%ymm4,%ymm4 + vpaddd %ymm7,%ymm5,%ymm5 + vpaddd %ymm7,%ymm6,%ymm6 + vpaddd %ymm11,%ymm8,%ymm8 + vpaddd %ymm11,%ymm9,%ymm9 + vpaddd %ymm11,%ymm10,%ymm10 + vpaddd 160(%rbp),%ymm12,%ymm12 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd 224(%rbp),%ymm14,%ymm14 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 + jmp open_avx2_short + + + + +.p2align 6 +chacha20_poly1305_seal_avx2: + vzeroupper + vmovdqa .chacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd .avx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe seal_avx2_192 + cmpq $320,%rbx + jbe seal_avx2_320 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm4,64(%rbp) + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm8,96(%rbp) + vmovdqa %ymm12,%ymm15 + vpaddd .avx2_inc(%rip),%ymm15,%ymm14 + vpaddd .avx2_inc(%rip),%ymm14,%ymm13 + vpaddd .avx2_inc(%rip),%ymm13,%ymm12 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm15,256(%rbp) + movq $10,%r10 +1: + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld 
$20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor 
%ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %r10 + jnz 1b + vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 64(%rbp),%ymm7,%ymm7 + vpaddd 96(%rbp),%ymm11,%ymm11 + vpaddd 256(%rbp),%ymm15,%ymm15 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm15 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm3 + vpand .clamp(%rip),%ymm15,%ymm15 + vmovdqa %ymm15,0(%rbp) + movq %r8,%r8 + call poly_hash_ad_internal + + vpxor 0(%rsi),%ymm3,%ymm3 + vpxor 32(%rsi),%ymm11,%ymm11 + vmovdqu %ymm3,0(%rdi) + vmovdqu %ymm11,32(%rdi) + vperm2i128 $0x02,%ymm2,%ymm6,%ymm15 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+64(%rsi),%ymm15,%ymm15 + vpxor 32+64(%rsi),%ymm2,%ymm2 + vpxor 64+64(%rsi),%ymm6,%ymm6 + vpxor 96+64(%rsi),%ymm10,%ymm10 + vmovdqu %ymm15,0+64(%rdi) + vmovdqu %ymm2,32+64(%rdi) + vmovdqu %ymm6,64+64(%rdi) + vmovdqu %ymm10,96+64(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm15 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+192(%rsi),%ymm15,%ymm15 + vpxor 32+192(%rsi),%ymm1,%ymm1 + vpxor 64+192(%rsi),%ymm5,%ymm5 + vpxor 96+192(%rsi),%ymm9,%ymm9 + vmovdqu %ymm15,0+192(%rdi) + vmovdqu %ymm1,32+192(%rdi) + vmovdqu %ymm5,64+192(%rdi) + vmovdqu %ymm9,96+192(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm15 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm15,%ymm8 + + leaq 320(%rsi),%rsi + subq $320,%rbx + movq $320,%rcx + cmpq $128,%rbx + jbe seal_avx2_hash + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + vpxor 64(%rsi),%ymm8,%ymm8 + vpxor 96(%rsi),%ymm12,%ymm12 + vmovdqu %ymm0,320(%rdi) + vmovdqu %ymm4,352(%rdi) + vmovdqu %ymm8,384(%rdi) + vmovdqu %ymm12,416(%rdi) + leaq 128(%rsi),%rsi + subq $128,%rbx + movq $8,%rcx + movq $2,%r8 + cmpq $128,%rbx + jbe seal_avx2_tail_128 + cmpq $256,%rbx + jbe seal_avx2_tail_256 + cmpq $384,%rbx + jbe seal_avx2_tail_384 + cmpq $512,%rbx + jbe seal_avx2_tail_512 + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,256(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd 
%ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + 
vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + + subq $16,%rdi + movq $9,%rcx + jmp 4f +1: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,256(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm12,160(%rbp) + + movq $10,%rcx +2: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + 
movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imul %r12,%rdx + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + addq %rax,%r15 + adcq %rdx,%r9 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +4: + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imul %r12,%rdx + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + 
vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + addq 32(%rdi),%r10 + adcq 8+32(%rdi),%r11 + adcq $1,%r12 + + leaq 48(%rdi),%rdi + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imul %r12,%rdx + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %rcx + jne 2b + vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 64(%rbp),%ymm7,%ymm7 + vpaddd 96(%rbp),%ymm11,%ymm11 + vpaddd 256(%rbp),%ymm15,%ymm15 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + + leaq 32(%rdi),%rdi + vmovdqa %ymm0,128(%rbp) + addq -32(%rdi),%r10 + adcq 8+-32(%rdi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 128(%rbp),%ymm0 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq 
%rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + addq -16(%rdi),%r10 + adcq 8+-16(%rdi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + subq $512,%rbx + cmpq $512,%rbx + jg 1b + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + 
adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + movq $10,%rcx + xorq %r8,%r8 + cmpq $128,%rbx + ja 3f + +seal_avx2_tail_128: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq 
%r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp seal_avx2_short_loop +3: + cmpq $256,%rbx + ja 3f + +seal_avx2_tail_256: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor 
%ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $128,%rcx + leaq 128(%rsi),%rsi + subq $128,%rbx + jmp seal_avx2_hash +3: + cmpq $384,%rbx + ja seal_avx2_tail_512 + +seal_avx2_tail_384: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm14,224(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq 
%rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imul %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imul 
%r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + leaq 32(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $256,%rcx + leaq 256(%rsi),%rsi + subq $256,%rbx + jmp seal_avx2_hash + +seal_avx2_tail_512: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,256(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm12,160(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + 
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+2:
+ vmovdqa %ymm8,128(%rbp)
+ vmovdqa .rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,128(%rbp)
+ vmovdqa .rol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ addq 16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ vmovdqa .rol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vmovdqa 128(%rbp),%ymm8
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ movq 0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ movq 8+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imul %r12,%rdx
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+
+
+
+
+
+
+
+
+
+
+
+ addq %rax,%r15
+ adcq %rdx,%r9
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ decq %rcx
+ jg 1b
+ decq %r8
+ jge 2b
+ vpaddd .chacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 64(%rbp),%ymm7,%ymm7
+ vpaddd 96(%rbp),%ymm11,%ymm11
+ vpaddd 256(%rbp),%ymm15,%ymm15
+ vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 64(%rbp),%ymm6,%ymm6
+ vpaddd 96(%rbp),%ymm10,%ymm10
+ vpaddd 224(%rbp),%ymm14,%ymm14
+ vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpaddd 96(%rbp),%ymm9,%ymm9
+ vpaddd 192(%rbp),%ymm13,%ymm13
+ vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 64(%rbp),%ymm4,%ymm4
+ vpaddd 96(%rbp),%ymm8,%ymm8
+ vpaddd 160(%rbp),%ymm12,%ymm12
+
+ vmovdqa %ymm0,128(%rbp)
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
+ vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vpxor 0+0(%rsi),%ymm0,%ymm0
+ vpxor 32+0(%rsi),%ymm3,%ymm3
+ vpxor 64+0(%rsi),%ymm7,%ymm7
+ vpxor 96+0(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm0,0+0(%rdi)
+ vmovdqu %ymm3,32+0(%rdi)
+ vmovdqu %ymm7,64+0(%rdi)
+ vmovdqu %ymm11,96+0(%rdi)
+
+ vmovdqa 128(%rbp),%ymm0
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm2,%ymm2
+ vpxor 64+128(%rsi),%ymm6,%ymm6
+ vpxor 96+128(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm2,32+128(%rdi)
+ vmovdqu %ymm6,64+128(%rdi)
+ vmovdqu %ymm10,96+128(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+256(%rsi),%ymm3,%ymm3
+ vpxor 32+256(%rsi),%ymm1,%ymm1
+ vpxor 64+256(%rsi),%ymm5,%ymm5
+ vpxor 96+256(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+256(%rdi)
+ vmovdqu %ymm1,32+256(%rdi)
+ vmovdqu %ymm5,64+256(%rdi)
+ vmovdqu %ymm9,96+256(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ movq $384,%rcx
+ leaq 384(%rsi),%rsi
+ subq $384,%rbx
+ jmp seal_avx2_hash
+
+seal_avx2_320:
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd .avx2_inc(%rip),%ymm12,%ymm13
+ vpaddd .avx2_inc(%rip),%ymm13,%ymm14
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa %ymm12,160(%rbp)
+ vmovdqa %ymm13,192(%rbp)
+ vmovdqa %ymm14,224(%rbp)
+ movq $10,%r10
+1:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .rol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+
+ decq %r10
+ jne 1b
+ vpaddd .chacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd .chacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd .chacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpaddd %ymm11,%ymm8,%ymm8
+ vpaddd %ymm11,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm10,%ymm10
+ vpaddd 160(%rbp),%ymm12,%ymm12
+ vpaddd 192(%rbp),%ymm13,%ymm13
+ vpaddd 224(%rbp),%ymm14,%ymm14
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand .clamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm9
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm13
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm6
+ jmp seal_avx2_short
+
+seal_avx2_192:
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd .avx2_inc(%rip),%ymm12,%ymm13
+ vmovdqa %ymm12,%ymm11
+ vmovdqa %ymm13,%ymm15
+ movq $10,%r10
+1:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .rol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .rol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+
+ decq %r10
+ jne 1b
+ vpaddd %ymm2,%ymm0,%ymm0
+ vpaddd %ymm2,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpaddd %ymm6,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm8,%ymm8
+ vpaddd %ymm10,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm13,%ymm13
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand .clamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
+seal_avx2_short:
+ movq %r8,%r8
+ call poly_hash_ad_internal
+ xorq %rcx,%rcx
+seal_avx2_hash:
+ cmpq $16,%rcx
+ jb seal_avx2_short_loop
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ subq $16,%rcx
+ addq $16,%rdi
+ jmp seal_avx2_hash
+seal_avx2_short_loop:
+ cmpq $32,%rbx
+ jb seal_avx2_short_tail
+ subq $32,%rbx
+
+ vpxor (%rsi),%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi)
+ leaq 32(%rsi),%rsi
+
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ addq 16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+
+ vmovdqa %ymm4,%ymm0
+ vmovdqa %ymm8,%ymm4
+ vmovdqa %ymm12,%ymm8
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm5,%ymm1
+ vmovdqa %ymm9,%ymm5
+ vmovdqa %ymm13,%ymm9
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm6,%ymm2
+ jmp seal_avx2_short_loop
+seal_avx2_short_tail:
+ cmpq $16,%rbx
+ jb 1f
+ subq $16,%rbx
+ vpxor (%rsi),%xmm0,%xmm3
+ vmovdqu %xmm3,(%rdi)
+ leaq 16(%rsi),%rsi
+ addq 0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0(%rbp),%rax
+ mulq %r11
+ imul %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imul %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $0,%r12
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+ vextracti128 $1,%ymm0,%xmm0
+1:
+ vzeroupper
+ jmp seal_sse_tail_16
+.cfi_endproc
+#endif
-- 
cgit v1.2.3
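
For reference only (not part of the patch): the AVX2 blocks above vectorize the standard ChaCha20 quarter-round, with vpshufb against the .rol16/.rol8 tables performing the 16- and 8-bit rotations and the vpsrld/vpslld pairs performing the 12- and 7-bit ones, while the interleaved mulq/mulxq sequences fold the ciphertext into the Poly1305 accumulator as it is written. A minimal scalar sketch of the quarter-round in C is given below; the rotl32 helper is a hypothetical name introduced here for illustration.

    #include <stdint.h>

    /* Rotate a 32-bit word left by n bits (n in 1..31 here). */
    static inline uint32_t rotl32(uint32_t v, int n) {
      return (v << n) | (v >> (32 - n));
    }

    /* One ChaCha20 quarter-round on four state words.  The AVX2 code
     * above runs the same sequence of add/xor/rotate steps on many
     * 32-bit lanes (and several blocks) in parallel. */
    static void quarter_round(uint32_t *a, uint32_t *b,
                              uint32_t *c, uint32_t *d) {
      *a += *b; *d ^= *a; *d = rotl32(*d, 16);
      *c += *d; *b ^= *c; *b = rotl32(*b, 12);
      *a += *b; *d ^= *a; *d = rotl32(*d, 8);
      *c += *d; *b ^= *c; *b = rotl32(*b, 7);
    }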