Diffstat (limited to 'third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S')
-rw-r--r-- | third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S | 3869
1 file changed, 1881 insertions, 1988 deletions
diff --git a/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S b/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S index e50227ae..0f5cb550 100644 --- a/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S +++ b/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S @@ -17,26 +17,26 @@ chacha20_poly1305_constants: .p2align 6 -.chacha20_consts: +L$chacha20_consts: .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' -.rol8: +L$rol8: .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 -.rol16: +L$rol16: .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 -.avx2_init: +L$avx2_init: .long 0,0,0,0 -.sse_inc: +L$sse_inc: .long 1,0,0,0 -.avx2_inc: +L$avx2_inc: .long 2,0,0,0,2,0,0,0 -.clamp: +L$clamp: .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC .quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF .p2align 4 -.and_masks: +L$and_masks: .byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 .byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 @@ -58,34 +58,35 @@ chacha20_poly1305_constants: .p2align 6 poly_hash_ad_internal: + xorq %r10,%r10 xorq %r11,%r11 xorq %r12,%r12 cmpq $13,%r8 - jne hash_ad_loop -poly_fast_tls_ad: + jne L$hash_ad_loop +L$poly_fast_tls_ad: movq (%rcx),%r10 movq 5(%rcx),%r11 shrq $24,%r11 movq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -101,38 +102,37 @@ poly_fast_tls_ad: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 .byte 0xf3,0xc3 -hash_ad_loop: +L$hash_ad_loop: cmpq $16,%r8 - jb hash_ad_tail - addq 0(%rcx),%r10 + jb L$hash_ad_tail + addq 0+0(%rcx),%r10 adcq 8+0(%rcx),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -148,53 +148,52 @@ hash_ad_loop: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rcx),%rcx subq $16,%r8 - jmp hash_ad_loop -hash_ad_tail: + jmp L$hash_ad_loop +L$hash_ad_tail: cmpq $0,%r8 - je 1f + je L$hash_ad_done xorq %r13,%r13 xorq %r14,%r14 xorq %r15,%r15 addq %r8,%rcx -hash_ad_tail_loop: +L$hash_ad_tail_loop: shldq $8,%r13,%r14 shlq $8,%r13 movzbq -1(%rcx),%r15 xorq %r15,%r13 decq %rcx decq %r8 - jne hash_ad_tail_loop + jne L$hash_ad_tail_loop addq %r13,%r10 adcq %r14,%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 
movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -210,15 +209,14 @@ hash_ad_tail_loop: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 -1: +L$hash_ad_done: .byte 0xf3,0xc3 @@ -245,43 +243,39 @@ _chacha20_poly1305_open: pushq %r9 - subq $288 + 32,%rsp - - - - - + subq $288 + 0 + 32,%rsp leaq 32(%rsp),%rbp andq $-32,%rbp - movq %rdx,8+32(%rbp) - movq %r8,0+32(%rbp) + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) movl _OPENSSL_ia32cap_P+8(%rip),%eax andl $288,%eax xorl $288,%eax jz chacha20_poly1305_open_avx2 -1: cmpq $128,%rbx - jbe open_sse_128 + jbe L$open_sse_128 - movdqa .chacha20_consts(%rip),%xmm0 + movdqa L$chacha20_consts(%rip),%xmm0 movdqu 0(%r9),%xmm4 movdqu 16(%r9),%xmm8 movdqu 32(%r9),%xmm12 + movdqa %xmm12,%xmm7 - movdqa %xmm4,48(%rbp) - movdqa %xmm8,64(%rbp) - movdqa %xmm12,96(%rbp) + movdqa %xmm4,0+48(%rbp) + movdqa %xmm8,0+64(%rbp) + movdqa %xmm12,0+96(%rbp) movq $10,%r10 -1: +L$open_sse_init_rounds: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -290,7 +284,7 @@ _chacha20_poly1305_open: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -302,7 +296,7 @@ _chacha20_poly1305_open: .byte 102,69,15,58,15,228,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -311,7 +305,7 @@ _chacha20_poly1305_open: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -323,24 +317,24 @@ _chacha20_poly1305_open: .byte 102,69,15,58,15,228,4 decq %r10 - jne 1b + jne L$open_sse_init_rounds - paddd .chacha20_consts(%rip),%xmm0 - paddd 48(%rbp),%xmm4 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 - pand .clamp(%rip),%xmm0 - movdqa %xmm0,0(%rbp) - movdqa %xmm4,16(%rbp) + pand L$clamp(%rip),%xmm0 + movdqa %xmm0,0+0(%rbp) + movdqa %xmm4,0+16(%rbp) movq %r8,%r8 call poly_hash_ad_internal -open_sse_main_loop: +L$open_sse_main_loop: cmpq $256,%rbx - jb 2f + jb L$open_sse_tail - movdqa .chacha20_consts(%rip),%xmm0 - movdqa 48(%rbp),%xmm4 - movdqa 64(%rbp),%xmm8 + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 @@ -350,26 +344,26 @@ open_sse_main_loop: movdqa %xmm0,%xmm3 movdqa %xmm4,%xmm7 movdqa %xmm8,%xmm11 - movdqa 96(%rbp),%xmm15 - paddd .sse_inc(%rip),%xmm15 + movdqa 0+96(%rbp),%xmm15 + paddd L$sse_inc(%rip),%xmm15 movdqa %xmm15,%xmm14 - paddd .sse_inc(%rip),%xmm14 + paddd L$sse_inc(%rip),%xmm14 movdqa %xmm14,%xmm13 - paddd .sse_inc(%rip),%xmm13 + paddd L$sse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 - paddd .sse_inc(%rip),%xmm12 - movdqa %xmm12,96(%rbp) - movdqa %xmm13,112(%rbp) - movdqa %xmm14,128(%rbp) - movdqa %xmm15,144(%rbp) + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa 
%xmm15,0+144(%rbp) movq $4,%rcx movq %rsi,%r8 -1: - movdqa %xmm8,80(%rbp) - movdqa .rol16(%rip),%xmm8 +L$open_sse_main_loop_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 @@ -382,13 +376,13 @@ open_sse_main_loop: .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 - addq 0(%r8),%r10 + addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 @@ -396,7 +390,7 @@ open_sse_main_loop: pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 - movdqa %xmm8,80(%rbp) + movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 @@ -413,17 +407,17 @@ open_sse_main_loop: psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movdqa .rol8(%rip),%xmm8 + movdqa L$rol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 @@ -436,26 +430,26 @@ open_sse_main_loop: .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 - movdqa %xmm8,80(%rbp) + movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 @@ -472,7 +466,7 @@ open_sse_main_loop: psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 @@ -488,8 +482,8 @@ open_sse_main_loop: .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 - movdqa %xmm8,80(%rbp) - movdqa .rol16(%rip),%xmm8 + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 @@ -505,9 +499,8 @@ open_sse_main_loop: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 @@ -517,7 +510,7 @@ open_sse_main_loop: .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 @@ -526,7 +519,7 @@ open_sse_main_loop: pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 - movdqa %xmm8,80(%rbp) + movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 @@ -543,7 +536,7 @@ open_sse_main_loop: psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 - movdqa .rol8(%rip),%xmm8 + movdqa L$rol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 @@ -556,7 +549,7 @@ open_sse_main_loop: .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 @@ -565,7 +558,7 @@ open_sse_main_loop: pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 - movdqa %xmm8,80(%rbp) + movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 @@ -582,7 +575,7 @@ open_sse_main_loop: 
psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 .byte 102,15,58,15,255,12 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,4 @@ -597,27 +590,27 @@ open_sse_main_loop: .byte 102,69,15,58,15,228,4 decq %rcx - jge 1b - addq 0(%r8),%r10 + jge L$open_sse_main_loop_rounds + addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -633,33 +626,32 @@ open_sse_main_loop: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%r8),%r8 cmpq $-6,%rcx - jg 1b - paddd .chacha20_consts(%rip),%xmm3 - paddd 48(%rbp),%xmm7 - paddd 64(%rbp),%xmm11 - paddd 144(%rbp),%xmm15 - paddd .chacha20_consts(%rip),%xmm2 - paddd 48(%rbp),%xmm6 - paddd 64(%rbp),%xmm10 - paddd 128(%rbp),%xmm14 - paddd .chacha20_consts(%rip),%xmm1 - paddd 48(%rbp),%xmm5 - paddd 64(%rbp),%xmm9 - paddd 112(%rbp),%xmm13 - paddd .chacha20_consts(%rip),%xmm0 - paddd 48(%rbp),%xmm4 - paddd 64(%rbp),%xmm8 - paddd 96(%rbp),%xmm12 - movdqa %xmm12,80(%rbp) + jg L$open_sse_main_loop_rounds + paddd L$chacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqa %xmm12,0+80(%rbp) movdqu 0 + 0(%rsi),%xmm12 pxor %xmm3,%xmm12 movdqu %xmm12,0 + 0(%rdi) @@ -703,7 +695,7 @@ open_sse_main_loop: pxor %xmm3,%xmm0 pxor %xmm7,%xmm4 pxor %xmm11,%xmm8 - pxor 80(%rbp),%xmm15 + pxor 0+80(%rbp),%xmm15 movdqu %xmm0,0 + 192(%rdi) movdqu %xmm4,16 + 192(%rdi) movdqu %xmm8,32 + 192(%rdi) @@ -712,45 +704,49 @@ open_sse_main_loop: leaq 256(%rsi),%rsi leaq 256(%rdi),%rdi subq $256,%rbx - jmp open_sse_main_loop -2: + jmp L$open_sse_main_loop +L$open_sse_tail: testq %rbx,%rbx - jz open_sse_finalize + jz L$open_sse_finalize + cmpq $192,%rbx + ja L$open_sse_tail_256 + cmpq $128,%rbx + ja L$open_sse_tail_192 cmpq $64,%rbx - ja 3f - movdqa .chacha20_consts(%rip),%xmm0 - movdqa 48(%rbp),%xmm4 - movdqa 64(%rbp),%xmm8 - movdqa 96(%rbp),%xmm12 - paddd .sse_inc(%rip),%xmm12 - movdqa %xmm12,96(%rbp) + ja L$open_sse_tail_128 + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa 0+96(%rbp),%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) xorq %r8,%r8 movq %rbx,%rcx cmpq $16,%rcx - jb 2f -1: - addq 0(%rsi,%r8), %r10 - adcq 8+0(%rsi,%r8), %r11 + jb L$open_sse_tail_64_rounds +L$open_sse_tail_64_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx 
movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -766,19 +762,18 @@ open_sse_main_loop: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 subq $16,%rcx -2: +L$open_sse_tail_64_rounds: addq $16,%r8 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -787,7 +782,7 @@ open_sse_main_loop: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -799,7 +794,7 @@ open_sse_main_loop: .byte 102,69,15,58,15,228,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -808,7 +803,7 @@ open_sse_main_loop: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -820,55 +815,54 @@ open_sse_main_loop: .byte 102,69,15,58,15,228,4 cmpq $16,%rcx - jae 1b + jae L$open_sse_tail_64_rounds_and_x1hash cmpq $160,%r8 - jne 2b - paddd .chacha20_consts(%rip),%xmm0 - paddd 48(%rbp),%xmm4 - paddd 64(%rbp),%xmm8 - paddd 96(%rbp),%xmm12 - - jmp open_sse_tail_64_dec_loop -3: - cmpq $128,%rbx - ja 3f - movdqa .chacha20_consts(%rip),%xmm0 - movdqa 48(%rbp),%xmm4 - movdqa 64(%rbp),%xmm8 + jne L$open_sse_tail_64_rounds + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + jmp L$open_sse_tail_64_dec_loop + +L$open_sse_tail_128: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 - movdqa 96(%rbp),%xmm13 - paddd .sse_inc(%rip),%xmm13 + movdqa 0+96(%rbp),%xmm13 + paddd L$sse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 - paddd .sse_inc(%rip),%xmm12 - movdqa %xmm12,96(%rbp) - movdqa %xmm13,112(%rbp) + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) movq %rbx,%rcx andq $-16,%rcx xorq %r8,%r8 -1: - addq 0(%rsi,%r8), %r10 - adcq 8+0(%rsi,%r8), %r11 +L$open_sse_tail_128_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -884,18 +878,17 @@ open_sse_main_loop: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 -2: +L$open_sse_tail_128_rounds: addq $16,%r8 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -904,7 +897,7 @@ open_sse_main_loop: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -916,7 +909,7 @@ open_sse_main_loop: .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb 
.rol16(%rip),%xmm13 + pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -925,7 +918,7 @@ open_sse_main_loop: pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol8(%rip),%xmm13 + pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -937,7 +930,7 @@ open_sse_main_loop: .byte 102,69,15,58,15,237,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -946,7 +939,7 @@ open_sse_main_loop: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -958,7 +951,7 @@ open_sse_main_loop: .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol16(%rip),%xmm13 + pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -967,7 +960,7 @@ open_sse_main_loop: pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol8(%rip),%xmm13 + pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -979,17 +972,17 @@ open_sse_main_loop: .byte 102,69,15,58,15,237,4 cmpq %rcx,%r8 - jb 1b + jb L$open_sse_tail_128_rounds_and_x1hash cmpq $160,%r8 - jne 2b - paddd .chacha20_consts(%rip),%xmm1 - paddd 48(%rbp),%xmm5 - paddd 64(%rbp),%xmm9 - paddd 112(%rbp),%xmm13 - paddd .chacha20_consts(%rip),%xmm0 - paddd 48(%rbp),%xmm4 - paddd 64(%rbp),%xmm8 - paddd 96(%rbp),%xmm12 + jne L$open_sse_tail_128_rounds + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 movdqu 0 + 0(%rsi),%xmm3 movdqu 16 + 0(%rsi),%xmm7 movdqu 32 + 0(%rsi),%xmm11 @@ -1006,28 +999,27 @@ open_sse_main_loop: subq $64,%rbx leaq 64(%rsi),%rsi leaq 64(%rdi),%rdi - jmp open_sse_tail_64_dec_loop -3: - cmpq $192,%rbx - ja 3f - movdqa .chacha20_consts(%rip),%xmm0 - movdqa 48(%rbp),%xmm4 - movdqa 64(%rbp),%xmm8 + jmp L$open_sse_tail_64_dec_loop + +L$open_sse_tail_192: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 movdqa %xmm0,%xmm2 movdqa %xmm4,%xmm6 movdqa %xmm8,%xmm10 - movdqa 96(%rbp),%xmm14 - paddd .sse_inc(%rip),%xmm14 + movdqa 0+96(%rbp),%xmm14 + paddd L$sse_inc(%rip),%xmm14 movdqa %xmm14,%xmm13 - paddd .sse_inc(%rip),%xmm13 + paddd L$sse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 - paddd .sse_inc(%rip),%xmm12 - movdqa %xmm12,96(%rbp) - movdqa %xmm13,112(%rbp) - movdqa %xmm14,128(%rbp) + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) movq %rbx,%rcx movq $160,%r8 @@ -1035,27 +1027,27 @@ open_sse_main_loop: cmovgq %r8,%rcx andq $-16,%rcx xorq %r8,%r8 -1: - addq 0(%rsi,%r8), %r10 - adcq 8+0(%rsi,%r8), %r11 +L$open_sse_tail_192_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -1071,18 +1063,17 @@ open_sse_main_loop: movq %r9,%r14 
shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 -2: +L$open_sse_tail_192_rounds: addq $16,%r8 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -1091,7 +1082,7 @@ open_sse_main_loop: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -1103,7 +1094,7 @@ open_sse_main_loop: .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol16(%rip),%xmm13 + pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -1112,7 +1103,7 @@ open_sse_main_loop: pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol8(%rip),%xmm13 + pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -1124,7 +1115,7 @@ open_sse_main_loop: .byte 102,69,15,58,15,237,12 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol16(%rip),%xmm14 + pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 @@ -1133,7 +1124,7 @@ open_sse_main_loop: pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol8(%rip),%xmm14 + pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 @@ -1145,7 +1136,7 @@ open_sse_main_loop: .byte 102,69,15,58,15,246,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -1154,7 +1145,7 @@ open_sse_main_loop: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -1166,7 +1157,7 @@ open_sse_main_loop: .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol16(%rip),%xmm13 + pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -1175,7 +1166,7 @@ open_sse_main_loop: pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol8(%rip),%xmm13 + pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -1187,7 +1178,7 @@ open_sse_main_loop: .byte 102,69,15,58,15,237,4 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol16(%rip),%xmm14 + pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 @@ -1196,7 +1187,7 @@ open_sse_main_loop: pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol8(%rip),%xmm14 + pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 @@ -1208,31 +1199,31 @@ open_sse_main_loop: .byte 102,69,15,58,15,246,4 cmpq %rcx,%r8 - jb 1b + jb L$open_sse_tail_192_rounds_and_x1hash cmpq $160,%r8 - jne 2b + jne L$open_sse_tail_192_rounds cmpq $176,%rbx - jb 1f - addq 160(%rsi),%r10 + jb L$open_sse_tail_192_finish + addq 0+160(%rsi),%r10 adcq 8+160(%rsi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -1248,35 +1239,34 @@ open_sse_main_loop: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 
- adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 cmpq $192,%rbx - jb 1f - addq 176(%rsi),%r10 + jb L$open_sse_tail_192_finish + addq 0+176(%rsi),%r10 adcq 8+176(%rsi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -1292,26 +1282,25 @@ open_sse_main_loop: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 -1: - paddd .chacha20_consts(%rip),%xmm2 - paddd 48(%rbp),%xmm6 - paddd 64(%rbp),%xmm10 - paddd 128(%rbp),%xmm14 - paddd .chacha20_consts(%rip),%xmm1 - paddd 48(%rbp),%xmm5 - paddd 64(%rbp),%xmm9 - paddd 112(%rbp),%xmm13 - paddd .chacha20_consts(%rip),%xmm0 - paddd 48(%rbp),%xmm4 - paddd 64(%rbp),%xmm8 - paddd 96(%rbp),%xmm12 +L$open_sse_tail_192_finish: + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 movdqu 0 + 0(%rsi),%xmm3 movdqu 16 + 0(%rsi),%xmm7 movdqu 32 + 0(%rsi),%xmm11 @@ -1340,12 +1329,12 @@ open_sse_main_loop: subq $128,%rbx leaq 128(%rsi),%rsi leaq 128(%rdi),%rdi - jmp open_sse_tail_64_dec_loop -3: + jmp L$open_sse_tail_64_dec_loop - movdqa .chacha20_consts(%rip),%xmm0 - movdqa 48(%rbp),%xmm4 - movdqa 64(%rbp),%xmm8 +L$open_sse_tail_256: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 @@ -1355,28 +1344,28 @@ open_sse_main_loop: movdqa %xmm0,%xmm3 movdqa %xmm4,%xmm7 movdqa %xmm8,%xmm11 - movdqa 96(%rbp),%xmm15 - paddd .sse_inc(%rip),%xmm15 + movdqa 0+96(%rbp),%xmm15 + paddd L$sse_inc(%rip),%xmm15 movdqa %xmm15,%xmm14 - paddd .sse_inc(%rip),%xmm14 + paddd L$sse_inc(%rip),%xmm14 movdqa %xmm14,%xmm13 - paddd .sse_inc(%rip),%xmm13 + paddd L$sse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 - paddd .sse_inc(%rip),%xmm12 - movdqa %xmm12,96(%rbp) - movdqa %xmm13,112(%rbp) - movdqa %xmm14,128(%rbp) - movdqa %xmm15,144(%rbp) + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) xorq %r8,%r8 -1: - addq 0(%rsi,%r8), %r10 - adcq 8+0(%rsi,%r8), %r11 +L$open_sse_tail_256_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 - movdqa %xmm11,80(%rbp) + movdqa %xmm11,0+80(%rbp) paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm11 @@ -1385,7 +1374,7 @@ open_sse_main_loop: pxor %xmm11,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm11 @@ -1397,7 +1386,7 @@ open_sse_main_loop: .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol16(%rip),%xmm13 + pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm11 @@ -1406,7 +1395,7 @@ 
open_sse_main_loop: pxor %xmm11,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol8(%rip),%xmm13 + pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm11 @@ -1418,7 +1407,7 @@ open_sse_main_loop: .byte 102,69,15,58,15,237,12 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol16(%rip),%xmm14 + pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm11 @@ -1427,7 +1416,7 @@ open_sse_main_loop: pxor %xmm11,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol8(%rip),%xmm14 + pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm11 @@ -1437,21 +1426,21 @@ open_sse_main_loop: .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 - movdqa 80(%rbp),%xmm11 - movq 0+0(%rbp),%rax + movdqa 0+80(%rbp),%xmm11 + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movdqa %xmm9,80(%rbp) + movdqa %xmm9,0+80(%rbp) paddd %xmm7,%xmm3 pxor %xmm3,%xmm15 - pshufb .rol16(%rip),%xmm15 + pshufb L$rol16(%rip),%xmm15 paddd %xmm15,%xmm11 pxor %xmm11,%xmm7 movdqa %xmm7,%xmm9 @@ -1460,7 +1449,7 @@ open_sse_main_loop: pxor %xmm9,%xmm7 paddd %xmm7,%xmm3 pxor %xmm3,%xmm15 - pshufb .rol8(%rip),%xmm15 + pshufb L$rol8(%rip),%xmm15 paddd %xmm15,%xmm11 pxor %xmm11,%xmm7 movdqa %xmm7,%xmm9 @@ -1470,21 +1459,21 @@ open_sse_main_loop: .byte 102,15,58,15,255,4 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,12 - movdqa 80(%rbp),%xmm9 - movq 8+0(%rbp),%rax + movdqa 0+80(%rbp),%xmm9 + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx - movdqa %xmm11,80(%rbp) + movdqa %xmm11,0+80(%rbp) paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm11 @@ -1493,7 +1482,7 @@ open_sse_main_loop: pxor %xmm11,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm11 @@ -1505,7 +1494,7 @@ open_sse_main_loop: .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol16(%rip),%xmm13 + pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm11 @@ -1514,7 +1503,7 @@ open_sse_main_loop: pxor %xmm11,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol8(%rip),%xmm13 + pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm11 @@ -1529,7 +1518,7 @@ open_sse_main_loop: adcq %rdx,%r9 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol16(%rip),%xmm14 + pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm11 @@ -1538,7 +1527,7 @@ open_sse_main_loop: pxor %xmm11,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol8(%rip),%xmm14 + pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm11 @@ -1548,7 +1537,7 @@ open_sse_main_loop: .byte 102,15,58,15,246,12 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,4 - movdqa 80(%rbp),%xmm11 + movdqa 0+80(%rbp),%xmm11 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 @@ -1558,16 +1547,15 @@ open_sse_main_loop: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 - movdqa %xmm9,80(%rbp) + movdqa %xmm9,0+80(%rbp) paddd %xmm7,%xmm3 pxor 
%xmm3,%xmm15 - pshufb .rol16(%rip),%xmm15 + pshufb L$rol16(%rip),%xmm15 paddd %xmm15,%xmm11 pxor %xmm11,%xmm7 movdqa %xmm7,%xmm9 @@ -1576,7 +1564,7 @@ open_sse_main_loop: pxor %xmm9,%xmm7 paddd %xmm7,%xmm3 pxor %xmm3,%xmm15 - pshufb .rol8(%rip),%xmm15 + pshufb L$rol8(%rip),%xmm15 paddd %xmm15,%xmm11 pxor %xmm11,%xmm7 movdqa %xmm7,%xmm9 @@ -1586,34 +1574,35 @@ open_sse_main_loop: .byte 102,15,58,15,255,12 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,4 - movdqa 80(%rbp),%xmm9 + movdqa 0+80(%rbp),%xmm9 addq $16,%r8 cmpq $160,%r8 - jb 1b + jb L$open_sse_tail_256_rounds_and_x1hash + movq %rbx,%rcx andq $-16,%rcx -1: - addq 0(%rsi,%r8), %r10 - adcq 8+0(%rsi,%r8), %r11 +L$open_sse_tail_256_hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -1629,33 +1618,32 @@ open_sse_main_loop: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 addq $16,%r8 cmpq %rcx,%r8 - jb 1b - paddd .chacha20_consts(%rip),%xmm3 - paddd 48(%rbp),%xmm7 - paddd 64(%rbp),%xmm11 - paddd 144(%rbp),%xmm15 - paddd .chacha20_consts(%rip),%xmm2 - paddd 48(%rbp),%xmm6 - paddd 64(%rbp),%xmm10 - paddd 128(%rbp),%xmm14 - paddd .chacha20_consts(%rip),%xmm1 - paddd 48(%rbp),%xmm5 - paddd 64(%rbp),%xmm9 - paddd 112(%rbp),%xmm13 - paddd .chacha20_consts(%rip),%xmm0 - paddd 48(%rbp),%xmm4 - paddd 64(%rbp),%xmm8 - paddd 96(%rbp),%xmm12 - movdqa %xmm12,80(%rbp) + jb L$open_sse_tail_256_hash + paddd L$chacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqa %xmm12,0+80(%rbp) movdqu 0 + 0(%rsi),%xmm12 pxor %xmm3,%xmm12 movdqu %xmm12,0 + 0(%rdi) @@ -1693,15 +1681,15 @@ open_sse_main_loop: movdqu %xmm9,32 + 128(%rdi) movdqu %xmm15,48 + 128(%rdi) - movdqa 80(%rbp),%xmm12 + movdqa 0+80(%rbp),%xmm12 subq $192,%rbx leaq 192(%rsi),%rsi leaq 192(%rdi),%rdi -open_sse_tail_64_dec_loop: +L$open_sse_tail_64_dec_loop: cmpq $16,%rbx - jb 1f + jb L$open_sse_tail_16_init subq $16,%rbx movdqu (%rsi),%xmm3 pxor %xmm3,%xmm0 @@ -1711,61 +1699,60 @@ open_sse_tail_64_dec_loop: movdqa %xmm4,%xmm0 movdqa %xmm8,%xmm4 movdqa %xmm12,%xmm8 - jmp open_sse_tail_64_dec_loop -1: + jmp L$open_sse_tail_64_dec_loop +L$open_sse_tail_16_init: movdqa %xmm0,%xmm1 -open_sse_tail_16: +L$open_sse_tail_16: testq %rbx,%rbx - jz open_sse_finalize + jz L$open_sse_finalize pxor %xmm3,%xmm3 - leaq -1(%rsi,%rbx), %rsi + leaq -1(%rsi,%rbx,1),%rsi movq %rbx,%r8 -2: +L$open_sse_tail_16_compose: pslldq $1,%xmm3 pinsrb $0,(%rsi),%xmm3 subq $1,%rsi subq $1,%r8 - jnz 2b + jnz L$open_sse_tail_16_compose -3: .byte 102,73,15,126,221 pextrq $1,%xmm3,%r14 pxor %xmm1,%xmm3 -2: +L$open_sse_tail_16_extract: pextrb $0,%xmm3,(%rdi) psrldq $1,%xmm3 addq $1,%rdi subq $1,%rbx 
- jne 2b + jne L$open_sse_tail_16_extract addq %r13,%r10 adcq %r14,%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -1781,35 +1768,34 @@ open_sse_tail_16: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 -open_sse_finalize: - addq 32(%rbp),%r10 - adcq 8+32(%rbp),%r11 +L$open_sse_finalize: + addq 0+0+32(%rbp),%r10 + adcq 8+0+32(%rbp),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -1825,9 +1811,8 @@ open_sse_finalize: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 @@ -1843,16 +1828,17 @@ open_sse_finalize: cmovcq %r14,%r11 cmovcq %r15,%r12 - addq 0+16(%rbp),%r10 - adcq 8+16(%rbp),%r11 + addq 0+0+16(%rbp),%r10 + adcq 8+0+16(%rbp),%r11 + + + addq $288 + 0 + 32,%rsp - addq $288 + 32,%rsp popq %r9 movq %r10,(%r9) movq %r11,8(%r9) - popq %r15 popq %r14 @@ -1867,9 +1853,9 @@ open_sse_finalize: .byte 0xf3,0xc3 +L$open_sse_128: -open_sse_128: - movdqu .chacha20_consts(%rip),%xmm0 + movdqu L$chacha20_consts(%rip),%xmm0 movdqa %xmm0,%xmm1 movdqa %xmm0,%xmm2 movdqu 0(%r9),%xmm4 @@ -1880,17 +1866,18 @@ open_sse_128: movdqa %xmm8,%xmm10 movdqu 32(%r9),%xmm12 movdqa %xmm12,%xmm13 - paddd .sse_inc(%rip),%xmm13 + paddd L$sse_inc(%rip),%xmm13 movdqa %xmm13,%xmm14 - paddd .sse_inc(%rip),%xmm14 + paddd L$sse_inc(%rip),%xmm14 movdqa %xmm4,%xmm7 movdqa %xmm8,%xmm11 movdqa %xmm13,%xmm15 movq $10,%r10 -1: + +L$open_sse_128_rounds: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -1899,7 +1886,7 @@ open_sse_128: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -1911,7 +1898,7 @@ open_sse_128: .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol16(%rip),%xmm13 + pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -1920,7 +1907,7 @@ open_sse_128: pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol8(%rip),%xmm13 + pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -1932,7 +1919,7 @@ open_sse_128: .byte 102,69,15,58,15,237,12 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol16(%rip),%xmm14 + pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 @@ -1941,7 +1928,7 @@ open_sse_128: pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol8(%rip),%xmm14 + pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 @@ -1953,7 +1940,7 @@ open_sse_128: .byte 
102,69,15,58,15,246,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -1962,7 +1949,7 @@ open_sse_128: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -1974,7 +1961,7 @@ open_sse_128: .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol16(%rip),%xmm13 + pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -1983,7 +1970,7 @@ open_sse_128: pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol8(%rip),%xmm13 + pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -1995,7 +1982,7 @@ open_sse_128: .byte 102,69,15,58,15,237,4 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol16(%rip),%xmm14 + pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 @@ -2004,7 +1991,7 @@ open_sse_128: pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol8(%rip),%xmm14 + pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 @@ -2016,30 +2003,30 @@ open_sse_128: .byte 102,69,15,58,15,246,4 decq %r10 - jnz 1b - paddd .chacha20_consts(%rip),%xmm0 - paddd .chacha20_consts(%rip),%xmm1 - paddd .chacha20_consts(%rip),%xmm2 + jnz L$open_sse_128_rounds + paddd L$chacha20_consts(%rip),%xmm0 + paddd L$chacha20_consts(%rip),%xmm1 + paddd L$chacha20_consts(%rip),%xmm2 paddd %xmm7,%xmm4 paddd %xmm7,%xmm5 paddd %xmm7,%xmm6 paddd %xmm11,%xmm9 paddd %xmm11,%xmm10 paddd %xmm15,%xmm13 - paddd .sse_inc(%rip),%xmm15 + paddd L$sse_inc(%rip),%xmm15 paddd %xmm15,%xmm14 - pand .clamp(%rip),%xmm0 - movdqa %xmm0,0(%rbp) - movdqa %xmm4,16(%rbp) + pand L$clamp(%rip),%xmm0 + movdqa %xmm0,0+0(%rbp) + movdqa %xmm4,0+16(%rbp) movq %r8,%r8 call poly_hash_ad_internal -1: +L$open_sse_128_xor_hash: cmpq $16,%rbx - jb open_sse_tail_16 + jb L$open_sse_tail_16 subq $16,%rbx - addq 0(%rsi),%r10 + addq 0+0(%rsi),%r10 adcq 8+0(%rsi),%r11 adcq $1,%r12 @@ -2049,23 +2036,23 @@ open_sse_128: movdqu %xmm1,0(%rdi) leaq 16(%rsi),%rsi leaq 16(%rdi),%rdi - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -2081,9 +2068,8 @@ open_sse_128: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 @@ -2096,8 +2082,10 @@ open_sse_128: movdqa %xmm6,%xmm2 movdqa %xmm10,%xmm6 movdqa %xmm14,%xmm10 - jmp 1b - jmp open_sse_tail_16 + jmp L$open_sse_128_xor_hash + + + @@ -2126,20 +2114,15 @@ _chacha20_poly1305_seal: pushq %r9 - subq $288 + 32,%rsp - - - - - - + subq $288 + 0 + 32,%rsp leaq 32(%rsp),%rbp andq $-32,%rbp + movq 56(%r9),%rbx addq %rdx,%rbx - movq %rbx,8+32(%rbp) - movq %r8,0+32(%rbp) + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) movq %rdx,%rbx movl _OPENSSL_ia32cap_P+8(%rip),%eax @@ -2148,12 +2131,13 @@ _chacha20_poly1305_seal: jz chacha20_poly1305_seal_avx2 cmpq $128,%rbx - jbe seal_sse_128 + jbe L$seal_sse_128 - movdqa .chacha20_consts(%rip),%xmm0 + movdqa L$chacha20_consts(%rip),%xmm0 movdqu 0(%r9),%xmm4 movdqu 
16(%r9),%xmm8 movdqu 32(%r9),%xmm12 + movdqa %xmm0,%xmm1 movdqa %xmm0,%xmm2 movdqa %xmm0,%xmm3 @@ -2164,22 +2148,22 @@ _chacha20_poly1305_seal: movdqa %xmm8,%xmm10 movdqa %xmm8,%xmm11 movdqa %xmm12,%xmm15 - paddd .sse_inc(%rip),%xmm12 + paddd L$sse_inc(%rip),%xmm12 movdqa %xmm12,%xmm14 - paddd .sse_inc(%rip),%xmm12 + paddd L$sse_inc(%rip),%xmm12 movdqa %xmm12,%xmm13 - paddd .sse_inc(%rip),%xmm12 - - movdqa %xmm4,48(%rbp) - movdqa %xmm8,64(%rbp) - movdqa %xmm12,96(%rbp) - movdqa %xmm13,112(%rbp) - movdqa %xmm14,128(%rbp) - movdqa %xmm15,144(%rbp) + paddd L$sse_inc(%rip),%xmm12 + + movdqa %xmm4,0+48(%rbp) + movdqa %xmm8,0+64(%rbp) + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) movq $10,%r10 -1: - movdqa %xmm8,80(%rbp) - movdqa .rol16(%rip),%xmm8 +L$seal_sse_init_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 @@ -2192,7 +2176,7 @@ _chacha20_poly1305_seal: .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 @@ -2201,7 +2185,7 @@ _chacha20_poly1305_seal: pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 - movdqa %xmm8,80(%rbp) + movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 @@ -2218,7 +2202,7 @@ _chacha20_poly1305_seal: psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 - movdqa .rol8(%rip),%xmm8 + movdqa L$rol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 @@ -2231,7 +2215,7 @@ _chacha20_poly1305_seal: .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 @@ -2240,7 +2224,7 @@ _chacha20_poly1305_seal: pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 - movdqa %xmm8,80(%rbp) + movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 @@ -2257,7 +2241,7 @@ _chacha20_poly1305_seal: psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 .byte 102,15,58,15,255,4 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,12 @@ -2270,8 +2254,8 @@ _chacha20_poly1305_seal: .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 - movdqa %xmm8,80(%rbp) - movdqa .rol16(%rip),%xmm8 + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 @@ -2284,7 +2268,7 @@ _chacha20_poly1305_seal: .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 @@ -2293,7 +2277,7 @@ _chacha20_poly1305_seal: pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 - movdqa %xmm8,80(%rbp) + movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 @@ -2310,7 +2294,7 @@ _chacha20_poly1305_seal: psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 - movdqa .rol8(%rip),%xmm8 + movdqa L$rol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 @@ -2323,7 +2307,7 @@ _chacha20_poly1305_seal: .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 @@ -2332,7 +2316,7 @@ _chacha20_poly1305_seal: pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 - movdqa %xmm8,80(%rbp) + movdqa 
%xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 @@ -2349,7 +2333,7 @@ _chacha20_poly1305_seal: psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 .byte 102,15,58,15,255,12 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,4 @@ -2364,28 +2348,28 @@ _chacha20_poly1305_seal: .byte 102,69,15,58,15,228,4 decq %r10 - jnz 1b - paddd .chacha20_consts(%rip),%xmm3 - paddd 48(%rbp),%xmm7 - paddd 64(%rbp),%xmm11 - paddd 144(%rbp),%xmm15 - paddd .chacha20_consts(%rip),%xmm2 - paddd 48(%rbp),%xmm6 - paddd 64(%rbp),%xmm10 - paddd 128(%rbp),%xmm14 - paddd .chacha20_consts(%rip),%xmm1 - paddd 48(%rbp),%xmm5 - paddd 64(%rbp),%xmm9 - paddd 112(%rbp),%xmm13 - paddd .chacha20_consts(%rip),%xmm0 - paddd 48(%rbp),%xmm4 - paddd 64(%rbp),%xmm8 - paddd 96(%rbp),%xmm12 - - - pand .clamp(%rip),%xmm3 - movdqa %xmm3,0(%rbp) - movdqa %xmm7,16(%rbp) + jnz L$seal_sse_init_rounds + paddd L$chacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + + pand L$clamp(%rip),%xmm3 + movdqa %xmm3,0+0(%rbp) + movdqa %xmm7,0+16(%rbp) movq %r8,%r8 call poly_hash_ad_internal @@ -2415,12 +2399,12 @@ _chacha20_poly1305_seal: movdqu %xmm15,48 + 64(%rdi) cmpq $192,%rbx - ja 1f + ja L$seal_sse_main_init movq $128,%rcx subq $128,%rbx leaq 128(%rsi),%rsi - jmp seal_sse_128_seal_hash -1: + jmp L$seal_sse_128_tail_hash +L$seal_sse_main_init: movdqu 0 + 128(%rsi),%xmm3 movdqu 16 + 128(%rsi),%xmm7 movdqu 32 + 128(%rsi),%xmm11 @@ -2440,16 +2424,16 @@ _chacha20_poly1305_seal: movq $2,%rcx movq $8,%r8 cmpq $64,%rbx - jbe seal_sse_tail_64 + jbe L$seal_sse_tail_64 cmpq $128,%rbx - jbe seal_sse_tail_128 + jbe L$seal_sse_tail_128 cmpq $192,%rbx - jbe seal_sse_tail_192 + jbe L$seal_sse_tail_192 -1: - movdqa .chacha20_consts(%rip),%xmm0 - movdqa 48(%rbp),%xmm4 - movdqa 64(%rbp),%xmm8 +L$seal_sse_main_loop: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 @@ -2459,22 +2443,23 @@ _chacha20_poly1305_seal: movdqa %xmm0,%xmm3 movdqa %xmm4,%xmm7 movdqa %xmm8,%xmm11 - movdqa 96(%rbp),%xmm15 - paddd .sse_inc(%rip),%xmm15 + movdqa 0+96(%rbp),%xmm15 + paddd L$sse_inc(%rip),%xmm15 movdqa %xmm15,%xmm14 - paddd .sse_inc(%rip),%xmm14 + paddd L$sse_inc(%rip),%xmm14 movdqa %xmm14,%xmm13 - paddd .sse_inc(%rip),%xmm13 + paddd L$sse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 - paddd .sse_inc(%rip),%xmm12 - movdqa %xmm12,96(%rbp) - movdqa %xmm13,112(%rbp) - movdqa %xmm14,128(%rbp) - movdqa %xmm15,144(%rbp) - -2: - movdqa %xmm8,80(%rbp) - movdqa .rol16(%rip),%xmm8 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + +.p2align 5 +L$seal_sse_main_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 @@ -2487,19 +2472,19 @@ _chacha20_poly1305_seal: .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 - 
addq 0(%rdi),%r10 + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 - movdqa %xmm8,80(%rbp) + movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 @@ -2516,17 +2501,17 @@ _chacha20_poly1305_seal: psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movdqa .rol8(%rip),%xmm8 + movdqa L$rol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 @@ -2539,26 +2524,26 @@ _chacha20_poly1305_seal: .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 paddd %xmm12,%xmm8 pxor %xmm11,%xmm7 pxor %xmm10,%xmm6 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 - movdqa %xmm8,80(%rbp) + movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 @@ -2575,7 +2560,7 @@ _chacha20_poly1305_seal: psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 imulq %r12,%r9 addq %r10,%r15 adcq %rdx,%r9 @@ -2591,8 +2576,8 @@ _chacha20_poly1305_seal: .byte 102,15,58,15,228,4 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,12 - movdqa %xmm8,80(%rbp) - movdqa .rol16(%rip),%xmm8 + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 @@ -2608,9 +2593,8 @@ _chacha20_poly1305_seal: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 @@ -2620,7 +2604,7 @@ _chacha20_poly1305_seal: .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 @@ -2629,7 +2613,7 @@ _chacha20_poly1305_seal: pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 - movdqa %xmm8,80(%rbp) + movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $20,%xmm8 pslld $32-20,%xmm7 @@ -2646,7 +2630,7 @@ _chacha20_poly1305_seal: psrld $20,%xmm8 pslld $32-20,%xmm4 pxor %xmm8,%xmm4 - movdqa .rol8(%rip),%xmm8 + movdqa L$rol8(%rip),%xmm8 paddd %xmm7,%xmm3 paddd %xmm6,%xmm2 paddd %xmm5,%xmm1 @@ -2659,7 +2643,7 @@ _chacha20_poly1305_seal: .byte 102,69,15,56,0,240 .byte 102,69,15,56,0,232 .byte 102,69,15,56,0,224 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 paddd %xmm15,%xmm11 paddd %xmm14,%xmm10 paddd %xmm13,%xmm9 @@ -2668,7 +2652,7 @@ _chacha20_poly1305_seal: pxor %xmm10,%xmm6 pxor %xmm9,%xmm5 pxor %xmm8,%xmm4 - movdqa %xmm8,80(%rbp) + movdqa %xmm8,0+80(%rbp) movdqa %xmm7,%xmm8 psrld $25,%xmm8 pslld $32-25,%xmm7 @@ -2685,7 +2669,7 @@ _chacha20_poly1305_seal: psrld $25,%xmm8 pslld $32-25,%xmm4 pxor %xmm8,%xmm4 - movdqa 80(%rbp),%xmm8 + movdqa 0+80(%rbp),%xmm8 .byte 102,15,58,15,255,12 .byte 102,69,15,58,15,219,8 .byte 102,69,15,58,15,255,4 @@ -2701,27 +2685,27 @@ _chacha20_poly1305_seal: leaq 16(%rdi),%rdi decq %r8 - jge 2b - addq 0(%rdi),%r10 + jge L$seal_sse_main_rounds + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq 
%rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -2737,35 +2721,34 @@ _chacha20_poly1305_seal: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi decq %rcx - jg 2b - paddd .chacha20_consts(%rip),%xmm3 - paddd 48(%rbp),%xmm7 - paddd 64(%rbp),%xmm11 - paddd 144(%rbp),%xmm15 - paddd .chacha20_consts(%rip),%xmm2 - paddd 48(%rbp),%xmm6 - paddd 64(%rbp),%xmm10 - paddd 128(%rbp),%xmm14 - paddd .chacha20_consts(%rip),%xmm1 - paddd 48(%rbp),%xmm5 - paddd 64(%rbp),%xmm9 - paddd 112(%rbp),%xmm13 - paddd .chacha20_consts(%rip),%xmm0 - paddd 48(%rbp),%xmm4 - paddd 64(%rbp),%xmm8 - paddd 96(%rbp),%xmm12 - - movdqa %xmm14,80(%rbp) - movdqa %xmm14,80(%rbp) + jg L$seal_sse_main_rounds + paddd L$chacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + movdqa %xmm14,0+80(%rbp) + movdqa %xmm14,0+80(%rbp) movdqu 0 + 0(%rsi),%xmm14 pxor %xmm3,%xmm14 movdqu %xmm14,0 + 0(%rdi) @@ -2779,7 +2762,7 @@ _chacha20_poly1305_seal: pxor %xmm15,%xmm14 movdqu %xmm14,48 + 0(%rdi) - movdqa 80(%rbp),%xmm14 + movdqa 0+80(%rbp),%xmm14 movdqu 0 + 64(%rsi),%xmm3 movdqu 16 + 64(%rsi),%xmm7 movdqu 32 + 64(%rsi),%xmm11 @@ -2806,13 +2789,13 @@ _chacha20_poly1305_seal: movdqu %xmm15,48 + 128(%rdi) cmpq $256,%rbx - ja 3f + ja L$seal_sse_main_loop_xor movq $192,%rcx subq $192,%rbx leaq 192(%rsi),%rsi - jmp seal_sse_128_seal_hash -3: + jmp L$seal_sse_128_tail_hash +L$seal_sse_main_loop_xor: movdqu 0 + 192(%rsi),%xmm3 movdqu 16 + 192(%rsi),%xmm7 movdqu 32 + 192(%rsi),%xmm11 @@ -2831,43 +2814,45 @@ _chacha20_poly1305_seal: movq $6,%rcx movq $4,%r8 cmpq $192,%rbx - jg 1b + jg L$seal_sse_main_loop movq %rbx,%rcx testq %rbx,%rbx - je seal_sse_128_seal_hash + je L$seal_sse_128_tail_hash movq $6,%rcx + cmpq $128,%rbx + ja L$seal_sse_tail_192 cmpq $64,%rbx - jg 3f - -seal_sse_tail_64: - movdqa .chacha20_consts(%rip),%xmm0 - movdqa 48(%rbp),%xmm4 - movdqa 64(%rbp),%xmm8 - movdqa 96(%rbp),%xmm12 - paddd .sse_inc(%rip),%xmm12 - movdqa %xmm12,96(%rbp) - -1: - addq 0(%rdi),%r10 + ja L$seal_sse_tail_128 + +L$seal_sse_tail_64: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa 0+96(%rbp),%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + +L$seal_sse_tail_64_rounds_and_x2hash: + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -2883,18 +2868,17 @@ seal_sse_tail_64: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - 
addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi -2: +L$seal_sse_tail_64_rounds_and_x1hash: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -2903,7 +2887,7 @@ seal_sse_tail_64: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -2915,7 +2899,7 @@ seal_sse_tail_64: .byte 102,69,15,58,15,228,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -2924,7 +2908,7 @@ seal_sse_tail_64: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -2934,26 +2918,26 @@ seal_sse_tail_64: .byte 102,15,58,15,228,12 .byte 102,69,15,58,15,192,8 .byte 102,69,15,58,15,228,4 - addq 0(%rdi),%r10 + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -2969,63 +2953,59 @@ seal_sse_tail_64: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi decq %rcx - jg 1b + jg L$seal_sse_tail_64_rounds_and_x2hash decq %r8 - jge 2b - paddd .chacha20_consts(%rip),%xmm0 - paddd 48(%rbp),%xmm4 - paddd 64(%rbp),%xmm8 - paddd 96(%rbp),%xmm12 - - jmp seal_sse_128_seal -3: - cmpq $128,%rbx - jg 3f - -seal_sse_tail_128: - movdqa .chacha20_consts(%rip),%xmm0 - movdqa 48(%rbp),%xmm4 - movdqa 64(%rbp),%xmm8 + jge L$seal_sse_tail_64_rounds_and_x1hash + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + jmp L$seal_sse_128_tail_xor + +L$seal_sse_tail_128: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 - movdqa 96(%rbp),%xmm13 - paddd .sse_inc(%rip),%xmm13 + movdqa 0+96(%rbp),%xmm13 + paddd L$sse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 - paddd .sse_inc(%rip),%xmm12 - movdqa %xmm12,96(%rbp) - movdqa %xmm13,112(%rbp) + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) -1: - addq 0(%rdi),%r10 +L$seal_sse_tail_128_rounds_and_x2hash: + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -3041,18 +3021,17 @@ seal_sse_tail_128: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi -2: 
+L$seal_sse_tail_128_rounds_and_x1hash: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -3061,7 +3040,7 @@ seal_sse_tail_128: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -3073,7 +3052,7 @@ seal_sse_tail_128: .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol16(%rip),%xmm13 + pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -3082,7 +3061,7 @@ seal_sse_tail_128: pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol8(%rip),%xmm13 + pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -3092,26 +3071,26 @@ seal_sse_tail_128: .byte 102,15,58,15,237,4 .byte 102,69,15,58,15,201,8 .byte 102,69,15,58,15,237,12 - addq 0(%rdi),%r10 + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -3127,15 +3106,14 @@ seal_sse_tail_128: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -3144,7 +3122,7 @@ seal_sse_tail_128: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -3156,7 +3134,7 @@ seal_sse_tail_128: .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol16(%rip),%xmm13 + pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -3165,7 +3143,7 @@ seal_sse_tail_128: pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol8(%rip),%xmm13 + pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -3178,17 +3156,17 @@ seal_sse_tail_128: leaq 16(%rdi),%rdi decq %rcx - jg 1b + jg L$seal_sse_tail_128_rounds_and_x2hash decq %r8 - jge 2b - paddd .chacha20_consts(%rip),%xmm1 - paddd 48(%rbp),%xmm5 - paddd 64(%rbp),%xmm9 - paddd 112(%rbp),%xmm13 - paddd .chacha20_consts(%rip),%xmm0 - paddd 48(%rbp),%xmm4 - paddd 64(%rbp),%xmm8 - paddd 96(%rbp),%xmm12 + jge L$seal_sse_tail_128_rounds_and_x1hash + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 movdqu 0 + 0(%rsi),%xmm3 movdqu 16 + 0(%rsi),%xmm7 movdqu 32 + 0(%rsi),%xmm11 @@ -3205,50 +3183,49 @@ seal_sse_tail_128: movq $64,%rcx subq $64,%rbx leaq 64(%rsi),%rsi - jmp seal_sse_128_seal_hash -3: + jmp L$seal_sse_128_tail_hash -seal_sse_tail_192: - movdqa .chacha20_consts(%rip),%xmm0 - movdqa 48(%rbp),%xmm4 - movdqa 64(%rbp),%xmm8 +L$seal_sse_tail_192: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm5 movdqa %xmm8,%xmm9 
movdqa %xmm0,%xmm2 movdqa %xmm4,%xmm6 movdqa %xmm8,%xmm10 - movdqa 96(%rbp),%xmm14 - paddd .sse_inc(%rip),%xmm14 + movdqa 0+96(%rbp),%xmm14 + paddd L$sse_inc(%rip),%xmm14 movdqa %xmm14,%xmm13 - paddd .sse_inc(%rip),%xmm13 + paddd L$sse_inc(%rip),%xmm13 movdqa %xmm13,%xmm12 - paddd .sse_inc(%rip),%xmm12 - movdqa %xmm12,96(%rbp) - movdqa %xmm13,112(%rbp) - movdqa %xmm14,128(%rbp) + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) -1: - addq 0(%rdi),%r10 +L$seal_sse_tail_192_rounds_and_x2hash: + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -3264,18 +3241,17 @@ seal_sse_tail_192: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi -2: +L$seal_sse_tail_192_rounds_and_x1hash: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -3284,7 +3260,7 @@ seal_sse_tail_192: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -3296,7 +3272,7 @@ seal_sse_tail_192: .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol16(%rip),%xmm13 + pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -3305,7 +3281,7 @@ seal_sse_tail_192: pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol8(%rip),%xmm13 + pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -3317,7 +3293,7 @@ seal_sse_tail_192: .byte 102,69,15,58,15,237,12 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol16(%rip),%xmm14 + pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 @@ -3326,7 +3302,7 @@ seal_sse_tail_192: pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol8(%rip),%xmm14 + pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 @@ -3336,26 +3312,26 @@ seal_sse_tail_192: .byte 102,15,58,15,246,4 .byte 102,69,15,58,15,210,8 .byte 102,69,15,58,15,246,12 - addq 0(%rdi),%r10 + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -3371,15 +3347,14 @@ seal_sse_tail_192: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -3388,7 +3363,7 @@ seal_sse_tail_192: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 
- pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -3400,7 +3375,7 @@ seal_sse_tail_192: .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol16(%rip),%xmm13 + pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -3409,7 +3384,7 @@ seal_sse_tail_192: pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol8(%rip),%xmm13 + pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -3421,7 +3396,7 @@ seal_sse_tail_192: .byte 102,69,15,58,15,237,4 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol16(%rip),%xmm14 + pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 @@ -3430,7 +3405,7 @@ seal_sse_tail_192: pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol8(%rip),%xmm14 + pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 @@ -3443,21 +3418,21 @@ seal_sse_tail_192: leaq 16(%rdi),%rdi decq %rcx - jg 1b + jg L$seal_sse_tail_192_rounds_and_x2hash decq %r8 - jge 2b - paddd .chacha20_consts(%rip),%xmm2 - paddd 48(%rbp),%xmm6 - paddd 64(%rbp),%xmm10 - paddd 128(%rbp),%xmm14 - paddd .chacha20_consts(%rip),%xmm1 - paddd 48(%rbp),%xmm5 - paddd 64(%rbp),%xmm9 - paddd 112(%rbp),%xmm13 - paddd .chacha20_consts(%rip),%xmm0 - paddd 48(%rbp),%xmm4 - paddd 64(%rbp),%xmm8 - paddd 96(%rbp),%xmm12 + jge L$seal_sse_tail_192_rounds_and_x1hash + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 movdqu 0 + 0(%rsi),%xmm3 movdqu 16 + 0(%rsi),%xmm7 movdqu 32 + 0(%rsi),%xmm11 @@ -3487,29 +3462,29 @@ seal_sse_tail_192: subq $128,%rbx leaq 128(%rsi),%rsi -seal_sse_128_seal_hash: +L$seal_sse_128_tail_hash: cmpq $16,%rcx - jb seal_sse_128_seal - addq 0(%rdi),%r10 + jb L$seal_sse_128_tail_xor + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -3525,20 +3500,19 @@ seal_sse_128_seal_hash: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 subq $16,%rcx leaq 16(%rdi),%rdi - jmp seal_sse_128_seal_hash + jmp L$seal_sse_128_tail_hash -seal_sse_128_seal: +L$seal_sse_128_tail_xor: cmpq $16,%rbx - jb seal_sse_tail_16 + jb L$seal_sse_tail_16 subq $16,%rbx movdqu 0(%rsi),%xmm3 @@ -3550,23 +3524,23 @@ seal_sse_128_seal: adcq $1,%r12 leaq 16(%rsi),%rsi leaq 16(%rdi),%rdi - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -3582,9 +3556,8 @@ seal_sse_128_seal: 
movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 @@ -3597,22 +3570,22 @@ seal_sse_128_seal: movdqa %xmm5,%xmm1 movdqa %xmm9,%xmm5 movdqa %xmm13,%xmm9 - jmp seal_sse_128_seal + jmp L$seal_sse_128_tail_xor -seal_sse_tail_16: +L$seal_sse_tail_16: testq %rbx,%rbx - jz process_blocks_of_extra_in + jz L$process_blocks_of_extra_in movq %rbx,%r8 movq %rbx,%rcx - leaq -1(%rsi,%rbx), %rsi + leaq -1(%rsi,%rbx,1),%rsi pxor %xmm15,%xmm15 -1: +L$seal_sse_tail_16_compose: pslldq $1,%xmm15 pinsrb $0,(%rsi),%xmm15 leaq -1(%rsi),%rsi decq %rcx - jne 1b + jne L$seal_sse_tail_16_compose pxor %xmm0,%xmm15 @@ -3620,12 +3593,12 @@ seal_sse_tail_16: movq %rbx,%rcx movdqu %xmm15,%xmm0 -2: +L$seal_sse_tail_16_extract: pextrb $0,%xmm0,(%rdi) psrldq $1,%xmm0 addq $1,%rdi subq $1,%rcx - jnz 2b + jnz L$seal_sse_tail_16_extract @@ -3634,23 +3607,23 @@ seal_sse_tail_16: - movq 288+32(%rsp),%r9 + movq 288 + 0 + 32(%rsp),%r9 movq 56(%r9),%r14 movq 48(%r9),%r13 testq %r14,%r14 - jz process_partial_block + jz L$process_partial_block movq $16,%r15 subq %rbx,%r15 cmpq %r15,%r14 - jge load_extra_in + jge L$load_extra_in movq %r14,%r15 -load_extra_in: +L$load_extra_in: - leaq -1(%r13,%r15), %rsi + leaq -1(%r13,%r15,1),%rsi addq %r15,%r13 @@ -3664,29 +3637,29 @@ load_extra_in: pxor %xmm11,%xmm11 -3: +L$load_extra_load_loop: pslldq $1,%xmm11 pinsrb $0,(%rsi),%xmm11 leaq -1(%rsi),%rsi subq $1,%r15 - jnz 3b + jnz L$load_extra_load_loop movq %rbx,%r15 -4: +L$load_extra_shift_loop: pslldq $1,%xmm11 subq $1,%r15 - jnz 4b + jnz L$load_extra_shift_loop - leaq .and_masks(%rip),%r15 + leaq L$and_masks(%rip),%r15 shlq $4,%rbx - pand -16(%r15,%rbx), %xmm15 + pand -16(%r15,%rbx,1),%xmm15 por %xmm11,%xmm15 @@ -3698,23 +3671,23 @@ load_extra_in: addq %r13,%r10 adcq %r14,%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -3730,44 +3703,43 @@ load_extra_in: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 -process_blocks_of_extra_in: +L$process_blocks_of_extra_in: - movq 288+32(%rsp),%r9 + movq 288+32+0 (%rsp),%r9 movq 48(%r9),%rsi movq 56(%r9),%r8 movq %r8,%rcx shrq $4,%r8 -5: +L$process_extra_hash_loop: jz process_extra_in_trailer - addq 0(%rsi),%r10 + addq 0+0(%rsi),%r10 adcq 8+0(%rsi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -3783,57 +3755,55 @@ process_blocks_of_extra_in: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rsi),%rsi subq $1,%r8 - jmp 5b - + jmp L$process_extra_hash_loop process_extra_in_trailer: andq $15,%rcx movq %rcx,%rbx - jz 
do_length_block - leaq -1(%rsi,%rcx), %rsi + jz L$do_length_block + leaq -1(%rsi,%rcx,1),%rsi -6: +L$process_extra_in_trailer_load: pslldq $1,%xmm15 pinsrb $0,(%rsi),%xmm15 leaq -1(%rsi),%rsi subq $1,%rcx - jnz 6b + jnz L$process_extra_in_trailer_load -process_partial_block: +L$process_partial_block: - leaq .and_masks(%rip),%r15 + leaq L$and_masks(%rip),%r15 shlq $4,%rbx - pand -16(%r15,%rbx), %xmm15 + pand -16(%r15,%rbx,1),%xmm15 .byte 102,77,15,126,253 pextrq $1,%xmm15,%r14 addq %r13,%r10 adcq %r14,%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -3849,35 +3819,34 @@ process_partial_block: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 -do_length_block: - addq 32(%rbp),%r10 - adcq 8+32(%rbp),%r11 +L$do_length_block: + addq 0+0+32(%rbp),%r10 + adcq 8+0+32(%rbp),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -3893,9 +3862,8 @@ do_length_block: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 @@ -3911,16 +3879,17 @@ do_length_block: cmovcq %r14,%r11 cmovcq %r15,%r12 - addq 0+16(%rbp),%r10 - adcq 8+16(%rbp),%r11 + addq 0+0+16(%rbp),%r10 + adcq 8+0+16(%rbp),%r11 + + + addq $288 + 0 + 32,%rsp - addq $288 + 32,%rsp popq %r9 - movq %r10,0(%r9) + movq %r10,(%r9) movq %r11,8(%r9) - popq %r15 popq %r14 @@ -3935,9 +3904,9 @@ do_length_block: .byte 0xf3,0xc3 +L$seal_sse_128: -seal_sse_128: - movdqu .chacha20_consts(%rip),%xmm0 + movdqu L$chacha20_consts(%rip),%xmm0 movdqa %xmm0,%xmm1 movdqa %xmm0,%xmm2 movdqu 0(%r9),%xmm4 @@ -3948,17 +3917,18 @@ seal_sse_128: movdqa %xmm8,%xmm10 movdqu 32(%r9),%xmm14 movdqa %xmm14,%xmm12 - paddd .sse_inc(%rip),%xmm12 + paddd L$sse_inc(%rip),%xmm12 movdqa %xmm12,%xmm13 - paddd .sse_inc(%rip),%xmm13 + paddd L$sse_inc(%rip),%xmm13 movdqa %xmm4,%xmm7 movdqa %xmm8,%xmm11 movdqa %xmm12,%xmm15 movq $10,%r10 -1: + +L$seal_sse_128_rounds: paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -3967,7 +3937,7 @@ seal_sse_128: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -3979,7 +3949,7 @@ seal_sse_128: .byte 102,69,15,58,15,228,12 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol16(%rip),%xmm13 + pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -3988,7 +3958,7 @@ seal_sse_128: pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol8(%rip),%xmm13 + pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -4000,7 +3970,7 
@@ seal_sse_128: .byte 102,69,15,58,15,237,12 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol16(%rip),%xmm14 + pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 @@ -4009,7 +3979,7 @@ seal_sse_128: pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol8(%rip),%xmm14 + pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 @@ -4021,7 +3991,7 @@ seal_sse_128: .byte 102,69,15,58,15,246,12 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol16(%rip),%xmm12 + pshufb L$rol16(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -4030,7 +4000,7 @@ seal_sse_128: pxor %xmm3,%xmm4 paddd %xmm4,%xmm0 pxor %xmm0,%xmm12 - pshufb .rol8(%rip),%xmm12 + pshufb L$rol8(%rip),%xmm12 paddd %xmm12,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm3 @@ -4042,7 +4012,7 @@ seal_sse_128: .byte 102,69,15,58,15,228,4 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol16(%rip),%xmm13 + pshufb L$rol16(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -4051,7 +4021,7 @@ seal_sse_128: pxor %xmm3,%xmm5 paddd %xmm5,%xmm1 pxor %xmm1,%xmm13 - pshufb .rol8(%rip),%xmm13 + pshufb L$rol8(%rip),%xmm13 paddd %xmm13,%xmm9 pxor %xmm9,%xmm5 movdqa %xmm5,%xmm3 @@ -4063,7 +4033,7 @@ seal_sse_128: .byte 102,69,15,58,15,237,4 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol16(%rip),%xmm14 + pshufb L$rol16(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 @@ -4072,7 +4042,7 @@ seal_sse_128: pxor %xmm3,%xmm6 paddd %xmm6,%xmm2 pxor %xmm2,%xmm14 - pshufb .rol8(%rip),%xmm14 + pshufb L$rol8(%rip),%xmm14 paddd %xmm14,%xmm10 pxor %xmm10,%xmm6 movdqa %xmm6,%xmm3 @@ -4084,51 +4054,64 @@ seal_sse_128: .byte 102,69,15,58,15,246,4 decq %r10 - jnz 1b - paddd .chacha20_consts(%rip),%xmm0 - paddd .chacha20_consts(%rip),%xmm1 - paddd .chacha20_consts(%rip),%xmm2 + jnz L$seal_sse_128_rounds + paddd L$chacha20_consts(%rip),%xmm0 + paddd L$chacha20_consts(%rip),%xmm1 + paddd L$chacha20_consts(%rip),%xmm2 paddd %xmm7,%xmm4 paddd %xmm7,%xmm5 paddd %xmm7,%xmm6 paddd %xmm11,%xmm8 paddd %xmm11,%xmm9 paddd %xmm15,%xmm12 - paddd .sse_inc(%rip),%xmm15 + paddd L$sse_inc(%rip),%xmm15 paddd %xmm15,%xmm13 - pand .clamp(%rip),%xmm2 - movdqa %xmm2,0(%rbp) - movdqa %xmm6,16(%rbp) + pand L$clamp(%rip),%xmm2 + movdqa %xmm2,0+0(%rbp) + movdqa %xmm6,0+16(%rbp) movq %r8,%r8 call poly_hash_ad_internal - jmp seal_sse_128_seal + jmp L$seal_sse_128_tail_xor + .p2align 6 chacha20_poly1305_open_avx2: + + + + + + + + + + + + vzeroupper - vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa L$chacha20_consts(%rip),%ymm0 vbroadcasti128 0(%r9),%ymm4 vbroadcasti128 16(%r9),%ymm8 vbroadcasti128 32(%r9),%ymm12 - vpaddd .avx2_init(%rip),%ymm12,%ymm12 + vpaddd L$avx2_init(%rip),%ymm12,%ymm12 cmpq $192,%rbx - jbe open_avx2_192 + jbe L$open_avx2_192 cmpq $320,%rbx - jbe open_avx2_320 + jbe L$open_avx2_320 - vmovdqa %ymm4,64(%rbp) - vmovdqa %ymm8,96(%rbp) - vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm4,0+64(%rbp) + vmovdqa %ymm8,0+96(%rbp) + vmovdqa %ymm12,0+160(%rbp) movq $10,%r10 -1: +L$open_avx2_init_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -4136,7 +4119,7 @@ chacha20_poly1305_open_avx2: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -4147,7 
+4130,7 @@ chacha20_poly1305_open_avx2: vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -4155,7 +4138,7 @@ chacha20_poly1305_open_avx2: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -4166,45 +4149,45 @@ chacha20_poly1305_open_avx2: vpalignr $12,%ymm4,%ymm4,%ymm4 decq %r10 - jne 1b - vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 64(%rbp),%ymm4,%ymm4 - vpaddd 96(%rbp),%ymm8,%ymm8 - vpaddd 160(%rbp),%ymm12,%ymm12 + jne L$open_avx2_init_rounds + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - vpand .clamp(%rip),%ymm3,%ymm3 - vmovdqa %ymm3,0(%rbp) + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 movq %r8,%r8 call poly_hash_ad_internal - xorq %rcx,%rcx -1: - addq 0(%rsi,%rcx), %r10 - adcq 8+0(%rsi,%rcx), %r11 + xorq %rcx,%rcx +L$open_avx2_init_hash: + addq 0+0(%rsi,%rcx,1),%r10 + adcq 8+0(%rsi,%rcx,1),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -4220,31 +4203,31 @@ chacha20_poly1305_open_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 addq $16,%rcx cmpq $64,%rcx - jne 1b + jne L$open_avx2_init_hash vpxor 0(%rsi),%ymm0,%ymm0 vpxor 32(%rsi),%ymm4,%ymm4 + vmovdqu %ymm0,0(%rdi) vmovdqu %ymm4,32(%rdi) leaq 64(%rsi),%rsi leaq 64(%rdi),%rdi subq $64,%rbx -1: +L$open_avx2_main_loop: cmpq $512,%rbx - jb 3f - vmovdqa .chacha20_consts(%rip),%ymm0 - vmovdqa 64(%rbp),%ymm4 - vmovdqa 96(%rbp),%ymm8 + jb L$open_avx2_main_loop_done + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 @@ -4254,23 +4237,23 @@ chacha20_poly1305_open_avx2: vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 - vmovdqa .avx2_inc(%rip),%ymm12 - vpaddd 160(%rbp),%ymm12,%ymm15 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 vpaddd %ymm15,%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm15,256(%rbp) - vmovdqa %ymm14,224(%rbp) - vmovdqa %ymm13,192(%rbp) - vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) xorq %rcx,%rcx -2: - addq 0*8(%rsi,%rcx), %r10 - adcq 8+0*8(%rsi,%rcx), %r11 +L$open_avx2_main_loop_rounds: + addq 0+0(%rsi,%rcx,1),%r10 + adcq 8+0(%rsi,%rcx,1),%r11 adcq $1,%r12 - vmovdqa %ymm8,128(%rbp) - vmovdqa .rol16(%rip),%ymm8 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -4279,7 +4262,7 @@ chacha20_poly1305_open_avx2: vpxor 
%ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 - movq 0+0(%rbp),%rdx + movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx @@ -4290,23 +4273,22 @@ chacha20_poly1305_open_avx2: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 - movq 8+0(%rbp),%rdx + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx - vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 @@ -4314,18 +4296,19 @@ chacha20_poly1305_open_avx2: vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 addq %rax,%r15 adcq %rdx,%r9 - vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .rol8(%rip),%ymm8 + vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 @@ -4335,13 +4318,11 @@ chacha20_poly1305_open_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 - vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 @@ -4349,27 +4330,26 @@ chacha20_poly1305_open_avx2: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 - addq 2*8(%rsi,%rcx), %r10 - adcq 8+2*8(%rsi,%rcx), %r11 - adcq $1,%r12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 + addq 0+16(%rsi,%rcx,1),%r10 + adcq 8+16(%rsi,%rcx,1),%r11 + adcq $1,%r12 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - movq 0+0(%rbp),%rdx + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - vmovdqa %ymm8,128(%rbp) - vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 @@ -4381,28 +4361,28 @@ chacha20_poly1305_open_avx2: vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 128(%rbp),%ymm8 + vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 vpalignr $4,%ymm6,%ymm6,%ymm6 - movq 8+0(%rbp),%rdx + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $4,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $4,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,128(%rbp) - vmovdqa .rol16(%rip),%ymm8 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd 
%ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -4411,17 +4391,19 @@ chacha20_poly1305_open_avx2: vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 - addq %rax,%r15 - adcq %rdx,%r9 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 + addq %rax,%r15 + adcq %rdx,%r9 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 @@ -4431,35 +4413,31 @@ chacha20_poly1305_open_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 - addq 4*8(%rsi,%rcx), %r10 - adcq 8+4*8(%rsi,%rcx), %r11 - adcq $1,%r12 - - leaq 48(%rcx),%rcx vpsrld $20,%ymm6,%ymm8 vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 + addq 0+32(%rsi,%rcx,1),%r10 + adcq 8+32(%rsi,%rcx,1),%r11 + adcq $1,%r12 + + leaq 48(%rcx),%rcx vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .rol8(%rip),%ymm8 + vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -4468,49 +4446,48 @@ chacha20_poly1305_open_avx2: vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 - movq 0+0(%rbp),%rdx + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - movq 8+0(%rbp),%rdx + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 - addq %rax,%r15 - adcq %rdx,%r9 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 + addq %rax,%r15 + adcq %rdx,%r9 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 128(%rbp),%ymm8 + vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 @@ -4518,6 +4495,10 @@ chacha20_poly1305_open_avx2: vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 @@ -4527,39 +4508,34 @@ 
chacha20_poly1305_open_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm12,%ymm12,%ymm12 cmpq $60*8,%rcx - jne 2b - vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 - vpaddd 64(%rbp),%ymm7,%ymm7 - vpaddd 96(%rbp),%ymm11,%ymm11 - vpaddd 256(%rbp),%ymm15,%ymm15 - vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 64(%rbp),%ymm6,%ymm6 - vpaddd 96(%rbp),%ymm10,%ymm10 - vpaddd 224(%rbp),%ymm14,%ymm14 - vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpaddd 96(%rbp),%ymm9,%ymm9 - vpaddd 192(%rbp),%ymm13,%ymm13 - vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 64(%rbp),%ymm4,%ymm4 - vpaddd 96(%rbp),%ymm8,%ymm8 - vpaddd 160(%rbp),%ymm12,%ymm12 - - vmovdqa %ymm0,128(%rbp) - addq 60*8(%rsi),%r10 + jne L$open_avx2_main_loop_rounds + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + addq 0+60*8(%rsi),%r10 adcq 8+60*8(%rsi),%r11 adcq $1,%r12 vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 @@ -4575,24 +4551,24 @@ chacha20_poly1305_open_avx2: vmovdqu %ymm7,64+0(%rdi) vmovdqu %ymm11,96+0(%rdi) - vmovdqa 128(%rbp),%ymm0 - movq 0+0(%rbp),%rax + vmovdqa 0+128(%rbp),%ymm0 + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -4608,9 +4584,8 @@ chacha20_poly1305_open_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 @@ -4626,7 +4601,7 @@ chacha20_poly1305_open_avx2: vmovdqu %ymm2,32+128(%rdi) vmovdqu %ymm6,64+128(%rdi) vmovdqu %ymm10,96+128(%rdi) - addq 60*8+16(%rsi),%r10 + addq 0+60*8+16(%rsi),%r10 adcq 8+60*8+16(%rsi),%r11 adcq $1,%r12 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 @@ -4641,23 +4616,23 @@ chacha20_poly1305_open_avx2: vmovdqu %ymm1,32+256(%rdi) vmovdqu %ymm5,64+256(%rdi) vmovdqu %ymm9,96+256(%rdi) - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -4673,9 +4648,8 @@ chacha20_poly1305_open_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 @@ -4695,47 +4669,51 @@ 
chacha20_poly1305_open_avx2: leaq 512(%rsi),%rsi leaq 512(%rdi),%rdi subq $512,%rbx - jmp 1b -3: + jmp L$open_avx2_main_loop +L$open_avx2_main_loop_done: testq %rbx,%rbx vzeroupper - je open_sse_finalize -3: + je L$open_sse_finalize + + cmpq $384,%rbx + ja L$open_avx2_tail_512 + cmpq $256,%rbx + ja L$open_avx2_tail_384 cmpq $128,%rbx - ja 3f - vmovdqa .chacha20_consts(%rip),%ymm0 - vmovdqa 64(%rbp),%ymm4 - vmovdqa 96(%rbp),%ymm8 - vmovdqa .avx2_inc(%rip),%ymm12 - vpaddd 160(%rbp),%ymm12,%ymm12 - vmovdqa %ymm12,160(%rbp) + ja L$open_avx2_tail_256 + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) xorq %r8,%r8 movq %rbx,%rcx andq $-16,%rcx testq %rcx,%rcx - je 2f -1: - addq 0*8(%rsi,%r8), %r10 - adcq 8+0*8(%rsi,%r8), %r11 + je L$open_avx2_tail_128_rounds +L$open_avx2_tail_128_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -4751,18 +4729,17 @@ chacha20_poly1305_open_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 -2: +L$open_avx2_tail_128_rounds: addq $16,%r8 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -4770,7 +4747,7 @@ chacha20_poly1305_open_avx2: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -4781,7 +4758,7 @@ chacha20_poly1305_open_avx2: vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -4789,7 +4766,7 @@ chacha20_poly1305_open_avx2: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -4800,36 +4777,35 @@ chacha20_poly1305_open_avx2: vpalignr $12,%ymm4,%ymm4,%ymm4 cmpq %rcx,%r8 - jb 1b + jb L$open_avx2_tail_128_rounds_and_x1hash cmpq $160,%r8 - jne 2b - vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 64(%rbp),%ymm4,%ymm4 - vpaddd 96(%rbp),%ymm8,%ymm8 - vpaddd 160(%rbp),%ymm12,%ymm12 + jne L$open_avx2_tail_128_rounds + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 - jmp open_avx2_tail_loop -3: - cmpq $256,%rbx - ja 3f - vmovdqa .chacha20_consts(%rip),%ymm0 - vmovdqa 64(%rbp),%ymm4 - vmovdqa 96(%rbp),%ymm8 + jmp L$open_avx2_tail_128_xor + +L$open_avx2_tail_256: 
+ vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 - vmovdqa .avx2_inc(%rip),%ymm12 - vpaddd 160(%rbp),%ymm12,%ymm13 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm12,160(%rbp) - vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) - movq %rbx,128(%rbp) + movq %rbx,0+128(%rbp) movq %rbx,%rcx subq $128,%rcx shrq $4,%rcx @@ -4838,18 +4814,18 @@ chacha20_poly1305_open_avx2: cmovgq %r8,%rcx movq %rsi,%rbx xorq %r8,%r8 -1: - addq 0(%rbx),%r10 +L$open_avx2_tail_256_rounds_and_x1hash: + addq 0+0(%rbx),%r10 adcq 8+0(%rbx),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rdx + movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rdx + movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 @@ -4867,18 +4843,17 @@ chacha20_poly1305_open_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rbx),%rbx -2: +L$open_avx2_tail_256_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -4886,7 +4861,7 @@ chacha20_poly1305_open_avx2: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -4897,7 +4872,7 @@ chacha20_poly1305_open_avx2: vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol16(%rip),%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 @@ -4905,7 +4880,7 @@ chacha20_poly1305_open_avx2: vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol8(%rip),%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 @@ -4918,7 +4893,7 @@ chacha20_poly1305_open_avx2: incq %r8 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -4926,7 +4901,7 @@ chacha20_poly1305_open_avx2: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -4937,7 +4912,7 @@ chacha20_poly1305_open_avx2: vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol16(%rip),%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 @@ -4945,7 +4920,7 @@ chacha20_poly1305_open_avx2: vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol8(%rip),%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 @@ -4956,7 +4931,7 @@ chacha20_poly1305_open_avx2: vpalignr $12,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol16(%rip),%ymm14,%ymm14 + vpshufb 
L$rol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 @@ -4964,7 +4939,7 @@ chacha20_poly1305_open_avx2: vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol8(%rip),%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 @@ -4975,28 +4950,28 @@ chacha20_poly1305_open_avx2: vpalignr $12,%ymm6,%ymm6,%ymm6 cmpq %rcx,%r8 - jb 1b + jb L$open_avx2_tail_256_rounds_and_x1hash cmpq $10,%r8 - jne 2b + jne L$open_avx2_tail_256_rounds movq %rbx,%r8 subq %rsi,%rbx movq %rbx,%rcx - movq 128(%rbp),%rbx -1: + movq 0+128(%rbp),%rbx +L$open_avx2_tail_256_hash: addq $16,%rcx cmpq %rbx,%rcx - jg 1f - addq 0(%r8),%r10 + jg L$open_avx2_tail_256_done + addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rdx + movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rdx + movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 @@ -5014,24 +4989,23 @@ chacha20_poly1305_open_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%r8),%r8 - jmp 1b -1: - vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpaddd 96(%rbp),%ymm9,%ymm9 - vpaddd 192(%rbp),%ymm13,%ymm13 - vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 64(%rbp),%ymm4,%ymm4 - vpaddd 96(%rbp),%ymm8,%ymm8 - vpaddd 160(%rbp),%ymm12,%ymm12 + jmp L$open_avx2_tail_256_hash +L$open_avx2_tail_256_done: + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 @@ -5053,28 +5027,27 @@ chacha20_poly1305_open_avx2: leaq 128(%rsi),%rsi leaq 128(%rdi),%rdi subq $128,%rbx - jmp open_avx2_tail_loop -3: - cmpq $384,%rbx - ja 3f - vmovdqa .chacha20_consts(%rip),%ymm0 - vmovdqa 64(%rbp),%ymm4 - vmovdqa 96(%rbp),%ymm8 + jmp L$open_avx2_tail_128_xor + +L$open_avx2_tail_384: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm10 - vmovdqa .avx2_inc(%rip),%ymm12 - vpaddd 160(%rbp),%ymm12,%ymm14 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm12,160(%rbp) - vmovdqa %ymm13,192(%rbp) - vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) - movq %rbx,128(%rbp) + movq %rbx,0+128(%rbp) movq %rbx,%rcx subq $256,%rcx shrq $4,%rcx @@ -5084,18 +5057,18 @@ chacha20_poly1305_open_avx2: cmovgq %r8,%rcx movq %rsi,%rbx xorq %r8,%r8 -1: - addq 0(%rbx),%r10 +L$open_avx2_tail_384_rounds_and_x2hash: + addq 0+0(%rbx),%r10 adcq 8+0(%rbx),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rdx + movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rdx + movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 @@ -5113,18 +5086,17 @@ 
chacha20_poly1305_open_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rbx),%rbx -2: +L$open_avx2_tail_384_rounds_and_x1hash: vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol16(%rip),%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 @@ -5132,7 +5104,7 @@ chacha20_poly1305_open_avx2: vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol8(%rip),%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 @@ -5143,7 +5115,7 @@ chacha20_poly1305_open_avx2: vpalignr $4,%ymm6,%ymm6,%ymm6 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol16(%rip),%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 @@ -5151,7 +5123,7 @@ chacha20_poly1305_open_avx2: vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol8(%rip),%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 @@ -5162,7 +5134,7 @@ chacha20_poly1305_open_avx2: vpalignr $4,%ymm5,%ymm5,%ymm5 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -5170,7 +5142,7 @@ chacha20_poly1305_open_avx2: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -5179,26 +5151,26 @@ chacha20_poly1305_open_avx2: vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 - addq 0(%rbx),%r10 + addq 0+0(%rbx),%r10 adcq 8+0(%rbx),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -5214,9 +5186,8 @@ chacha20_poly1305_open_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 @@ -5225,7 +5196,7 @@ chacha20_poly1305_open_avx2: incq %r8 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol16(%rip),%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 @@ -5233,7 +5204,7 @@ chacha20_poly1305_open_avx2: vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol8(%rip),%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 @@ -5244,7 +5215,7 @@ chacha20_poly1305_open_avx2: vpalignr $12,%ymm6,%ymm6,%ymm6 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol16(%rip),%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 @@ -5252,7 +5223,7 @@ chacha20_poly1305_open_avx2: vpxor 
%ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol8(%rip),%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 @@ -5263,7 +5234,7 @@ chacha20_poly1305_open_avx2: vpalignr $12,%ymm5,%ymm5,%ymm5 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -5271,7 +5242,7 @@ chacha20_poly1305_open_avx2: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -5282,28 +5253,28 @@ chacha20_poly1305_open_avx2: vpalignr $12,%ymm4,%ymm4,%ymm4 cmpq %rcx,%r8 - jb 1b + jb L$open_avx2_tail_384_rounds_and_x2hash cmpq $10,%r8 - jne 2b + jne L$open_avx2_tail_384_rounds_and_x1hash movq %rbx,%r8 subq %rsi,%rbx movq %rbx,%rcx - movq 128(%rbp),%rbx -1: + movq 0+128(%rbp),%rbx +L$open_avx2_384_tail_hash: addq $16,%rcx cmpq %rbx,%rcx - jg 1f - addq 0(%r8),%r10 + jg L$open_avx2_384_tail_done + addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rdx + movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rdx + movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 @@ -5321,28 +5292,27 @@ chacha20_poly1305_open_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%r8),%r8 - jmp 1b -1: - vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 64(%rbp),%ymm6,%ymm6 - vpaddd 96(%rbp),%ymm10,%ymm10 - vpaddd 224(%rbp),%ymm14,%ymm14 - vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpaddd 96(%rbp),%ymm9,%ymm9 - vpaddd 192(%rbp),%ymm13,%ymm13 - vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 64(%rbp),%ymm4,%ymm4 - vpaddd 96(%rbp),%ymm8,%ymm8 - vpaddd 160(%rbp),%ymm12,%ymm12 + jmp L$open_avx2_384_tail_hash +L$open_avx2_384_tail_done: + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 @@ -5376,11 +5346,12 @@ chacha20_poly1305_open_avx2: leaq 256(%rsi),%rsi leaq 256(%rdi),%rdi subq $256,%rbx - jmp open_avx2_tail_loop -3: - vmovdqa .chacha20_consts(%rip),%ymm0 - vmovdqa 64(%rbp),%ymm4 - vmovdqa 96(%rbp),%ymm8 + jmp L$open_avx2_tail_128_xor + +L$open_avx2_tail_512: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 @@ -5390,39 +5361,39 @@ chacha20_poly1305_open_avx2: vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 - vmovdqa .avx2_inc(%rip),%ymm12 - vpaddd 160(%rbp),%ymm12,%ymm15 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 vpaddd %ymm15,%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa 
%ymm15,256(%rbp) - vmovdqa %ymm14,224(%rbp) - vmovdqa %ymm13,192(%rbp) - vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) xorq %rcx,%rcx movq %rsi,%r8 -1: - addq 0(%r8),%r10 +L$open_avx2_tail_512_rounds_and_x2hash: + addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -5438,17 +5409,16 @@ chacha20_poly1305_open_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%r8),%r8 -2: - vmovdqa %ymm8,128(%rbp) - vmovdqa .rol16(%rip),%ymm8 +L$open_avx2_tail_512_rounds_and_x1hash: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -5461,16 +5431,15 @@ chacha20_poly1305_open_avx2: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 @@ -5483,18 +5452,19 @@ chacha20_poly1305_open_avx2: vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .rol8(%rip),%ymm8 - addq 0(%r8),%r10 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rdx + movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rdx + movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 @@ -5512,13 +5482,11 @@ chacha20_poly1305_open_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 - vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 @@ -5530,16 +5498,15 @@ chacha20_poly1305_open_avx2: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 @@ -5552,7 +5519,7 @@ chacha20_poly1305_open_avx2: vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 128(%rbp),%ymm8 + vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 @@ -5565,18 +5532,20 @@ chacha20_poly1305_open_avx2: vpalignr $4,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr 
$12,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,128(%rbp) - addq 16(%r8),%r10 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + addq 0+16(%r8),%r10 adcq 8+16(%r8),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rdx + movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rdx + movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 @@ -5594,16 +5563,13 @@ chacha20_poly1305_open_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 32(%r8),%r8 - vmovdqa .rol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 @@ -5615,16 +5581,15 @@ chacha20_poly1305_open_avx2: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 @@ -5637,7 +5602,7 @@ chacha20_poly1305_open_avx2: vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .rol8(%rip),%ymm8 + vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -5650,16 +5615,15 @@ chacha20_poly1305_open_avx2: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 @@ -5672,7 +5636,7 @@ chacha20_poly1305_open_avx2: vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 128(%rbp),%ymm8 + vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 @@ -5688,26 +5652,26 @@ chacha20_poly1305_open_avx2: incq %rcx cmpq $4,%rcx - jl 1b + jl L$open_avx2_tail_512_rounds_and_x2hash cmpq $10,%rcx - jne 2b + jne L$open_avx2_tail_512_rounds_and_x1hash movq %rbx,%rcx subq $384,%rcx andq $-16,%rcx -1: +L$open_avx2_tail_512_hash: testq %rcx,%rcx - je 1f - addq 0(%r8),%r10 + je L$open_avx2_tail_512_done + addq 0+0(%r8),%r10 adcq 8+0(%r8),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rdx + movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rdx + movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 @@ -5725,35 +5689,34 @@ chacha20_poly1305_open_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%r8),%r8 subq $16,%rcx - jmp 1b -1: - vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 - vpaddd 64(%rbp),%ymm7,%ymm7 - vpaddd 96(%rbp),%ymm11,%ymm11 - vpaddd 256(%rbp),%ymm15,%ymm15 - vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 64(%rbp),%ymm6,%ymm6 
- vpaddd 96(%rbp),%ymm10,%ymm10 - vpaddd 224(%rbp),%ymm14,%ymm14 - vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpaddd 96(%rbp),%ymm9,%ymm9 - vpaddd 192(%rbp),%ymm13,%ymm13 - vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 64(%rbp),%ymm4,%ymm4 - vpaddd 96(%rbp),%ymm8,%ymm8 - vpaddd 160(%rbp),%ymm12,%ymm12 - - vmovdqa %ymm0,128(%rbp) + jmp L$open_avx2_tail_512_hash +L$open_avx2_tail_512_done: + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 @@ -5767,7 +5730,7 @@ chacha20_poly1305_open_avx2: vmovdqu %ymm7,64+0(%rdi) vmovdqu %ymm11,96+0(%rdi) - vmovdqa 128(%rbp),%ymm0 + vmovdqa 0+128(%rbp),%ymm0 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 @@ -5801,9 +5764,9 @@ chacha20_poly1305_open_avx2: leaq 384(%rsi),%rsi leaq 384(%rdi),%rdi subq $384,%rbx -open_avx2_tail_loop: +L$open_avx2_tail_128_xor: cmpq $32,%rbx - jb open_avx2_tail + jb L$open_avx2_tail_32_xor subq $32,%rbx vpxor (%rsi),%ymm0,%ymm0 vmovdqu %ymm0,(%rdi) @@ -5812,11 +5775,11 @@ open_avx2_tail_loop: vmovdqa %ymm4,%ymm0 vmovdqa %ymm8,%ymm4 vmovdqa %ymm12,%ymm8 - jmp open_avx2_tail_loop -open_avx2_tail: + jmp L$open_avx2_tail_128_xor +L$open_avx2_tail_32_xor: cmpq $16,%rbx vmovdqa %xmm0,%xmm1 - jb 1f + jb L$open_avx2_exit subq $16,%rbx vpxor (%rsi),%xmm0,%xmm1 @@ -5825,25 +5788,25 @@ open_avx2_tail: leaq 16(%rdi),%rdi vperm2i128 $0x11,%ymm0,%ymm0,%ymm0 vmovdqa %xmm0,%xmm1 -1: +L$open_avx2_exit: vzeroupper - jmp open_sse_tail_16 + jmp L$open_sse_tail_16 -open_avx2_192: +L$open_avx2_192: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm5 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm9 vmovdqa %ymm8,%ymm10 - vpaddd .avx2_inc(%rip),%ymm12,%ymm13 + vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 vmovdqa %ymm12,%ymm11 vmovdqa %ymm13,%ymm15 movq $10,%r10 -1: +L$open_avx2_192_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -5851,7 +5814,7 @@ open_avx2_192: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -5862,7 +5825,7 @@ open_avx2_192: vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol16(%rip),%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 @@ -5870,7 +5833,7 @@ open_avx2_192: vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol8(%rip),%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 @@ -5881,7 +5844,7 @@ open_avx2_192: vpalignr 
$4,%ymm5,%ymm5,%ymm5 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -5889,7 +5852,7 @@ open_avx2_192: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -5900,7 +5863,7 @@ open_avx2_192: vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol16(%rip),%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 @@ -5908,7 +5871,7 @@ open_avx2_192: vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol8(%rip),%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 @@ -5919,7 +5882,7 @@ open_avx2_192: vpalignr $12,%ymm5,%ymm5,%ymm5 decq %r10 - jne 1b + jne L$open_avx2_192_rounds vpaddd %ymm2,%ymm0,%ymm0 vpaddd %ymm2,%ymm1,%ymm1 vpaddd %ymm6,%ymm4,%ymm4 @@ -5930,8 +5893,8 @@ open_avx2_192: vpaddd %ymm15,%ymm13,%ymm13 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - vpand .clamp(%rip),%ymm3,%ymm3 - vmovdqa %ymm3,0(%rbp) + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 @@ -5939,33 +5902,33 @@ open_avx2_192: vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 -open_avx2_short: +L$open_avx2_short: movq %r8,%r8 call poly_hash_ad_internal -open_avx2_hash_and_xor_loop: +L$open_avx2_short_hash_and_xor_loop: cmpq $32,%rbx - jb open_avx2_short_tail_32 + jb L$open_avx2_short_tail_32 subq $32,%rbx - addq 0(%rsi),%r10 + addq 0+0(%rsi),%r10 adcq 8+0(%rsi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -5981,32 +5944,31 @@ open_avx2_hash_and_xor_loop: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 - addq 16(%rsi),%r10 + addq 0+16(%rsi),%r10 adcq 8+16(%rsi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -6022,9 +5984,8 @@ open_avx2_hash_and_xor_loop: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 @@ -6044,32 +6005,32 @@ open_avx2_hash_and_xor_loop: vmovdqa %ymm13,%ymm9 vmovdqa %ymm2,%ymm13 vmovdqa %ymm6,%ymm2 - jmp open_avx2_hash_and_xor_loop -open_avx2_short_tail_32: + jmp L$open_avx2_short_hash_and_xor_loop +L$open_avx2_short_tail_32: cmpq $16,%rbx vmovdqa %xmm0,%xmm1 - jb 1f + jb 
L$open_avx2_short_tail_32_exit subq $16,%rbx - addq 0(%rsi),%r10 + addq 0+0(%rsi),%r10 adcq 8+0(%rsi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -6085,9 +6046,8 @@ open_avx2_short_tail_32: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 @@ -6097,29 +6057,29 @@ open_avx2_short_tail_32: leaq 16(%rsi),%rsi leaq 16(%rdi),%rdi vextracti128 $1,%ymm0,%xmm1 -1: +L$open_avx2_short_tail_32_exit: vzeroupper - jmp open_sse_tail_16 + jmp L$open_sse_tail_16 -open_avx2_320: +L$open_avx2_320: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm5 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm9 vmovdqa %ymm8,%ymm10 - vpaddd .avx2_inc(%rip),%ymm12,%ymm13 - vpaddd .avx2_inc(%rip),%ymm13,%ymm14 + vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 + vpaddd L$avx2_inc(%rip),%ymm13,%ymm14 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 - vmovdqa %ymm12,160(%rbp) - vmovdqa %ymm13,192(%rbp) - vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) movq $10,%r10 -1: +L$open_avx2_320_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -6127,7 +6087,7 @@ open_avx2_320: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -6138,7 +6098,7 @@ open_avx2_320: vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol16(%rip),%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 @@ -6146,7 +6106,7 @@ open_avx2_320: vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol8(%rip),%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 @@ -6157,7 +6117,7 @@ open_avx2_320: vpalignr $4,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol16(%rip),%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 @@ -6165,7 +6125,7 @@ open_avx2_320: vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol8(%rip),%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 @@ -6176,7 +6136,7 @@ open_avx2_320: vpalignr $4,%ymm6,%ymm6,%ymm6 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -6184,7 +6144,7 @@ open_avx2_320: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -6195,7 
+6155,7 @@ open_avx2_320: vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol16(%rip),%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 @@ -6203,7 +6163,7 @@ open_avx2_320: vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol8(%rip),%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 @@ -6214,7 +6174,7 @@ open_avx2_320: vpalignr $12,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol16(%rip),%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 @@ -6222,7 +6182,7 @@ open_avx2_320: vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol8(%rip),%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 @@ -6233,23 +6193,23 @@ open_avx2_320: vpalignr $12,%ymm6,%ymm6,%ymm6 decq %r10 - jne 1b - vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + jne L$open_avx2_320_rounds + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 vpaddd %ymm7,%ymm4,%ymm4 vpaddd %ymm7,%ymm5,%ymm5 vpaddd %ymm7,%ymm6,%ymm6 vpaddd %ymm11,%ymm8,%ymm8 vpaddd %ymm11,%ymm9,%ymm9 vpaddd %ymm11,%ymm10,%ymm10 - vpaddd 160(%rbp),%ymm12,%ymm12 - vpaddd 192(%rbp),%ymm13,%ymm13 - vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd 0+224(%rbp),%ymm14,%ymm14 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - vpand .clamp(%rip),%ymm3,%ymm3 - vmovdqa %ymm3,0(%rbp) + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 @@ -6261,46 +6221,59 @@ open_avx2_320: vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 - jmp open_avx2_short + jmp L$open_avx2_short + .p2align 6 chacha20_poly1305_seal_avx2: + + + + + + + + + + + + vzeroupper - vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa L$chacha20_consts(%rip),%ymm0 vbroadcasti128 0(%r9),%ymm4 vbroadcasti128 16(%r9),%ymm8 vbroadcasti128 32(%r9),%ymm12 - vpaddd .avx2_init(%rip),%ymm12,%ymm12 + vpaddd L$avx2_init(%rip),%ymm12,%ymm12 cmpq $192,%rbx - jbe seal_avx2_192 + jbe L$seal_avx2_192 cmpq $320,%rbx - jbe seal_avx2_320 + jbe L$seal_avx2_320 vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm5 vmovdqa %ymm4,%ymm6 vmovdqa %ymm4,%ymm7 - vmovdqa %ymm4,64(%rbp) + vmovdqa %ymm4,0+64(%rbp) vmovdqa %ymm8,%ymm9 vmovdqa %ymm8,%ymm10 vmovdqa %ymm8,%ymm11 - vmovdqa %ymm8,96(%rbp) + vmovdqa %ymm8,0+96(%rbp) vmovdqa %ymm12,%ymm15 - vpaddd .avx2_inc(%rip),%ymm15,%ymm14 - vpaddd .avx2_inc(%rip),%ymm14,%ymm13 - vpaddd .avx2_inc(%rip),%ymm13,%ymm12 - vmovdqa %ymm12,160(%rbp) - vmovdqa %ymm13,192(%rbp) - vmovdqa %ymm14,224(%rbp) - vmovdqa %ymm15,256(%rbp) + vpaddd L$avx2_inc(%rip),%ymm15,%ymm14 + vpaddd L$avx2_inc(%rip),%ymm14,%ymm13 + vpaddd L$avx2_inc(%rip),%ymm13,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm15,0+256(%rbp) movq $10,%r10 -1: - vmovdqa %ymm8,128(%rbp) - vmovdqa .rol16(%rip),%ymm8 +L$seal_avx2_init_rounds: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa 
L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -6313,16 +6286,15 @@ chacha20_poly1305_seal_avx2: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 @@ -6335,7 +6307,7 @@ chacha20_poly1305_seal_avx2: vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .rol8(%rip),%ymm8 + vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -6348,16 +6320,15 @@ chacha20_poly1305_seal_avx2: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 @@ -6370,7 +6341,7 @@ chacha20_poly1305_seal_avx2: vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 128(%rbp),%ymm8 + vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 @@ -6383,8 +6354,8 @@ chacha20_poly1305_seal_avx2: vpalignr $4,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,128(%rbp) - vmovdqa .rol16(%rip),%ymm8 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -6397,16 +6368,15 @@ chacha20_poly1305_seal_avx2: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 @@ -6419,7 +6389,7 @@ chacha20_poly1305_seal_avx2: vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .rol8(%rip),%ymm8 + vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -6432,16 +6402,15 @@ chacha20_poly1305_seal_avx2: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 @@ -6454,7 +6423,7 @@ chacha20_poly1305_seal_avx2: vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 128(%rbp),%ymm8 + vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 
vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 @@ -6469,29 +6438,29 @@ chacha20_poly1305_seal_avx2: vpalignr $4,%ymm12,%ymm12,%ymm12 decq %r10 - jnz 1b - vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 - vpaddd 64(%rbp),%ymm7,%ymm7 - vpaddd 96(%rbp),%ymm11,%ymm11 - vpaddd 256(%rbp),%ymm15,%ymm15 - vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 64(%rbp),%ymm6,%ymm6 - vpaddd 96(%rbp),%ymm10,%ymm10 - vpaddd 224(%rbp),%ymm14,%ymm14 - vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpaddd 96(%rbp),%ymm9,%ymm9 - vpaddd 192(%rbp),%ymm13,%ymm13 - vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 64(%rbp),%ymm4,%ymm4 - vpaddd 96(%rbp),%ymm8,%ymm8 - vpaddd 160(%rbp),%ymm12,%ymm12 + jnz L$seal_avx2_init_rounds + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 vperm2i128 $0x02,%ymm3,%ymm7,%ymm15 vperm2i128 $0x13,%ymm3,%ymm7,%ymm3 - vpand .clamp(%rip),%ymm15,%ymm15 - vmovdqa %ymm15,0(%rbp) + vpand L$clamp(%rip),%ymm15,%ymm15 + vmovdqa %ymm15,0+0(%rbp) movq %r8,%r8 call poly_hash_ad_internal @@ -6533,7 +6502,7 @@ chacha20_poly1305_seal_avx2: subq $320,%rbx movq $320,%rcx cmpq $128,%rbx - jbe seal_avx2_hash + jbe L$seal_avx2_short_hash_remainder vpxor 0(%rsi),%ymm0,%ymm0 vpxor 32(%rsi),%ymm4,%ymm4 vpxor 64(%rsi),%ymm8,%ymm8 @@ -6547,16 +6516,16 @@ chacha20_poly1305_seal_avx2: movq $8,%rcx movq $2,%r8 cmpq $128,%rbx - jbe seal_avx2_tail_128 + jbe L$seal_avx2_tail_128 cmpq $256,%rbx - jbe seal_avx2_tail_256 + jbe L$seal_avx2_tail_256 cmpq $384,%rbx - jbe seal_avx2_tail_384 + jbe L$seal_avx2_tail_384 cmpq $512,%rbx - jbe seal_avx2_tail_512 - vmovdqa .chacha20_consts(%rip),%ymm0 - vmovdqa 64(%rbp),%ymm4 - vmovdqa 96(%rbp),%ymm8 + jbe L$seal_avx2_tail_512 + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 @@ -6566,17 +6535,17 @@ chacha20_poly1305_seal_avx2: vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 - vmovdqa .avx2_inc(%rip),%ymm12 - vpaddd 160(%rbp),%ymm12,%ymm15 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 vpaddd %ymm15,%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm15,256(%rbp) - vmovdqa %ymm14,224(%rbp) - vmovdqa %ymm13,192(%rbp) - vmovdqa %ymm12,160(%rbp) - vmovdqa %ymm8,128(%rbp) - vmovdqa .rol16(%rip),%ymm8 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -6589,16 +6558,15 @@ chacha20_poly1305_seal_avx2: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor 
%ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 @@ -6611,7 +6579,7 @@ chacha20_poly1305_seal_avx2: vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .rol8(%rip),%ymm8 + vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -6624,16 +6592,15 @@ chacha20_poly1305_seal_avx2: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 @@ -6646,7 +6613,7 @@ chacha20_poly1305_seal_avx2: vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 128(%rbp),%ymm8 + vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 @@ -6659,8 +6626,8 @@ chacha20_poly1305_seal_avx2: vpalignr $4,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,128(%rbp) - vmovdqa .rol16(%rip),%ymm8 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -6673,16 +6640,15 @@ chacha20_poly1305_seal_avx2: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 @@ -6695,7 +6661,7 @@ chacha20_poly1305_seal_avx2: vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .rol8(%rip),%ymm8 + vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -6708,16 +6674,15 @@ chacha20_poly1305_seal_avx2: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 @@ -6730,7 +6695,7 @@ chacha20_poly1305_seal_avx2: vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 128(%rbp),%ymm8 + vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 @@ -6743,8 +6708,8 @@ chacha20_poly1305_seal_avx2: vpalignr $12,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,128(%rbp) - vmovdqa .rol16(%rip),%ymm8 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -6757,16 +6722,15 @@ chacha20_poly1305_seal_avx2: vpshufb 
%ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 @@ -6779,19 +6743,21 @@ chacha20_poly1305_seal_avx2: vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .rol8(%rip),%ymm8 + vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 subq $16,%rdi movq $9,%rcx - jmp 4f -1: - vmovdqa .chacha20_consts(%rip),%ymm0 - vmovdqa 64(%rbp),%ymm4 - vmovdqa 96(%rbp),%ymm8 + jmp L$seal_avx2_main_loop_rounds_entry +.p2align 5 +L$seal_avx2_main_loop: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 @@ -6801,23 +6767,24 @@ chacha20_poly1305_seal_avx2: vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 - vmovdqa .avx2_inc(%rip),%ymm12 - vpaddd 160(%rbp),%ymm12,%ymm15 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 vpaddd %ymm15,%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm15,256(%rbp) - vmovdqa %ymm14,224(%rbp) - vmovdqa %ymm13,192(%rbp) - vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) movq $10,%rcx -2: - addq 0(%rdi),%r10 +.p2align 5 +L$seal_avx2_main_loop_rounds: + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - vmovdqa %ymm8,128(%rbp) - vmovdqa .rol16(%rip),%ymm8 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -6826,7 +6793,7 @@ chacha20_poly1305_seal_avx2: vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 - movq 0+0(%rbp),%rdx + movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx @@ -6837,23 +6804,22 @@ chacha20_poly1305_seal_avx2: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 - movq 8+0(%rbp),%rdx + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx - vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 @@ -6861,18 +6827,19 @@ chacha20_poly1305_seal_avx2: vpslld $32-20,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 addq %rax,%r15 adcq %rdx,%r9 - vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .rol8(%rip),%ymm8 + vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 @@ -6882,15 +6849,13 @@ chacha20_poly1305_seal_avx2: 
movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 -4: - vpxor %ymm3,%ymm15,%ymm15 +L$seal_avx2_main_loop_rounds_entry: vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 @@ -6898,27 +6863,26 @@ chacha20_poly1305_seal_avx2: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 - addq 16(%rdi),%r10 - adcq 8+16(%rdi),%r11 - adcq $1,%r12 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - movq 0+0(%rbp),%rdx + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - vmovdqa %ymm8,128(%rbp) - vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 @@ -6930,28 +6894,28 @@ chacha20_poly1305_seal_avx2: vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 128(%rbp),%ymm8 + vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 vpalignr $4,%ymm6,%ymm6,%ymm6 - movq 8+0(%rbp),%rdx + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $4,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $4,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,128(%rbp) - vmovdqa .rol16(%rip),%ymm8 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -6960,17 +6924,19 @@ chacha20_poly1305_seal_avx2: vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 - addq %rax,%r15 - adcq %rdx,%r9 vpshufb %ymm8,%ymm15,%ymm15 vpshufb %ymm8,%ymm14,%ymm14 + addq %rax,%r15 + adcq %rdx,%r9 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 @@ -6980,35 +6946,31 @@ chacha20_poly1305_seal_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 - addq 32(%rdi),%r10 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq 0+32(%rdi),%r10 adcq 8+32(%rdi),%r11 adcq $1,%r12 leaq 48(%rdi),%rdi - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 vpsrld $20,%ymm5,%ymm8 vpslld $32-20,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $20,%ymm4,%ymm8 vpslld 
$32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .rol8(%rip),%ymm8 + vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -7017,49 +6979,48 @@ chacha20_poly1305_seal_avx2: vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 vpxor %ymm0,%ymm12,%ymm12 - movq 0+0(%rbp),%rdx + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - movq 8+0(%rbp),%rdx + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 - addq %rax,%r15 - adcq %rdx,%r9 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 + addq %rax,%r15 + adcq %rdx,%r9 vpsrld $25,%ymm5,%ymm8 vpslld $32-25,%ymm5,%ymm5 vpxor %ymm8,%ymm5,%ymm5 vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 128(%rbp),%ymm8 + vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 @@ -7067,6 +7028,10 @@ chacha20_poly1305_seal_avx2: vpalignr $8,%ymm10,%ymm10,%ymm10 vpalignr $4,%ymm14,%ymm14,%ymm14 vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 @@ -7076,78 +7041,51 @@ chacha20_poly1305_seal_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm12,%ymm12,%ymm12 decq %rcx - jne 2b - vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 - vpaddd 64(%rbp),%ymm7,%ymm7 - vpaddd 96(%rbp),%ymm11,%ymm11 - vpaddd 256(%rbp),%ymm15,%ymm15 - vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 64(%rbp),%ymm6,%ymm6 - vpaddd 96(%rbp),%ymm10,%ymm10 - vpaddd 224(%rbp),%ymm14,%ymm14 - vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpaddd 96(%rbp),%ymm9,%ymm9 - vpaddd 192(%rbp),%ymm13,%ymm13 - vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 64(%rbp),%ymm4,%ymm4 - vpaddd 96(%rbp),%ymm8,%ymm8 - vpaddd 160(%rbp),%ymm12,%ymm12 - - leaq 32(%rdi),%rdi - vmovdqa %ymm0,128(%rbp) - addq -32(%rdi),%r10 - adcq 8+-32(%rdi),%r11 + jne L$seal_avx2_main_loop_rounds + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 
0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 adcq $1,%r12 - vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 - vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 - vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 - vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 - vpxor 0+0(%rsi),%ymm0,%ymm0 - vpxor 32+0(%rsi),%ymm3,%ymm3 - vpxor 64+0(%rsi),%ymm7,%ymm7 - vpxor 96+0(%rsi),%ymm11,%ymm11 - vmovdqu %ymm0,0+0(%rdi) - vmovdqu %ymm3,32+0(%rdi) - vmovdqu %ymm7,64+0(%rdi) - vmovdqu %ymm11,96+0(%rdi) - - vmovdqa 128(%rbp),%ymm0 - movq 0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0(%rbp),%rax - mulq %r11 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0(%rbp),%rax - mulq %r11 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 @@ -7158,12 +7096,60 @@ chacha20_poly1305_seal_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 + + leaq 32(%rdi),%rdi + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 @@ -7176,9 +7162,6 @@ chacha20_poly1305_seal_avx2: vmovdqu %ymm2,32+128(%rdi) vmovdqu %ymm6,64+128(%rdi) vmovdqu %ymm10,96+128(%rdi) - addq -16(%rdi),%r10 - adcq 8+-16(%rdi),%r11 - adcq $1,%r12 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 @@ -7191,44 +7174,6 @@ chacha20_poly1305_seal_avx2: vmovdqu %ymm1,32+256(%rdi) vmovdqu %ymm5,64+256(%rdi) vmovdqu %ymm9,96+256(%rdi) - movq 0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - 
andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 @@ -7245,32 +7190,26 @@ chacha20_poly1305_seal_avx2: leaq 512(%rsi),%rsi subq $512,%rbx cmpq $512,%rbx - jg 1b - addq 0(%rdi),%r10 + jg L$seal_avx2_main_loop + + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0(%rbp),%rax - mulq %r11 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0(%rbp),%rax - mulq %r11 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 @@ -7281,37 +7220,29 @@ chacha20_poly1305_seal_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 - addq 16(%rdi),%r10 + addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0(%rbp),%rax - mulq %r11 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0(%rbp),%rax - mulq %r11 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 @@ -7322,9 +7253,8 @@ chacha20_poly1305_seal_avx2: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 @@ -7332,43 +7262,41 @@ chacha20_poly1305_seal_avx2: leaq 32(%rdi),%rdi movq $10,%rcx xorq %r8,%r8 - cmpq $128,%rbx - ja 3f - -seal_avx2_tail_128: - vmovdqa .chacha20_consts(%rip),%ymm0 - vmovdqa 64(%rbp),%ymm4 - vmovdqa 96(%rbp),%ymm8 - vmovdqa .avx2_inc(%rip),%ymm12 - vpaddd 160(%rbp),%ymm12,%ymm12 - vmovdqa %ymm12,160(%rbp) -1: - addq 0(%rdi),%r10 + cmpq $384,%rbx + ja L$seal_avx2_tail_512 + cmpq $256,%rbx + ja L$seal_avx2_tail_384 + cmpq $128,%rbx + ja L$seal_avx2_tail_256 + +L$seal_avx2_tail_128: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + +L$seal_avx2_tail_128_rounds_and_3xhash: + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0(%rbp),%rax - mulq %r11 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0(%rbp),%rax - mulq %r11 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 
+ imulq %r12,%rdx addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 @@ -7379,18 +7307,17 @@ seal_avx2_tail_128: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi -2: +L$seal_avx2_tail_128_rounds_and_2xhash: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -7398,7 +7325,7 @@ seal_avx2_tail_128: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -7407,31 +7334,24 @@ seal_avx2_tail_128: vpalignr $12,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $4,%ymm4,%ymm4,%ymm4 - addq 0(%rdi),%r10 + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0(%rbp),%rax - mulq %r11 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0(%rbp),%rax - mulq %r11 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 @@ -7442,15 +7362,14 @@ seal_avx2_tail_128: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -7458,7 +7377,7 @@ seal_avx2_tail_128: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -7467,31 +7386,24 @@ seal_avx2_tail_128: vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 - addq 16(%rdi),%r10 + addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0(%rbp),%rax - mulq %r11 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0(%rbp),%rax - mulq %r11 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 adcq %rdx,%r9 movq %r13,%r10 movq %r14,%r11 @@ -7502,67 +7414,63 @@ seal_avx2_tail_128: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 32(%rdi),%rdi decq %rcx - jg 1b + jg L$seal_avx2_tail_128_rounds_and_3xhash decq %r8 - jge 2b - vpaddd 
.chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 64(%rbp),%ymm4,%ymm4 - vpaddd 96(%rbp),%ymm8,%ymm8 - vpaddd 160(%rbp),%ymm12,%ymm12 + jge L$seal_avx2_tail_128_rounds_and_2xhash + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 vmovdqa %ymm3,%ymm8 - jmp seal_avx2_short_loop -3: - cmpq $256,%rbx - ja 3f + jmp L$seal_avx2_short_loop -seal_avx2_tail_256: - vmovdqa .chacha20_consts(%rip),%ymm0 - vmovdqa 64(%rbp),%ymm4 - vmovdqa 96(%rbp),%ymm8 +L$seal_avx2_tail_256: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 - vmovdqa .avx2_inc(%rip),%ymm12 - vpaddd 160(%rbp),%ymm12,%ymm13 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm12,160(%rbp) - vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) -1: - addq 0(%rdi),%r10 +L$seal_avx2_tail_256_rounds_and_3xhash: + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -7578,18 +7486,17 @@ seal_avx2_tail_256: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi -2: +L$seal_avx2_tail_256_rounds_and_2xhash: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -7597,7 +7504,7 @@ seal_avx2_tail_256: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -7608,7 +7515,7 @@ seal_avx2_tail_256: vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol16(%rip),%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 @@ -7616,7 +7523,7 @@ seal_avx2_tail_256: vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol8(%rip),%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 @@ -7625,26 +7532,26 @@ seal_avx2_tail_256: vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 - addq 0(%rdi),%r10 + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -7660,15 +7567,14 @@ 
seal_avx2_tail_256: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -7676,7 +7582,7 @@ seal_avx2_tail_256: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -7687,7 +7593,7 @@ seal_avx2_tail_256: vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol16(%rip),%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 @@ -7695,7 +7601,7 @@ seal_avx2_tail_256: vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol8(%rip),%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 @@ -7704,26 +7610,26 @@ seal_avx2_tail_256: vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $12,%ymm5,%ymm5,%ymm5 - addq 16(%rdi),%r10 + addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -7739,26 +7645,25 @@ seal_avx2_tail_256: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 32(%rdi),%rdi decq %rcx - jg 1b + jg L$seal_avx2_tail_256_rounds_and_3xhash decq %r8 - jge 2b - vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpaddd 96(%rbp),%ymm9,%ymm9 - vpaddd 192(%rbp),%ymm13,%ymm13 - vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 64(%rbp),%ymm4,%ymm4 - vpaddd 96(%rbp),%ymm8,%ymm8 - vpaddd 160(%rbp),%ymm12,%ymm12 + jge L$seal_avx2_tail_256_rounds_and_2xhash + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 @@ -7780,50 +7685,47 @@ seal_avx2_tail_256: movq $128,%rcx leaq 128(%rsi),%rsi subq $128,%rbx - jmp seal_avx2_hash -3: - cmpq $384,%rbx - ja seal_avx2_tail_512 + jmp L$seal_avx2_short_hash_remainder -seal_avx2_tail_384: - vmovdqa .chacha20_consts(%rip),%ymm0 - vmovdqa 64(%rbp),%ymm4 - vmovdqa 96(%rbp),%ymm8 +L$seal_avx2_tail_384: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm10 - vmovdqa .avx2_inc(%rip),%ymm12 - vpaddd 160(%rbp),%ymm12,%ymm14 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd 
%ymm13,%ymm12,%ymm12 - vmovdqa %ymm12,160(%rbp) - vmovdqa %ymm13,192(%rbp) - vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) -1: - addq 0(%rdi),%r10 +L$seal_avx2_tail_384_rounds_and_3xhash: + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -7839,18 +7741,17 @@ seal_avx2_tail_384: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi -2: +L$seal_avx2_tail_384_rounds_and_2xhash: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -7858,7 +7759,7 @@ seal_avx2_tail_384: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -7869,7 +7770,7 @@ seal_avx2_tail_384: vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol16(%rip),%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 @@ -7877,7 +7778,7 @@ seal_avx2_tail_384: vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol8(%rip),%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 @@ -7886,26 +7787,26 @@ seal_avx2_tail_384: vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm5,%ymm5,%ymm5 - addq 0(%rdi),%r10 + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -7921,15 +7822,14 @@ seal_avx2_tail_384: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol16(%rip),%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 @@ -7937,7 +7837,7 @@ seal_avx2_tail_384: vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol8(%rip),%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 @@ -7948,7 +7848,7 @@ seal_avx2_tail_384: vpalignr $4,%ymm6,%ymm6,%ymm6 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -7956,7 
+7856,7 @@ seal_avx2_tail_384: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -7965,26 +7865,26 @@ seal_avx2_tail_384: vpalignr $4,%ymm12,%ymm12,%ymm12 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm4,%ymm4,%ymm4 - addq 16(%rdi),%r10 + addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -8000,15 +7900,14 @@ seal_avx2_tail_384: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol16(%rip),%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 @@ -8016,7 +7915,7 @@ seal_avx2_tail_384: vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol8(%rip),%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 @@ -8027,7 +7926,7 @@ seal_avx2_tail_384: vpalignr $12,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol16(%rip),%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 @@ -8035,7 +7934,7 @@ seal_avx2_tail_384: vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol8(%rip),%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 @@ -8047,21 +7946,21 @@ seal_avx2_tail_384: leaq 32(%rdi),%rdi decq %rcx - jg 1b + jg L$seal_avx2_tail_384_rounds_and_3xhash decq %r8 - jge 2b - vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 64(%rbp),%ymm6,%ymm6 - vpaddd 96(%rbp),%ymm10,%ymm10 - vpaddd 224(%rbp),%ymm14,%ymm14 - vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpaddd 96(%rbp),%ymm9,%ymm9 - vpaddd 192(%rbp),%ymm13,%ymm13 - vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 64(%rbp),%ymm4,%ymm4 - vpaddd 96(%rbp),%ymm8,%ymm8 - vpaddd 160(%rbp),%ymm12,%ymm12 + jge L$seal_avx2_tail_384_rounds_and_2xhash + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 @@ -8095,12 +7994,12 @@ seal_avx2_tail_384: movq $256,%rcx leaq 256(%rsi),%rsi subq $256,%rbx - jmp seal_avx2_hash + jmp L$seal_avx2_short_hash_remainder -seal_avx2_tail_512: - vmovdqa .chacha20_consts(%rip),%ymm0 - vmovdqa 64(%rbp),%ymm4 - vmovdqa 96(%rbp),%ymm8 +L$seal_avx2_tail_512: + vmovdqa L$chacha20_consts(%rip),%ymm0 + 
vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 vmovdqa %ymm0,%ymm1 vmovdqa %ymm4,%ymm5 vmovdqa %ymm8,%ymm9 @@ -8110,28 +8009,28 @@ seal_avx2_tail_512: vmovdqa %ymm0,%ymm3 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 - vmovdqa .avx2_inc(%rip),%ymm12 - vpaddd 160(%rbp),%ymm12,%ymm15 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 vpaddd %ymm15,%ymm12,%ymm14 vpaddd %ymm14,%ymm12,%ymm13 vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm15,256(%rbp) - vmovdqa %ymm14,224(%rbp) - vmovdqa %ymm13,192(%rbp) - vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) -1: - addq 0(%rdi),%r10 +L$seal_avx2_tail_512_rounds_and_3xhash: + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rdx + movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rdx + movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 @@ -8149,17 +8048,16 @@ seal_avx2_tail_512: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi -2: - vmovdqa %ymm8,128(%rbp) - vmovdqa .rol16(%rip),%ymm8 +L$seal_avx2_tail_512_rounds_and_2xhash: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -8172,19 +8070,18 @@ seal_avx2_tail_512: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 - addq 0(%rdi),%r10 + vpxor %ymm10,%ymm6,%ymm6 + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 @@ -8197,18 +8094,18 @@ seal_avx2_tail_512: vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .rol8(%rip),%ymm8 + vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 - movq 0+0(%rbp),%rdx + vpaddd %ymm4,%ymm0,%ymm0 + movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 vpxor %ymm2,%ymm14,%ymm14 vpxor %ymm1,%ymm13,%ymm13 @@ -8217,26 +8114,25 @@ seal_avx2_tail_512: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $25,%ymm7,%ymm8 - movq 8+0(%rbp),%rdx + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 vpslld $32-25,%ymm6,%ymm6 vpxor %ymm8,%ymm6,%ymm6 @@ -8246,7 +8142,7 @@ seal_avx2_tail_512: vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor 
%ymm8,%ymm4,%ymm4 - vmovdqa 128(%rbp),%ymm8 + vmovdqa 0+128(%rbp),%ymm8 vpalignr $4,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $12,%ymm15,%ymm15,%ymm15 @@ -8255,14 +8151,14 @@ seal_avx2_tail_512: vpalignr $12,%ymm14,%ymm14,%ymm14 vpalignr $4,%ymm5,%ymm5,%ymm5 vpalignr $8,%ymm9,%ymm9,%ymm9 - addq %rax,%r15 - adcq %rdx,%r9 vpalignr $12,%ymm13,%ymm13,%ymm13 vpalignr $4,%ymm4,%ymm4,%ymm4 + addq %rax,%r15 + adcq %rdx,%r9 vpalignr $8,%ymm8,%ymm8,%ymm8 vpalignr $12,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,128(%rbp) - vmovdqa .rol16(%rip),%ymm8 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 vpaddd %ymm5,%ymm1,%ymm1 @@ -8275,8 +8171,10 @@ seal_avx2_tail_512: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 movq %r13,%r10 movq %r14,%r11 movq %r15,%r12 @@ -8286,20 +8184,16 @@ seal_avx2_tail_512: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 vpxor %ymm9,%ymm5,%ymm5 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) + vmovdqa %ymm8,0+128(%rbp) vpsrld $20,%ymm7,%ymm8 vpslld $32-20,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 @@ -8312,12 +8206,12 @@ seal_avx2_tail_512: vpsrld $20,%ymm4,%ymm8 vpslld $32-20,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - addq 16(%rdi),%r10 - adcq 8+16(%rdi),%r11 - adcq $1,%r12 - vmovdqa .rol8(%rip),%ymm8 + vmovdqa L$rol8(%rip),%ymm8 vpaddd %ymm7,%ymm3,%ymm3 vpaddd %ymm6,%ymm2,%ymm2 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 vpaddd %ymm5,%ymm1,%ymm1 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm3,%ymm15,%ymm15 @@ -8328,24 +8222,23 @@ seal_avx2_tail_512: vpshufb %ymm8,%ymm14,%ymm14 vpshufb %ymm8,%ymm13,%ymm13 vpshufb %ymm8,%ymm12,%ymm12 - vmovdqa 128(%rbp),%ymm8 vpaddd %ymm15,%ymm11,%ymm11 vpaddd %ymm14,%ymm10,%ymm10 vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm12,%ymm8,%ymm8 + vpaddd 0+128(%rbp),%ymm12,%ymm8 vpxor %ymm11,%ymm7,%ymm7 vpxor %ymm10,%ymm6,%ymm6 - movq 0+0(%rbp),%rdx + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx movq %rdx,%r15 mulxq %r10,%r13,%r14 mulxq %r11,%rax,%rdx imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,128(%rbp) - vpsrld $25,%ymm7,%ymm8 vpslld $32-25,%ymm7,%ymm7 vpxor %ymm8,%ymm7,%ymm7 vpsrld $25,%ymm6,%ymm8 @@ -8357,22 +8250,22 @@ seal_avx2_tail_512: vpsrld $25,%ymm4,%ymm8 vpslld $32-25,%ymm4,%ymm4 vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 128(%rbp),%ymm8 + vmovdqa 0+128(%rbp),%ymm8 vpalignr $12,%ymm7,%ymm7,%ymm7 vpalignr $8,%ymm11,%ymm11,%ymm11 vpalignr $4,%ymm15,%ymm15,%ymm15 vpalignr $12,%ymm6,%ymm6,%ymm6 - movq 8+0(%rbp),%rdx + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + movq 8+0+0(%rbp),%rdx mulxq %r10,%r10,%rax addq %r10,%r14 mulxq %r11,%r11,%r9 adcq %r11,%r15 adcq $0,%r9 imulq %r12,%rdx - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 vpalignr $4,%ymm13,%ymm13,%ymm13 vpalignr $12,%ymm4,%ymm4,%ymm4 vpalignr $8,%ymm8,%ymm8,%ymm8 @@ -8389,6 
+8282,10 @@ seal_avx2_tail_512: + + + + addq %rax,%r15 adcq %rdx,%r9 @@ -8420,36 +8317,35 @@ seal_avx2_tail_512: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 32(%rdi),%rdi decq %rcx - jg 1b + jg L$seal_avx2_tail_512_rounds_and_3xhash decq %r8 - jge 2b - vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 - vpaddd 64(%rbp),%ymm7,%ymm7 - vpaddd 96(%rbp),%ymm11,%ymm11 - vpaddd 256(%rbp),%ymm15,%ymm15 - vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 64(%rbp),%ymm6,%ymm6 - vpaddd 96(%rbp),%ymm10,%ymm10 - vpaddd 224(%rbp),%ymm14,%ymm14 - vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpaddd 96(%rbp),%ymm9,%ymm9 - vpaddd 192(%rbp),%ymm13,%ymm13 - vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 64(%rbp),%ymm4,%ymm4 - vpaddd 96(%rbp),%ymm8,%ymm8 - vpaddd 160(%rbp),%ymm12,%ymm12 - - vmovdqa %ymm0,128(%rbp) + jge L$seal_avx2_tail_512_rounds_and_2xhash + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 @@ -8463,7 +8359,7 @@ seal_avx2_tail_512: vmovdqu %ymm7,64+0(%rdi) vmovdqu %ymm11,96+0(%rdi) - vmovdqa 128(%rbp),%ymm0 + vmovdqa 0+128(%rbp),%ymm0 vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 @@ -8497,27 +8393,27 @@ seal_avx2_tail_512: movq $384,%rcx leaq 384(%rsi),%rsi subq $384,%rbx - jmp seal_avx2_hash + jmp L$seal_avx2_short_hash_remainder -seal_avx2_320: +L$seal_avx2_320: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm5 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm9 vmovdqa %ymm8,%ymm10 - vpaddd .avx2_inc(%rip),%ymm12,%ymm13 - vpaddd .avx2_inc(%rip),%ymm13,%ymm14 + vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 + vpaddd L$avx2_inc(%rip),%ymm13,%ymm14 vmovdqa %ymm4,%ymm7 vmovdqa %ymm8,%ymm11 - vmovdqa %ymm12,160(%rbp) - vmovdqa %ymm13,192(%rbp) - vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) movq $10,%r10 -1: +L$seal_avx2_320_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -8525,7 +8421,7 @@ seal_avx2_320: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -8536,7 +8432,7 @@ seal_avx2_320: vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol16(%rip),%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 @@ -8544,7 +8440,7 @@ seal_avx2_320: vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb 
.rol8(%rip),%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 @@ -8555,7 +8451,7 @@ seal_avx2_320: vpalignr $4,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol16(%rip),%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 @@ -8563,7 +8459,7 @@ seal_avx2_320: vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol8(%rip),%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 @@ -8574,7 +8470,7 @@ seal_avx2_320: vpalignr $4,%ymm6,%ymm6,%ymm6 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -8582,7 +8478,7 @@ seal_avx2_320: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -8593,7 +8489,7 @@ seal_avx2_320: vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol16(%rip),%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 @@ -8601,7 +8497,7 @@ seal_avx2_320: vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol8(%rip),%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 @@ -8612,7 +8508,7 @@ seal_avx2_320: vpalignr $12,%ymm5,%ymm5,%ymm5 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol16(%rip),%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpsrld $20,%ymm6,%ymm3 @@ -8620,7 +8516,7 @@ seal_avx2_320: vpxor %ymm3,%ymm6,%ymm6 vpaddd %ymm6,%ymm2,%ymm2 vpxor %ymm2,%ymm14,%ymm14 - vpshufb .rol8(%rip),%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 vpaddd %ymm14,%ymm10,%ymm10 vpxor %ymm10,%ymm6,%ymm6 vpslld $7,%ymm6,%ymm3 @@ -8631,23 +8527,23 @@ seal_avx2_320: vpalignr $12,%ymm6,%ymm6,%ymm6 decq %r10 - jne 1b - vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + jne L$seal_avx2_320_rounds + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 vpaddd %ymm7,%ymm4,%ymm4 vpaddd %ymm7,%ymm5,%ymm5 vpaddd %ymm7,%ymm6,%ymm6 vpaddd %ymm11,%ymm8,%ymm8 vpaddd %ymm11,%ymm9,%ymm9 vpaddd %ymm11,%ymm10,%ymm10 - vpaddd 160(%rbp),%ymm12,%ymm12 - vpaddd 192(%rbp),%ymm13,%ymm13 - vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd 0+224(%rbp),%ymm14,%ymm14 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - vpand .clamp(%rip),%ymm3,%ymm3 - vmovdqa %ymm3,0(%rbp) + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 @@ -8659,23 +8555,23 @@ seal_avx2_320: vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 - jmp seal_avx2_short + jmp L$seal_avx2_short -seal_avx2_192: +L$seal_avx2_192: vmovdqa %ymm0,%ymm1 vmovdqa %ymm0,%ymm2 vmovdqa %ymm4,%ymm5 vmovdqa %ymm4,%ymm6 vmovdqa %ymm8,%ymm9 vmovdqa 
%ymm8,%ymm10 - vpaddd .avx2_inc(%rip),%ymm12,%ymm13 + vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 vmovdqa %ymm12,%ymm11 vmovdqa %ymm13,%ymm15 movq $10,%r10 -1: +L$seal_avx2_192_rounds: vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -8683,7 +8579,7 @@ seal_avx2_192: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -8694,7 +8590,7 @@ seal_avx2_192: vpalignr $4,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol16(%rip),%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 @@ -8702,7 +8598,7 @@ seal_avx2_192: vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol8(%rip),%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 @@ -8713,7 +8609,7 @@ seal_avx2_192: vpalignr $4,%ymm5,%ymm5,%ymm5 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol16(%rip),%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpsrld $20,%ymm4,%ymm3 @@ -8721,7 +8617,7 @@ seal_avx2_192: vpxor %ymm3,%ymm4,%ymm4 vpaddd %ymm4,%ymm0,%ymm0 vpxor %ymm0,%ymm12,%ymm12 - vpshufb .rol8(%rip),%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 vpaddd %ymm12,%ymm8,%ymm8 vpxor %ymm8,%ymm4,%ymm4 vpslld $7,%ymm4,%ymm3 @@ -8732,7 +8628,7 @@ seal_avx2_192: vpalignr $12,%ymm4,%ymm4,%ymm4 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol16(%rip),%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpsrld $20,%ymm5,%ymm3 @@ -8740,7 +8636,7 @@ seal_avx2_192: vpxor %ymm3,%ymm5,%ymm5 vpaddd %ymm5,%ymm1,%ymm1 vpxor %ymm1,%ymm13,%ymm13 - vpshufb .rol8(%rip),%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 vpaddd %ymm13,%ymm9,%ymm9 vpxor %ymm9,%ymm5,%ymm5 vpslld $7,%ymm5,%ymm3 @@ -8751,7 +8647,7 @@ seal_avx2_192: vpalignr $12,%ymm5,%ymm5,%ymm5 decq %r10 - jne 1b + jne L$seal_avx2_192_rounds vpaddd %ymm2,%ymm0,%ymm0 vpaddd %ymm2,%ymm1,%ymm1 vpaddd %ymm6,%ymm4,%ymm4 @@ -8762,8 +8658,8 @@ seal_avx2_192: vpaddd %ymm15,%ymm13,%ymm13 vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - vpand .clamp(%rip),%ymm3,%ymm3 - vmovdqa %ymm3,0(%rbp) + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 @@ -8771,33 +8667,33 @@ seal_avx2_192: vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 -seal_avx2_short: +L$seal_avx2_short: movq %r8,%r8 call poly_hash_ad_internal xorq %rcx,%rcx -seal_avx2_hash: +L$seal_avx2_short_hash_remainder: cmpq $16,%rcx - jb seal_avx2_short_loop - addq 0(%rdi),%r10 + jb L$seal_avx2_short_loop + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -8813,45 +8709,44 @@ 
seal_avx2_hash: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 subq $16,%rcx addq $16,%rdi - jmp seal_avx2_hash -seal_avx2_short_loop: + jmp L$seal_avx2_short_hash_remainder +L$seal_avx2_short_loop: cmpq $32,%rbx - jb seal_avx2_short_tail + jb L$seal_avx2_short_tail subq $32,%rbx vpxor (%rsi),%ymm0,%ymm0 vmovdqu %ymm0,(%rdi) leaq 32(%rsi),%rsi - addq 0(%rdi),%r10 + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -8867,32 +8762,31 @@ seal_avx2_short_loop: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 - addq 16(%rdi),%r10 + addq 0+16(%rdi),%r10 adcq 8+16(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -8908,9 +8802,8 @@ seal_avx2_short_loop: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 @@ -8926,34 +8819,34 @@ seal_avx2_short_loop: vmovdqa %ymm13,%ymm9 vmovdqa %ymm2,%ymm13 vmovdqa %ymm6,%ymm2 - jmp seal_avx2_short_loop -seal_avx2_short_tail: + jmp L$seal_avx2_short_loop +L$seal_avx2_short_tail: cmpq $16,%rbx - jb 1f + jb L$seal_avx2_exit subq $16,%rbx vpxor (%rsi),%xmm0,%xmm3 vmovdqu %xmm3,(%rdi) leaq 16(%rsi),%rsi - addq 0(%rdi),%r10 + addq 0+0(%rdi),%r10 adcq 8+0(%rdi),%r11 adcq $1,%r12 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax movq %rax,%r15 mulq %r10 movq %rax,%r13 movq %rdx,%r14 - movq 0+0(%rbp),%rax + movq 0+0+0(%rbp),%rax mulq %r11 imulq %r12,%r15 addq %rax,%r14 adcq %rdx,%r15 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax movq %rax,%r9 mulq %r10 addq %rax,%r14 adcq $0,%rdx movq %rdx,%r10 - movq 8+0(%rbp),%rax + movq 8+0+0(%rbp),%rax mulq %r11 addq %rax,%r15 adcq $0,%rdx @@ -8969,17 +8862,17 @@ seal_avx2_short_tail: movq %r9,%r14 shrdq $2,%r9,%r15 shrq $2,%r9 - addq %r13,%r10 - adcq %r14,%r11 - adcq $0,%r12 + addq %r13,%r15 + adcq %r14,%r9 addq %r15,%r10 adcq %r9,%r11 adcq $0,%r12 leaq 16(%rdi),%rdi vextracti128 $1,%ymm0,%xmm0 -1: +L$seal_avx2_exit: vzeroupper - jmp seal_sse_tail_16 + jmp L$seal_sse_tail_16 + #endif |