author | Robert Sloan <varomodt@google.com> | 2017-02-21 08:49:28 -0800
---|---|---
committer | Robert Sloan <varomodt@google.com> | 2017-02-21 08:49:42 -0800
commit | a94fe0531b3c196ad078174259af2201b2e3a246 (patch) |
tree | 81f252f2c833966b0a5d3ec52e71c3f9dbeca499 |
parent | 5d625781eb8ff5cc8111d2302efe900103bf0ade (diff) |
download | boringssl-a94fe0531b3c196ad078174259af2201b2e3a246.tar.gz |
external/boringssl: Sync to c4796c92e0aced2342ed5687201aea07189c3bc1.
This includes the following changes:
https://boringssl.googlesource.com/boringssl/+log/040bc4944be97f5d4b44da176f6e801fc804a176..c4796c92e0aced2342ed5687201aea07189c3bc1
Test: CtsLibcoreTestCases Presubmits
Change-Id: If6d911660fbd9c60896527addb277c8225c3d401
124 files changed, 9625 insertions, 3674 deletions
diff --git a/BORINGSSL_REVISION b/BORINGSSL_REVISION index f26983b6..af7f21c4 100644 --- a/BORINGSSL_REVISION +++ b/BORINGSSL_REVISION @@ -1 +1 @@ -040bc4944be97f5d4b44da176f6e801fc804a176 +c4796c92e0aced2342ed5687201aea07189c3bc1 diff --git a/linux-arm/crypto/chacha/chacha-armv4.S b/linux-arm/crypto/chacha/chacha-armv4.S index 19a4d2c4..0784fc71 100644 --- a/linux-arm/crypto/chacha/chacha-armv4.S +++ b/linux-arm/crypto/chacha/chacha-armv4.S @@ -1457,7 +1457,7 @@ ChaCha20_neon: ldrb r9,[r12],#1 @ read input subs r11,r11,#1 eor r8,r8,r9 - strb r8,[r14],#1 @ store ouput + strb r8,[r14],#1 @ store output bne .Loop_tail_neon .Ldone_neon: diff --git a/linux-x86/crypto/bn/x86-mont.S b/linux-x86/crypto/bn/x86-mont.S index 1569b2cf..e291a888 100644 --- a/linux-x86/crypto/bn/x86-mont.S +++ b/linux-x86/crypto/bn/x86-mont.S @@ -17,39 +17,54 @@ bn_mul_mont: jl .L000just_leave leal 20(%esp),%esi leal 24(%esp),%edx - movl %esp,%ebp addl $2,%edi negl %edi - leal -32(%esp,%edi,4),%esp + leal -32(%esp,%edi,4),%ebp negl %edi - movl %esp,%eax + movl %ebp,%eax subl %edx,%eax andl $2047,%eax - subl %eax,%esp - xorl %esp,%edx + subl %eax,%ebp + xorl %ebp,%edx andl $2048,%edx xorl $2048,%edx - subl %edx,%esp - andl $-64,%esp + subl %edx,%ebp + andl $-64,%ebp + movl %esp,%eax + subl %ebp,%eax + andl $-4096,%eax + movl %esp,%edx + leal (%ebp,%eax,1),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja .L001page_walk + jmp .L002page_walk_done +.align 16 +.L001page_walk: + leal -4096(%esp),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja .L001page_walk +.L002page_walk_done: movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx - movl 12(%esi),%edx + movl 12(%esi),%ebp movl 16(%esi),%esi movl (%esi),%esi movl %eax,4(%esp) movl %ebx,8(%esp) movl %ecx,12(%esp) - movl %edx,16(%esp) + movl %ebp,16(%esp) movl %esi,20(%esp) leal -3(%edi),%ebx - movl %ebp,24(%esp) - call .L001PIC_me_up -.L001PIC_me_up: + movl %edx,24(%esp) + call .L003PIC_me_up +.L003PIC_me_up: popl %eax - leal OPENSSL_ia32cap_P-.L001PIC_me_up(%eax),%eax + leal OPENSSL_ia32cap_P-.L003PIC_me_up(%eax),%eax btl $26,(%eax) - jnc .L002non_sse2 + jnc .L004non_sse2 movl $-1,%eax movd %eax,%mm7 movl 8(%esp),%esi @@ -73,7 +88,7 @@ bn_mul_mont: psrlq $32,%mm3 incl %ecx .align 16 -.L0031st: +.L0051st: pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 paddq %mm0,%mm2 @@ -88,7 +103,7 @@ bn_mul_mont: psrlq $32,%mm3 leal 1(%ecx),%ecx cmpl %ebx,%ecx - jl .L0031st + jl .L0051st pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 paddq %mm0,%mm2 @@ -102,7 +117,7 @@ bn_mul_mont: paddq %mm2,%mm3 movq %mm3,32(%esp,%ebx,4) incl %edx -.L004outer: +.L006outer: xorl %ecx,%ecx movd (%edi,%edx,4),%mm4 movd (%esi),%mm5 @@ -124,7 +139,7 @@ bn_mul_mont: paddq %mm6,%mm2 incl %ecx decl %ebx -.L005inner: +.L007inner: pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 paddq %mm0,%mm2 @@ -141,7 +156,7 @@ bn_mul_mont: paddq %mm6,%mm2 decl %ebx leal 1(%ecx),%ecx - jnz .L005inner + jnz .L007inner movl %ecx,%ebx pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 @@ -159,11 +174,11 @@ bn_mul_mont: movq %mm3,32(%esp,%ebx,4) leal 1(%edx),%edx cmpl %ebx,%edx - jle .L004outer + jle .L006outer emms - jmp .L006common_tail + jmp .L008common_tail .align 16 -.L002non_sse2: +.L004non_sse2: movl 8(%esp),%esi leal 1(%ebx),%ebp movl 12(%esp),%edi @@ -174,12 +189,12 @@ bn_mul_mont: leal 4(%edi,%ebx,4),%eax orl %edx,%ebp movl (%edi),%edi - jz .L007bn_sqr_mont + jz .L009bn_sqr_mont movl %eax,28(%esp) movl (%esi),%eax xorl %edx,%edx .align 16 -.L008mull: +.L010mull: movl %edx,%ebp mull %edi addl %eax,%ebp @@ -188,7 +203,7 @@ bn_mul_mont: movl (%esi,%ecx,4),%eax cmpl %ebx,%ecx movl 
%ebp,28(%esp,%ecx,4) - jl .L008mull + jl .L010mull movl %edx,%ebp mull %edi movl 20(%esp),%edi @@ -206,9 +221,9 @@ bn_mul_mont: movl 4(%esi),%eax adcl $0,%edx incl %ecx - jmp .L0092ndmadd + jmp .L0112ndmadd .align 16 -.L0101stmadd: +.L0121stmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -219,7 +234,7 @@ bn_mul_mont: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,28(%esp,%ecx,4) - jl .L0101stmadd + jl .L0121stmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%eax @@ -242,7 +257,7 @@ bn_mul_mont: adcl $0,%edx movl $1,%ecx .align 16 -.L0092ndmadd: +.L0112ndmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -253,7 +268,7 @@ bn_mul_mont: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,24(%esp,%ecx,4) - jl .L0092ndmadd + jl .L0112ndmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%ebp @@ -269,16 +284,16 @@ bn_mul_mont: movl %edx,32(%esp,%ebx,4) cmpl 28(%esp),%ecx movl %eax,36(%esp,%ebx,4) - je .L006common_tail + je .L008common_tail movl (%ecx),%edi movl 8(%esp),%esi movl %ecx,12(%esp) xorl %ecx,%ecx xorl %edx,%edx movl (%esi),%eax - jmp .L0101stmadd + jmp .L0121stmadd .align 16 -.L007bn_sqr_mont: +.L009bn_sqr_mont: movl %ebx,(%esp) movl %ecx,12(%esp) movl %edi,%eax @@ -289,7 +304,7 @@ bn_mul_mont: andl $1,%ebx incl %ecx .align 16 -.L011sqr: +.L013sqr: movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -301,7 +316,7 @@ bn_mul_mont: cmpl (%esp),%ecx movl %eax,%ebx movl %ebp,28(%esp,%ecx,4) - jl .L011sqr + jl .L013sqr movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -325,7 +340,7 @@ bn_mul_mont: movl 4(%esi),%eax movl $1,%ecx .align 16 -.L0123rdmadd: +.L0143rdmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -344,7 +359,7 @@ bn_mul_mont: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,24(%esp,%ecx,4) - jl .L0123rdmadd + jl .L0143rdmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%ebp @@ -360,7 +375,7 @@ bn_mul_mont: movl %edx,32(%esp,%ebx,4) cmpl %ebx,%ecx movl %eax,36(%esp,%ebx,4) - je .L006common_tail + je .L008common_tail movl 4(%esi,%ecx,4),%edi leal 1(%ecx),%ecx movl %edi,%eax @@ -372,12 +387,12 @@ bn_mul_mont: xorl %ebp,%ebp cmpl %ebx,%ecx leal 1(%ecx),%ecx - je .L013sqrlast + je .L015sqrlast movl %edx,%ebx shrl $1,%edx andl $1,%ebx .align 16 -.L014sqradd: +.L016sqradd: movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -393,13 +408,13 @@ bn_mul_mont: cmpl (%esp),%ecx movl %ebp,28(%esp,%ecx,4) movl %eax,%ebx - jle .L014sqradd + jle .L016sqradd movl %edx,%ebp addl %edx,%edx shrl $31,%ebp addl %ebx,%edx adcl $0,%ebp -.L013sqrlast: +.L015sqrlast: movl 20(%esp),%edi movl 16(%esp),%esi imull 32(%esp),%edi @@ -414,9 +429,9 @@ bn_mul_mont: adcl $0,%edx movl $1,%ecx movl 4(%esi),%eax - jmp .L0123rdmadd + jmp .L0143rdmadd .align 16 -.L006common_tail: +.L008common_tail: movl 16(%esp),%ebp movl 4(%esp),%edi leal 32(%esp),%esi @@ -424,25 +439,26 @@ bn_mul_mont: movl %ebx,%ecx xorl %edx,%edx .align 16 -.L015sub: +.L017sub: sbbl (%ebp,%edx,4),%eax movl %eax,(%edi,%edx,4) decl %ecx movl 4(%esi,%edx,4),%eax leal 1(%edx),%edx - jge .L015sub + jge .L017sub sbbl $0,%eax + andl %eax,%esi + notl %eax + movl %edi,%ebp + andl %eax,%ebp + orl %ebp,%esi .align 16 -.L016copy: - movl (%esi,%ebx,4),%edx - movl (%edi,%ebx,4),%ebp - xorl %ebp,%edx - andl %eax,%edx - xorl %ebp,%edx - movl %ecx,(%esi,%ebx,4) - movl %edx,(%edi,%ebx,4) +.L018copy: + movl (%esi,%ebx,4),%eax + movl %eax,(%edi,%ebx,4) + movl %ecx,32(%esp,%ebx,4) decl %ebx - jge .L016copy + jge .L018copy movl 24(%esp),%esp movl $1,%eax .L000just_leave: diff --git a/linux-x86_64/crypto/aes/aes-x86_64.S b/linux-x86_64/crypto/aes/aes-x86_64.S index 
361e84c7..ab1168ed 100644 --- a/linux-x86_64/crypto/aes/aes-x86_64.S +++ b/linux-x86_64/crypto/aes/aes-x86_64.S @@ -332,6 +332,7 @@ _x86_64_AES_encrypt_compact: .type asm_AES_encrypt,@function .hidden asm_AES_encrypt asm_AES_encrypt: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 @@ -340,7 +341,6 @@ asm_AES_encrypt: pushq %r15 - movq %rsp,%r10 leaq -63(%rdx),%rcx andq $-64,%rsp subq %rsp,%rcx @@ -350,7 +350,7 @@ asm_AES_encrypt: subq $32,%rsp movq %rsi,16(%rsp) - movq %r10,24(%rsp) + movq %rax,24(%rsp) .Lenc_prologue: movq %rdx,%r15 @@ -382,13 +382,13 @@ asm_AES_encrypt: movl %ecx,8(%r9) movl %edx,12(%r9) - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lenc_epilogue: .byte 0xf3,0xc3 .size asm_AES_encrypt,.-asm_AES_encrypt @@ -778,6 +778,7 @@ _x86_64_AES_decrypt_compact: .type asm_AES_decrypt,@function .hidden asm_AES_decrypt asm_AES_decrypt: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 @@ -786,7 +787,6 @@ asm_AES_decrypt: pushq %r15 - movq %rsp,%r10 leaq -63(%rdx),%rcx andq $-64,%rsp subq %rsp,%rcx @@ -796,7 +796,7 @@ asm_AES_decrypt: subq $32,%rsp movq %rsi,16(%rsp) - movq %r10,24(%rsp) + movq %rax,24(%rsp) .Ldec_prologue: movq %rdx,%r15 @@ -830,13 +830,13 @@ asm_AES_decrypt: movl %ecx,8(%r9) movl %edx,12(%r9) - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Ldec_epilogue: .byte 0xf3,0xc3 .size asm_AES_decrypt,.-asm_AES_decrypt @@ -1313,10 +1313,9 @@ asm_AES_cbc_encrypt: movl %r9d,%r9d leaq .LAES_Te(%rip),%r14 + leaq .LAES_Td(%rip),%r10 cmpq $0,%r9 - jne .Lcbc_picked_te - leaq .LAES_Td(%rip),%r14 -.Lcbc_picked_te: + cmoveq %r10,%r14 movl OPENSSL_ia32cap_P(%rip),%r10d cmpq $512,%rdx diff --git a/linux-x86_64/crypto/aes/aesni-x86_64.S b/linux-x86_64/crypto/aes/aesni-x86_64.S index 5709a2d0..a90e9350 100644 --- a/linux-x86_64/crypto/aes/aesni-x86_64.S +++ b/linux-x86_64/crypto/aes/aesni-x86_64.S @@ -1032,11 +1032,10 @@ aesni_ctr32_encrypt_blocks: .align 16 .Lctr32_bulk: - leaq (%rsp),%rax + leaq (%rsp),%r11 pushq %rbp subq $128,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp @@ -1045,7 +1044,7 @@ aesni_ctr32_encrypt_blocks: movdqu (%rcx),%xmm0 movl 12(%r8),%r8d pxor %xmm0,%xmm2 - movl 12(%rcx),%r11d + movl 12(%rcx),%ebp movdqa %xmm2,0(%rsp) bswapl %r8d movdqa %xmm2,%xmm3 @@ -1061,8 +1060,8 @@ aesni_ctr32_encrypt_blocks: leaq 2(%r8),%rdx bswapl %eax bswapl %edx - xorl %r11d,%eax - xorl %r11d,%edx + xorl %ebp,%eax + xorl %ebp,%edx .byte 102,15,58,34,216,3 leaq 3(%r8),%rax movdqa %xmm3,16(%rsp) @@ -1071,25 +1070,25 @@ aesni_ctr32_encrypt_blocks: movq %r10,%rdx leaq 4(%r8),%r10 movdqa %xmm4,32(%rsp) - xorl %r11d,%eax + xorl %ebp,%eax bswapl %r10d .byte 102,15,58,34,232,3 - xorl %r11d,%r10d + xorl %ebp,%r10d movdqa %xmm5,48(%rsp) leaq 5(%r8),%r9 movl %r10d,64+12(%rsp) bswapl %r9d leaq 6(%r8),%r10 movl 240(%rcx),%eax - xorl %r11d,%r9d + xorl %ebp,%r9d bswapl %r10d movl %r9d,80+12(%rsp) - xorl %r11d,%r10d + xorl %ebp,%r10d leaq 7(%r8),%r9 movl %r10d,96+12(%rsp) bswapl %r9d movl OPENSSL_ia32cap_P+4(%rip),%r10d - xorl %r11d,%r9d + xorl %ebp,%r9d andl $71303168,%r10d movl %r9d,112+12(%rsp) @@ -1113,7 +1112,7 @@ 
aesni_ctr32_encrypt_blocks: .Lctr32_6x: shll $4,%eax movl $48,%r10d - bswapl %r11d + bswapl %ebp leaq 32(%rcx,%rax,1),%rcx subq %rax,%r10 jmp .Lctr32_loop6 @@ -1124,32 +1123,32 @@ aesni_ctr32_encrypt_blocks: movups -48(%rcx,%r10,1),%xmm0 .byte 102,15,56,220,209 movl %r8d,%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,217 .byte 0x0f,0x38,0xf1,0x44,0x24,12 leal 1(%r8),%eax .byte 102,15,56,220,225 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,28 .byte 102,15,56,220,233 leal 2(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,241 .byte 0x0f,0x38,0xf1,0x44,0x24,44 leal 3(%r8),%eax .byte 102,15,56,220,249 movups -32(%rcx,%r10,1),%xmm1 - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,208 .byte 0x0f,0x38,0xf1,0x44,0x24,60 leal 4(%r8),%eax .byte 102,15,56,220,216 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,76 .byte 102,15,56,220,224 leal 5(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,232 .byte 0x0f,0x38,0xf1,0x44,0x24,92 movq %r10,%rax @@ -1210,7 +1209,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 - xorl %r11d,%r9d + xorl %ebp,%r9d nop .byte 102,15,56,220,233 movl %r9d,0+12(%rsp) @@ -1223,7 +1222,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1237,7 +1236,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1251,7 +1250,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1265,7 +1264,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1279,7 +1278,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1293,7 +1292,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1308,7 +1307,7 @@ aesni_ctr32_encrypt_blocks: .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 - xorl %r11d,%r9d + xorl %ebp,%r9d movdqu 0(%rdi),%xmm10 .byte 102,15,56,220,232 movl %r9d,112+12(%rsp) @@ -1543,7 +1542,7 @@ aesni_ctr32_encrypt_blocks: .Lctr32_done: xorps %xmm0,%xmm0 - xorl %r11d,%r11d + xorl %ebp,%ebp pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 @@ -1567,8 +1566,8 @@ aesni_ctr32_encrypt_blocks: pxor %xmm14,%xmm14 movaps %xmm0,112(%rsp) pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + leaq (%r11),%rsp .Lctr32_epilogue: .byte 0xf3,0xc3 .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks @@ -1577,11 +1576,10 @@ aesni_ctr32_encrypt_blocks: .type aesni_xts_encrypt,@function .align 16 aesni_xts_encrypt: - leaq (%rsp),%rax + leaq (%rsp),%r11 pushq %rbp subq $112,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -1597,7 +1595,7 @@ aesni_xts_encrypt: jnz .Loop_enc1_8 .byte 102,15,56,221,209 movups (%rcx),%xmm0 - movq %rcx,%r11 + 
movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -1653,9 +1651,9 @@ aesni_xts_encrypt: jc .Lxts_enc_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq .Lxts_magic(%rip),%r8 jmp .Lxts_enc_grandloop @@ -1680,7 +1678,7 @@ aesni_xts_encrypt: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 102,15,56,220,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -1689,7 +1687,7 @@ aesni_xts_encrypt: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,220,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 102,15,56,220,208 @@ -1704,7 +1702,7 @@ aesni_xts_encrypt: movdqa %xmm14,64(%rsp) .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) pshufd $0x5f,%xmm15,%xmm9 jmp .Lxts_enc_loop6 @@ -1736,7 +1734,7 @@ aesni_xts_encrypt: psrad $31,%xmm14 .byte 102,15,56,220,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 @@ -1804,10 +1802,10 @@ aesni_xts_encrypt: .byte 102,15,56,220,225 .byte 102,15,56,220,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,220,241 .byte 102,15,56,220,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,221,84,36,0 @@ -1834,7 +1832,7 @@ aesni_xts_encrypt: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax .Lxts_enc_short: @@ -1990,7 +1988,7 @@ aesni_xts_encrypt: jnz .Lxts_enc_steal subq %r9,%rsi - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups -16(%rsi),%xmm2 @@ -2033,8 +2031,8 @@ aesni_xts_encrypt: movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + leaq (%r11),%rsp .Lxts_enc_epilogue: .byte 0xf3,0xc3 .size aesni_xts_encrypt,.-aesni_xts_encrypt @@ -2043,11 +2041,10 @@ aesni_xts_encrypt: .type aesni_xts_decrypt,@function .align 16 aesni_xts_decrypt: - leaq (%rsp),%rax + leaq (%rsp),%r11 pushq %rbp subq $112,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -2069,7 +2066,7 @@ aesni_xts_decrypt: subq %rax,%rdx movups (%rcx),%xmm0 - movq %rcx,%r11 + movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -2125,9 +2122,9 @@ aesni_xts_decrypt: jc .Lxts_dec_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq .Lxts_magic(%rip),%r8 jmp .Lxts_dec_grandloop @@ -2152,7 +2149,7 @@ aesni_xts_decrypt: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 102,15,56,222,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -2161,7 +2158,7 @@ aesni_xts_decrypt: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,222,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 102,15,56,222,208 @@ -2176,7 +2173,7 @@ aesni_xts_decrypt: movdqa %xmm14,64(%rsp) .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) pshufd $0x5f,%xmm15,%xmm9 jmp .Lxts_dec_loop6 @@ -2208,7 +2205,7 @@ aesni_xts_decrypt: psrad $31,%xmm14 .byte 102,15,56,222,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 @@ -2276,10 +2273,10 @@ aesni_xts_decrypt: .byte 
102,15,56,222,225 .byte 102,15,56,222,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,222,241 .byte 102,15,56,222,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,223,84,36,0 @@ -2306,7 +2303,7 @@ aesni_xts_decrypt: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax .Lxts_dec_short: @@ -2463,7 +2460,7 @@ aesni_xts_decrypt: jz .Lxts_dec_ret .Lxts_dec_done2: movq %r9,%rdx - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups (%rdi),%xmm2 @@ -2493,7 +2490,7 @@ aesni_xts_decrypt: jnz .Lxts_dec_steal subq %r9,%rsi - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups (%rsi),%xmm2 @@ -2536,11 +2533,827 @@ aesni_xts_decrypt: movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + leaq (%r11),%rsp .Lxts_dec_epilogue: .byte 0xf3,0xc3 .size aesni_xts_decrypt,.-aesni_xts_decrypt +.globl aesni_ocb_encrypt +.hidden aesni_ocb_encrypt +.type aesni_ocb_encrypt,@function +.align 32 +aesni_ocb_encrypt: + leaq (%rsp),%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + movq 8(%rax),%rbx + movq 8+8(%rax),%rbp + + movl 240(%rcx),%r10d + movq %rcx,%r11 + shll $4,%r10d + movups (%rcx),%xmm9 + movups 16(%rcx,%r10,1),%xmm1 + + movdqu (%r9),%xmm15 + pxor %xmm1,%xmm9 + pxor %xmm1,%xmm15 + + movl $16+32,%eax + leaq 32(%r11,%r10,1),%rcx + movups 16(%r11),%xmm1 + subq %r10,%rax + movq %rax,%r10 + + movdqu (%rbx),%xmm10 + movdqu (%rbp),%xmm8 + + testq $1,%r8 + jnz .Locb_enc_odd + + bsfq %r8,%r12 + addq $1,%r8 + shlq $4,%r12 + movdqu (%rbx,%r12,1),%xmm7 + movdqu (%rdi),%xmm2 + leaq 16(%rdi),%rdi + + call __ocb_encrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,(%rsi) + leaq 16(%rsi),%rsi + subq $1,%rdx + jz .Locb_enc_done + +.Locb_enc_odd: + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + leaq 6(%r8),%r8 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + shlq $4,%r12 + shlq $4,%r13 + shlq $4,%r14 + + subq $6,%rdx + jc .Locb_enc_short + jmp .Locb_enc_grandloop + +.align 32 +.Locb_enc_grandloop: + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + movdqu 48(%rdi),%xmm5 + movdqu 64(%rdi),%xmm6 + movdqu 80(%rdi),%xmm7 + leaq 96(%rdi),%rdi + + call __ocb_encrypt6 + + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + movups %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + subq $6,%rdx + jnc .Locb_enc_grandloop + +.Locb_enc_short: + addq $6,%rdx + jz .Locb_enc_done + + movdqu 0(%rdi),%xmm2 + cmpq $2,%rdx + jb .Locb_enc_one + movdqu 16(%rdi),%xmm3 + je .Locb_enc_two + + movdqu 32(%rdi),%xmm4 + cmpq $4,%rdx + jb .Locb_enc_three + movdqu 48(%rdi),%xmm5 + je .Locb_enc_four + + movdqu 64(%rdi),%xmm6 + pxor %xmm7,%xmm7 + + call __ocb_encrypt6 + + movdqa %xmm14,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + + jmp .Locb_enc_done + +.align 16 +.Locb_enc_one: + movdqa %xmm10,%xmm7 + + call __ocb_encrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,0(%rsi) + jmp .Locb_enc_done + +.align 16 +.Locb_enc_two: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + call __ocb_encrypt4 + + movdqa %xmm11,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + + jmp .Locb_enc_done + +.align 16 +.Locb_enc_three: + pxor %xmm5,%xmm5 + + call __ocb_encrypt4 + + movdqa %xmm12,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + + jmp .Locb_enc_done + +.align 16 +.Locb_enc_four: + call __ocb_encrypt4 
+ + movdqa %xmm13,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + +.Locb_enc_done: + pxor %xmm0,%xmm15 + movdqu %xmm8,(%rbp) + movdqu %xmm15,(%r9) + + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + leaq 40(%rsp),%rax + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Locb_enc_epilogue: + .byte 0xf3,0xc3 +.size aesni_ocb_encrypt,.-aesni_ocb_encrypt + +.type __ocb_encrypt6,@function +.align 32 +__ocb_encrypt6: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + movdqa %xmm10,%xmm14 + pxor %xmm15,%xmm10 + movdqu (%rbx,%r14,1),%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm2,%xmm8 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm3,%xmm8 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm4,%xmm8 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm14 + pxor %xmm5,%xmm8 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm15 + pxor %xmm6,%xmm8 + pxor %xmm14,%xmm6 + pxor %xmm7,%xmm8 + pxor %xmm15,%xmm7 + movups 32(%r11),%xmm0 + + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + addq $6,%r8 + pxor %xmm9,%xmm10 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 +.byte 102,15,56,220,241 + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm14 +.byte 102,15,56,220,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm15 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups 64(%r11),%xmm0 + shlq $4,%r12 + shlq $4,%r13 + jmp .Locb_enc_loop6 + +.align 32 +.Locb_enc_loop6: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_enc_loop6 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups 16(%r11),%xmm1 + shlq $4,%r14 + +.byte 102,65,15,56,221,210 + movdqu (%rbx),%xmm10 + movq %r10,%rax +.byte 102,65,15,56,221,219 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 +.byte 102,65,15,56,221,246 +.byte 102,65,15,56,221,255 + .byte 0xf3,0xc3 +.size __ocb_encrypt6,.-__ocb_encrypt6 + +.type __ocb_encrypt4,@function +.align 32 +__ocb_encrypt4: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + pxor %xmm15,%xmm10 + pxor %xmm10,%xmm11 + pxor %xmm2,%xmm8 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm3,%xmm8 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm4,%xmm8 + pxor %xmm12,%xmm4 + pxor %xmm5,%xmm8 + pxor %xmm13,%xmm5 + movups 32(%r11),%xmm0 + + pxor %xmm9,%xmm10 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 + pxor %xmm9,%xmm13 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 48(%r11),%xmm1 + +.byte 
102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups 64(%r11),%xmm0 + jmp .Locb_enc_loop4 + +.align 32 +.Locb_enc_loop4: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_enc_loop4 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,65,15,56,221,210 +.byte 102,65,15,56,221,219 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 + .byte 0xf3,0xc3 +.size __ocb_encrypt4,.-__ocb_encrypt4 + +.type __ocb_encrypt1,@function +.align 32 +__ocb_encrypt1: + pxor %xmm15,%xmm7 + pxor %xmm9,%xmm7 + pxor %xmm2,%xmm8 + pxor %xmm7,%xmm2 + movups 32(%r11),%xmm0 + +.byte 102,15,56,220,209 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm7 + +.byte 102,15,56,220,208 + movups 64(%r11),%xmm0 + jmp .Locb_enc_loop1 + +.align 32 +.Locb_enc_loop1: +.byte 102,15,56,220,209 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_enc_loop1 + +.byte 102,15,56,220,209 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,15,56,221,215 + .byte 0xf3,0xc3 +.size __ocb_encrypt1,.-__ocb_encrypt1 + +.globl aesni_ocb_decrypt +.hidden aesni_ocb_decrypt +.type aesni_ocb_decrypt,@function +.align 32 +aesni_ocb_decrypt: + leaq (%rsp),%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + movq 8(%rax),%rbx + movq 8+8(%rax),%rbp + + movl 240(%rcx),%r10d + movq %rcx,%r11 + shll $4,%r10d + movups (%rcx),%xmm9 + movups 16(%rcx,%r10,1),%xmm1 + + movdqu (%r9),%xmm15 + pxor %xmm1,%xmm9 + pxor %xmm1,%xmm15 + + movl $16+32,%eax + leaq 32(%r11,%r10,1),%rcx + movups 16(%r11),%xmm1 + subq %r10,%rax + movq %rax,%r10 + + movdqu (%rbx),%xmm10 + movdqu (%rbp),%xmm8 + + testq $1,%r8 + jnz .Locb_dec_odd + + bsfq %r8,%r12 + addq $1,%r8 + shlq $4,%r12 + movdqu (%rbx,%r12,1),%xmm7 + movdqu (%rdi),%xmm2 + leaq 16(%rdi),%rdi + + call __ocb_decrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm8 + leaq 16(%rsi),%rsi + subq $1,%rdx + jz .Locb_dec_done + +.Locb_dec_odd: + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + leaq 6(%r8),%r8 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + shlq $4,%r12 + shlq $4,%r13 + shlq $4,%r14 + + subq $6,%rdx + jc .Locb_dec_short + jmp .Locb_dec_grandloop + +.align 32 +.Locb_dec_grandloop: + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + movdqu 48(%rdi),%xmm5 + movdqu 64(%rdi),%xmm6 + movdqu 80(%rdi),%xmm7 + leaq 96(%rdi),%rdi + + call __ocb_decrypt6 + + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm8 + movups %xmm7,80(%rsi) + pxor %xmm7,%xmm8 + leaq 96(%rsi),%rsi + subq $6,%rdx + jnc .Locb_dec_grandloop + +.Locb_dec_short: + addq $6,%rdx + jz .Locb_dec_done + + movdqu 0(%rdi),%xmm2 + cmpq $2,%rdx + jb .Locb_dec_one + movdqu 16(%rdi),%xmm3 + je .Locb_dec_two + + movdqu 32(%rdi),%xmm4 + cmpq $4,%rdx + jb .Locb_dec_three + movdqu 48(%rdi),%xmm5 + je .Locb_dec_four + + movdqu 64(%rdi),%xmm6 + pxor %xmm7,%xmm7 + + call __ocb_decrypt6 + + movdqa %xmm14,%xmm15 + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups 
%xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm8 + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_one: + movdqa %xmm10,%xmm7 + + call __ocb_decrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + jmp .Locb_dec_done + +.align 16 +.Locb_dec_two: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + call __ocb_decrypt4 + + movdqa %xmm11,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + movups %xmm3,16(%rsi) + xorps %xmm3,%xmm8 + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_three: + pxor %xmm5,%xmm5 + + call __ocb_decrypt4 + + movdqa %xmm12,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + movups %xmm3,16(%rsi) + xorps %xmm3,%xmm8 + movups %xmm4,32(%rsi) + xorps %xmm4,%xmm8 + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_four: + call __ocb_decrypt4 + + movdqa %xmm13,%xmm15 + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + +.Locb_dec_done: + pxor %xmm0,%xmm15 + movdqu %xmm8,(%rbp) + movdqu %xmm15,(%r9) + + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + leaq 40(%rsp),%rax + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Locb_dec_epilogue: + .byte 0xf3,0xc3 +.size aesni_ocb_decrypt,.-aesni_ocb_decrypt + +.type __ocb_decrypt6,@function +.align 32 +__ocb_decrypt6: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + movdqa %xmm10,%xmm14 + pxor %xmm15,%xmm10 + movdqu (%rbx,%r14,1),%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm14 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm15 + pxor %xmm14,%xmm6 + pxor %xmm15,%xmm7 + movups 32(%r11),%xmm0 + + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + addq $6,%r8 + pxor %xmm9,%xmm10 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 +.byte 102,15,56,222,241 + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm14 +.byte 102,15,56,222,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm15 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups 64(%r11),%xmm0 + shlq $4,%r12 + shlq $4,%r13 + jmp .Locb_dec_loop6 + +.align 32 +.Locb_dec_loop6: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_dec_loop6 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups 16(%r11),%xmm1 + shlq $4,%r14 + +.byte 102,65,15,56,223,210 + movdqu (%rbx),%xmm10 + movq %r10,%rax +.byte 102,65,15,56,223,219 +.byte 102,65,15,56,223,228 
+.byte 102,65,15,56,223,237 +.byte 102,65,15,56,223,246 +.byte 102,65,15,56,223,255 + .byte 0xf3,0xc3 +.size __ocb_decrypt6,.-__ocb_decrypt6 + +.type __ocb_decrypt4,@function +.align 32 +__ocb_decrypt4: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + pxor %xmm15,%xmm10 + pxor %xmm10,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm5 + movups 32(%r11),%xmm0 + + pxor %xmm9,%xmm10 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 + pxor %xmm9,%xmm13 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 48(%r11),%xmm1 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups 64(%r11),%xmm0 + jmp .Locb_dec_loop4 + +.align 32 +.Locb_dec_loop4: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_dec_loop4 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,65,15,56,223,210 +.byte 102,65,15,56,223,219 +.byte 102,65,15,56,223,228 +.byte 102,65,15,56,223,237 + .byte 0xf3,0xc3 +.size __ocb_decrypt4,.-__ocb_decrypt4 + +.type __ocb_decrypt1,@function +.align 32 +__ocb_decrypt1: + pxor %xmm15,%xmm7 + pxor %xmm9,%xmm7 + pxor %xmm7,%xmm2 + movups 32(%r11),%xmm0 + +.byte 102,15,56,222,209 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm7 + +.byte 102,15,56,222,208 + movups 64(%r11),%xmm0 + jmp .Locb_dec_loop1 + +.align 32 +.Locb_dec_loop1: +.byte 102,15,56,222,209 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_dec_loop1 + +.byte 102,15,56,222,209 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,15,56,223,215 + .byte 0xf3,0xc3 +.size __ocb_decrypt1,.-__ocb_decrypt1 .globl aesni_cbc_encrypt .hidden aesni_cbc_encrypt .type aesni_cbc_encrypt,@function @@ -2638,11 +3451,11 @@ aesni_cbc_encrypt: jmp .Lcbc_ret .align 16 .Lcbc_decrypt_bulk: - leaq (%rsp),%rax + leaq (%rsp),%r11 pushq %rbp subq $16,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp + movq %rcx,%rbp movups (%r8),%xmm10 movl %r10d,%eax cmpq $0x50,%rdx @@ -2682,7 +3495,7 @@ aesni_cbc_encrypt: pxor %xmm0,%xmm3 movups 16-112(%rcx),%xmm1 pxor %xmm0,%xmm4 - xorq %r11,%r11 + movq $-1,%rbp cmpq $0x70,%rdx pxor %xmm0,%xmm5 pxor %xmm0,%xmm6 @@ -2698,10 +3511,10 @@ aesni_cbc_encrypt: .byte 102,15,56,222,241 .byte 102,15,56,222,249 .byte 102,68,15,56,222,193 - setnc %r11b - shlq $7,%r11 + adcq $0,%rbp + andq $128,%rbp .byte 102,68,15,56,222,201 - addq %rdi,%r11 + addq %rdi,%rbp movups 48-112(%rcx),%xmm1 .byte 102,15,56,222,208 .byte 102,15,56,222,216 @@ -2839,18 +3652,18 @@ aesni_cbc_encrypt: movdqu 112(%rdi),%xmm0 .byte 102,65,15,56,223,228 leaq 128(%rdi),%rdi - movdqu 0(%r11),%xmm11 + movdqu 0(%rbp),%xmm11 .byte 102,65,15,56,223,237 .byte 102,65,15,56,223,246 - movdqu 16(%r11),%xmm12 - movdqu 32(%r11),%xmm13 + movdqu 16(%rbp),%xmm12 + movdqu 32(%rbp),%xmm13 .byte 102,65,15,56,223,255 .byte 102,68,15,56,223,193 - movdqu 48(%r11),%xmm14 - movdqu 64(%r11),%xmm15 + movdqu 48(%rbp),%xmm14 + movdqu 64(%rbp),%xmm15 .byte 102,69,15,56,223,202 movdqa %xmm0,%xmm10 - movdqu 80(%r11),%xmm1 + movdqu 80(%rbp),%xmm1 movups -112(%rcx),%xmm0 movups 
%xmm2,(%rsi) @@ -2969,7 +3782,7 @@ aesni_cbc_encrypt: pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) pxor %xmm14,%xmm6 - movq %r11,%rcx + movq %rbp,%rcx movdqu %xmm5,48(%rsi) pxor %xmm15,%xmm7 movl %r10d,%eax @@ -3122,8 +3935,8 @@ aesni_cbc_encrypt: .Lcbc_dec_ret: xorps %xmm0,%xmm0 pxor %xmm1,%xmm1 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + leaq (%r11),%rsp .Lcbc_ret: .byte 0xf3,0xc3 .size aesni_cbc_encrypt,.-aesni_cbc_encrypt diff --git a/linux-x86_64/crypto/aes/bsaes-x86_64.S b/linux-x86_64/crypto/aes/bsaes-x86_64.S index c5491ce4..3f3c73bb 100644 --- a/linux-x86_64/crypto/aes/bsaes-x86_64.S +++ b/linux-x86_64/crypto/aes/bsaes-x86_64.S @@ -1305,15 +1305,14 @@ bsaes_cbc_encrypt: cmpq %rax,%rbp ja .Lcbc_dec_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbx + movq -8(%rax),%rbp + leaq (%rax),%rsp .Lcbc_dec_epilogue: .byte 0xf3,0xc3 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt @@ -1506,15 +1505,14 @@ bsaes_ctr32_encrypt_blocks: cmpq %rax,%rbp ja .Lctr_enc_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbx + movq -8(%rax),%rbp + leaq (%rax),%rsp .Lctr_enc_epilogue: .byte 0xf3,0xc3 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks @@ -1958,15 +1956,14 @@ bsaes_xts_encrypt: cmpq %rax,%rbp ja .Lxts_enc_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbx + movq -8(%rax),%rbp + leaq (%rax),%rsp .Lxts_enc_epilogue: .byte 0xf3,0xc3 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt @@ -2437,15 +2434,14 @@ bsaes_xts_decrypt: cmpq %rax,%rbp ja .Lxts_dec_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbx + movq -8(%rax),%rbp + leaq (%rax),%rsp .Lxts_dec_epilogue: .byte 0xf3,0xc3 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt diff --git a/linux-x86_64/crypto/bn/x86_64-mont.S b/linux-x86_64/crypto/bn/x86_64-mont.S index 83926ad7..0d2cea2e 100644 --- a/linux-x86_64/crypto/bn/x86_64-mont.S +++ b/linux-x86_64/crypto/bn/x86_64-mont.S @@ -9,6 +9,10 @@ .type bn_mul_mont,@function .align 16 bn_mul_mont: +.cfi_startproc + movl %r9d,%r9d + movq %rsp,%rax +.cfi_def_cfa_register %rax testl $3,%r9d jnz .Lmul_enter cmpl $8,%r9d @@ -22,20 +26,50 @@ bn_mul_mont: .align 16 .Lmul_enter: pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 - movl %r9d,%r9d - leaq 2(%r9),%r10 + negq %r9 movq %rsp,%r11 - negq %r10 - leaq (%rsp,%r10,8),%rsp - andq $-1024,%rsp + leaq -16(%rsp,%r9,8),%r10 + negq %r9 + andq 
$-1024,%r10 + + - movq %r11,8(%rsp,%r9,8) + + + + + + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + +.align 16 +.Lmul_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + + movq %rax,8(%rsp,%r9,8) +.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 .Lmul_body: movq %rdx,%r12 movq (%r8),%r8 @@ -187,51 +221,86 @@ bn_mul_mont: sbbq $0,%rax xorq %r14,%r14 + andq %rax,%rsi + notq %rax + movq %rdi,%rcx + andq %rax,%rcx movq %r9,%r15 + orq %rcx,%rsi .align 16 .Lcopy: - movq (%rsp,%r14,8),%rsi - movq (%rdi,%r14,8),%rcx - xorq %rcx,%rsi - andq %rax,%rsi - xorq %rcx,%rsi + movq (%rsi,%r14,8),%rax movq %r14,(%rsp,%r14,8) - movq %rsi,(%rdi,%r14,8) + movq %rax,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz .Lcopy movq 8(%rsp,%r9,8),%rsi +.cfi_def_cfa %rsi,8 movq $1,%rax - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_mul_mont,.-bn_mul_mont .type bn_mul4x_mont,@function .align 16 bn_mul4x_mont: +.cfi_startproc + movl %r9d,%r9d + movq %rsp,%rax +.cfi_def_cfa_register %rax .Lmul4x_enter: pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 - movl %r9d,%r9d - leaq 4(%r9),%r10 + negq %r9 movq %rsp,%r11 - negq %r10 - leaq (%rsp,%r10,8),%rsp - andq $-1024,%rsp + leaq -32(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 - movq %r11,8(%rsp,%r9,8) + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + +.Lmul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + + movq %rax,8(%rsp,%r9,8) +.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 .Lmul4x_body: movq %rdi,16(%rsp,%r9,8) movq %rdx,%r12 @@ -531,9 +600,11 @@ bn_mul4x_mont: cmpq %r9,%r14 jb .Louter4x movq 16(%rsp,%r9,8),%rdi + leaq -4(%r9),%r15 movq 0(%rsp),%rax + pxor %xmm0,%xmm0 movq 8(%rsp),%rdx - shrq $2,%r9 + shrq $2,%r15 leaq (%rsp),%rsi xorq %r14,%r14 @@ -541,7 +612,6 @@ bn_mul4x_mont: movq 16(%rsi),%rbx movq 24(%rsi),%rbp sbbq 8(%rcx),%rdx - leaq -1(%r9),%r15 jmp .Lsub4x .align 16 .Lsub4x: @@ -569,47 +639,55 @@ bn_mul4x_mont: movq %rbx,16(%rdi,%r14,8) sbbq $0,%rax - movq %rax,%xmm0 - punpcklqdq %xmm0,%xmm0 movq %rbp,24(%rdi,%r14,8) xorq %r14,%r14 - - movq %r9,%r15 - pxor %xmm5,%xmm5 + andq %rax,%rsi + notq %rax + movq %rdi,%rcx + andq %rax,%rcx + leaq -4(%r9),%r15 + orq %rcx,%rsi + shrq $2,%r15 + + movdqu (%rsi),%xmm1 + movdqa %xmm0,(%rsp) + movdqu %xmm1,(%rdi) jmp .Lcopy4x .align 16 .Lcopy4x: - movdqu (%rsp,%r14,1),%xmm2 - movdqu 16(%rsp,%r14,1),%xmm4 - movdqu (%rdi,%r14,1),%xmm1 - movdqu 16(%rdi,%r14,1),%xmm3 - pxor %xmm1,%xmm2 - pxor %xmm3,%xmm4 - pand %xmm0,%xmm2 - pand %xmm0,%xmm4 - pxor %xmm1,%xmm2 - pxor %xmm3,%xmm4 - movdqu %xmm2,(%rdi,%r14,1) - movdqu %xmm4,16(%rdi,%r14,1) - movdqa 
%xmm5,(%rsp,%r14,1) - movdqa %xmm5,16(%rsp,%r14,1) - + movdqu 16(%rsi,%r14,1),%xmm2 + movdqu 32(%rsi,%r14,1),%xmm1 + movdqa %xmm0,16(%rsp,%r14,1) + movdqu %xmm2,16(%rdi,%r14,1) + movdqa %xmm0,32(%rsp,%r14,1) + movdqu %xmm1,32(%rdi,%r14,1) leaq 32(%r14),%r14 decq %r15 jnz .Lcopy4x - shlq $2,%r9 + movdqu 16(%rsi,%r14,1),%xmm2 + movdqa %xmm0,16(%rsp,%r14,1) + movdqu %xmm2,16(%rdi,%r14,1) movq 8(%rsp,%r9,8),%rsi +.cfi_def_cfa %rsi, 8 movq $1,%rax - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul4x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_mul4x_mont,.-bn_mul4x_mont .extern bn_sqr8x_internal .hidden bn_sqr8x_internal @@ -617,14 +695,23 @@ bn_mul4x_mont: .type bn_sqr8x_mont,@function .align 32 bn_sqr8x_mont: -.Lsqr8x_enter: +.cfi_startproc movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lsqr8x_enter: pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 +.Lsqr8x_prologue: movl %r9d,%r10d shll $3,%r9d @@ -637,30 +724,49 @@ bn_sqr8x_mont: leaq -64(%rsp,%r9,2),%r11 + movq %rsp,%rbp movq (%r8),%r8 subq %rsi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lsqr8x_sp_alt - subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + subq %r11,%rbp + leaq -64(%rbp,%r9,2),%rbp jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq -64(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 - subq %r11,%rsp + subq %r11,%rbp .Lsqr8x_sp_done: - andq $-64,%rsp + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lsqr8x_page_walk + jmp .Lsqr8x_page_walk_done + +.align 16 +.Lsqr8x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lsqr8x_page_walk +.Lsqr8x_page_walk_done: + movq %r9,%r10 negq %r9 movq %r8,32(%rsp) movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lsqr8x_body: .byte 102,72,15,110,209 @@ -707,6 +813,7 @@ bn_sqr8x_mont: pxor %xmm0,%xmm0 pshufd $0,%xmm1,%xmm1 movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 jmp .Lsqr8x_cond_copy .align 32 @@ -736,14 +843,22 @@ bn_sqr8x_mont: movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lsqr8x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_sqr8x_mont,.-bn_sqr8x_mont .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 diff --git a/linux-x86_64/crypto/bn/x86_64-mont5.S b/linux-x86_64/crypto/bn/x86_64-mont5.S index 5d7502c3..33ca3c43 100644 --- a/linux-x86_64/crypto/bn/x86_64-mont5.S +++ b/linux-x86_64/crypto/bn/x86_64-mont5.S @@ -9,30 +9,64 @@ .type bn_mul_mont_gather5,@function .align 64 bn_mul_mont_gather5: +.cfi_startproc + 
movl %r9d,%r9d + movq %rsp,%rax +.cfi_def_cfa_register %rax testl $7,%r9d jnz .Lmul_enter jmp .Lmul4x_enter .align 16 .Lmul_enter: - movl %r9d,%r9d - movq %rsp,%rax movd 8(%rsp),%xmm5 - leaq .Linc(%rip),%r10 pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 + + negq %r9 + movq %rsp,%r11 + leaq -280(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + - leaq 2(%r9),%r11 - negq %r11 - leaq -264(%rsp,%r11,8),%rsp - andq $-1024,%rsp + + + + + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + +.Lmul_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + + leaq .Linc(%rip),%r10 movq %rax,8(%rsp,%r9,8) +.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 .Lmul_body: + leaq 128(%rdx),%r12 movdqa 0(%r10),%xmm0 movdqa 16(%r10),%xmm1 @@ -371,45 +405,64 @@ bn_mul_mont_gather5: sbbq $0,%rax xorq %r14,%r14 + andq %rax,%rsi + notq %rax + movq %rdi,%rcx + andq %rax,%rcx movq %r9,%r15 + orq %rcx,%rsi .align 16 .Lcopy: - movq (%rsp,%r14,8),%rsi - movq (%rdi,%r14,8),%rcx - xorq %rcx,%rsi - andq %rax,%rsi - xorq %rcx,%rsi + movq (%rsi,%r14,8),%rax movq %r14,(%rsp,%r14,8) - movq %rsi,(%rdi,%r14,8) + movq %rax,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz .Lcopy movq 8(%rsp,%r9,8),%rsi +.cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 .type bn_mul4x_mont_gather5,@function .align 32 bn_mul4x_mont_gather5: -.Lmul4x_enter: +.cfi_startproc .byte 0x67 movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lmul4x_enter: pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 +.Lmul4x_prologue: .byte 0x67 shll $3,%r9d @@ -426,43 +479,70 @@ bn_mul4x_mont_gather5: leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lmul4xsp_alt - subq %r11,%rsp - leaq -320(%rsp,%r9,2),%rsp + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: leaq 4096-320(,%r9,2),%r10 - leaq -320(%rsp,%r9,2),%rsp + leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 - subq %r11,%rsp + subq %r11,%rbp .Lmul4xsp_done: - andq $-64,%rsp + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + +.Lmul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + negq %r9 movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lmul4x_body: call mul4x_internal movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp 
+.cfi_def_cfa_register %rsp .Lmul4x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 .type mul4x_internal,@function @@ -995,13 +1075,22 @@ mul4x_internal: .type bn_power5,@function .align 32 bn_power5: +.cfi_startproc movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 +.Lpower5_prologue: shll $3,%r9d leal (%r9,%r9,2),%r10d @@ -1016,24 +1105,41 @@ bn_power5: leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lpwr_sp_alt - subq %r11,%rsp - leaq -320(%rsp,%r9,2),%rsp + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: leaq 4096-320(,%r9,2),%r10 - leaq -320(%rsp,%r9,2),%rsp + leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 - subq %r11,%rsp + subq %r11,%rbp .Lpwr_sp_done: - andq $-64,%rsp + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwr_page_walk + jmp .Lpwr_page_walk_done + +.Lpwr_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwr_page_walk +.Lpwr_page_walk_done: + movq %r9,%r10 negq %r9 @@ -1048,6 +1154,7 @@ bn_power5: movq %r8,32(%rsp) movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lpower5_body: .byte 102,72,15,110,207 .byte 102,72,15,110,209 @@ -1074,16 +1181,25 @@ bn_power5: call mul4x_internal movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lpower5_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_power5,.-bn_power5 .globl bn_sqr8x_internal @@ -1936,14 +2052,23 @@ bn_from_montgomery: .type bn_from_mont8x,@function .align 32 bn_from_mont8x: +.cfi_startproc .byte 0x67 movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 +.Lfrom_prologue: shll $3,%r9d leaq (%r9,%r9,2),%r10 @@ -1958,24 +2083,41 @@ bn_from_mont8x: leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lfrom_sp_alt - subq %r11,%rsp - leaq -320(%rsp,%r9,2),%rsp + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp jmp .Lfrom_sp_done .align 32 .Lfrom_sp_alt: leaq 4096-320(,%r9,2),%r10 - leaq -320(%rsp,%r9,2),%rsp + leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 - subq %r11,%rsp + subq %r11,%rbp .Lfrom_sp_done: - andq $-64,%rsp + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lfrom_page_walk + jmp .Lfrom_page_walk_done + +.Lfrom_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lfrom_page_walk +.Lfrom_page_walk_done: + movq %r9,%r10 negq %r9 @@ -1990,6 +2132,7 @@ bn_from_mont8x: movq %r8,32(%rsp) movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lfrom_body: movq %r9,%r11 leaq 48(%rsp),%rax @@ -2025,11 +2168,12 @@ bn_from_mont8x: pxor %xmm0,%xmm0 leaq 48(%rsp),%rax - movq 40(%rsp),%rsi jmp .Lfrom_mont_zero 
.align 32 .Lfrom_mont_zero: + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 movdqa %xmm0,0(%rax) movdqa %xmm0,16(%rax) movdqa %xmm0,32(%rax) @@ -2040,14 +2184,22 @@ bn_from_mont8x: movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lfrom_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_from_mont8x,.-bn_from_mont8x .globl bn_scatter5 .hidden bn_scatter5 diff --git a/linux-x86_64/crypto/chacha/chacha-x86_64.S b/linux-x86_64/crypto/chacha/chacha-x86_64.S index e994940a..25ec715f 100644 --- a/linux-x86_64/crypto/chacha/chacha-x86_64.S +++ b/linux-x86_64/crypto/chacha/chacha-x86_64.S @@ -23,6 +23,15 @@ .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe .Lsigma: .byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 +.align 64 +.Lzeroz: +.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 +.Lfourz: +.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 +.Lincz: +.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +.Lsixteen: +.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .globl ChaCha20_ctr32 .hidden ChaCha20_ctr32 @@ -42,6 +51,7 @@ ChaCha20_ctr32: pushq %r14 pushq %r15 subq $64+24,%rsp +.Lctr32_body: movdqu (%rcx),%xmm1 @@ -279,13 +289,14 @@ ChaCha20_ctr32: jnz .Loop_tail .Ldone: - addq $64+24,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx + leaq 64+24+48(%rsp),%rsi + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lno_data: .byte 0xf3,0xc3 .size ChaCha20_ctr32,.-ChaCha20_ctr32 @@ -293,18 +304,12 @@ ChaCha20_ctr32: .align 32 ChaCha20_ssse3: .LChaCha20_ssse3: + movq %rsp,%r9 cmpq $128,%rdx ja .LChaCha20_4x .Ldo_sse3_after_all: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - subq $64+24,%rsp + subq $64+8,%rsp movdqa .Lsigma(%rip),%xmm0 movdqu (%rcx),%xmm1 movdqu 16(%rcx),%xmm2 @@ -316,7 +321,7 @@ ChaCha20_ssse3: movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) - movl $10,%ebp + movq $10,%r8 jmp .Loop_ssse3 .align 32 @@ -326,7 +331,7 @@ ChaCha20_ssse3: movdqa 16(%rsp),%xmm1 movdqa 32(%rsp),%xmm2 paddd 48(%rsp),%xmm3 - movl $10,%ebp + movq $10,%r8 movdqa %xmm3,48(%rsp) jmp .Loop_ssse3 @@ -375,7 +380,7 @@ ChaCha20_ssse3: pshufd $78,%xmm2,%xmm2 pshufd $147,%xmm1,%xmm1 pshufd $57,%xmm3,%xmm3 - decl %ebp + decq %r8 jnz .Loop_ssse3 paddd 0(%rsp),%xmm0 paddd 16(%rsp),%xmm1 @@ -412,31 +417,27 @@ ChaCha20_ssse3: movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) - xorq %rbx,%rbx + xorq %r8,%r8 .Loop_tail_ssse3: - movzbl (%rsi,%rbx,1),%eax - movzbl (%rsp,%rbx,1),%ecx - leaq 1(%rbx),%rbx + movzbl (%rsi,%r8,1),%eax + movzbl (%rsp,%r8,1),%ecx + leaq 1(%r8),%r8 xorl %ecx,%eax - movb %al,-1(%rdi,%rbx,1) + movb %al,-1(%rdi,%r8,1) decq %rdx jnz .Loop_tail_ssse3 .Ldone_ssse3: - addq $64+24,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx + leaq (%r9),%rsp +.Lssse3_epilogue: .byte 0xf3,0xc3 .size ChaCha20_ssse3,.-ChaCha20_ssse3 .type ChaCha20_4x,@function .align 32 ChaCha20_4x: .LChaCha20_4x: + movq %rsp,%r9 movq %r10,%r11 shrq $32,%r10 testq $32,%r10 @@ -449,8 +450,7 
@@ ChaCha20_4x: je .Ldo_sse3_after_all .Lproceed4x: - leaq -120(%rsp),%r11 - subq $0x148+0,%rsp + subq $0x140+8,%rsp movdqa .Lsigma(%rip),%xmm11 movdqu (%rcx),%xmm15 movdqu 16(%rcx),%xmm7 @@ -977,18 +977,18 @@ ChaCha20_4x: jnz .Loop_tail4x .Ldone4x: - addq $0x148+0,%rsp + leaq (%r9),%rsp +.L4x_epilogue: .byte 0xf3,0xc3 .size ChaCha20_4x,.-ChaCha20_4x .type ChaCha20_8x,@function .align 32 ChaCha20_8x: .LChaCha20_8x: - movq %rsp,%r10 + movq %rsp,%r9 subq $0x280+8,%rsp andq $-32,%rsp vzeroupper - movq %r10,640(%rsp) @@ -1579,7 +1579,8 @@ ChaCha20_8x: .Ldone8x: vzeroall - movq 640(%rsp),%rsp + leaq (%r9),%rsp +.L8x_epilogue: .byte 0xf3,0xc3 .size ChaCha20_8x,.-ChaCha20_8x #endif diff --git a/linux-x86_64/crypto/modes/ghash-x86_64.S b/linux-x86_64/crypto/modes/ghash-x86_64.S index b6ca45ff..8842c279 100644 --- a/linux-x86_64/crypto/modes/ghash-x86_64.S +++ b/linux-x86_64/crypto/modes/ghash-x86_64.S @@ -11,6 +11,10 @@ gcm_gmult_4bit: pushq %rbx pushq %rbp pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $280,%rsp .Lgmult_prologue: movzbq 15(%rdi),%r8 @@ -87,8 +91,9 @@ gcm_gmult_4bit: movq %r8,8(%rdi) movq %r9,(%rdi) - movq 16(%rsp),%rbx - leaq 24(%rsp),%rsp + leaq 280+48(%rsp),%rsi + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lgmult_epilogue: .byte 0xf3,0xc3 .size gcm_gmult_4bit,.-gcm_gmult_4bit @@ -648,14 +653,14 @@ gcm_ghash_4bit: movq %r8,8(%rdi) movq %r9,(%rdi) - leaq 280(%rsp),%rsi - movq 0(%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + leaq 280+48(%rsp),%rsi + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq 0(%rsi),%rsp .Lghash_epilogue: .byte 0xf3,0xc3 .size gcm_ghash_4bit,.-gcm_ghash_4bit diff --git a/linux-x86_64/crypto/sha/sha1-x86_64.S b/linux-x86_64/crypto/sha/sha1-x86_64.S index d830b534..567bdfd1 100644 --- a/linux-x86_64/crypto/sha/sha1-x86_64.S +++ b/linux-x86_64/crypto/sha/sha1-x86_64.S @@ -1241,14 +1241,13 @@ sha1_block_data_order: .align 16 sha1_block_data_order_ssse3: _ssse3_shortcut: - movq %rsp,%rax + movq %rsp,%r11 pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 leaq -64(%rsp),%rsp - movq %rax,%r14 andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 @@ -1256,7 +1255,7 @@ _ssse3_shortcut: shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 + leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -1268,8 +1267,8 @@ _ssse3_shortcut: xorl %edx,%edi andl %edi,%esi - movdqa 64(%r11),%xmm6 - movdqa -64(%r11),%xmm9 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -1345,7 +1344,7 @@ _ssse3_shortcut: pslld $2,%xmm9 pxor %xmm10,%xmm4 xorl %ebp,%edx - movdqa -64(%r11),%xmm10 + movdqa -64(%r14),%xmm10 roll $5,%ecx addl %edi,%ebx andl %edx,%esi @@ -1406,7 +1405,7 @@ _ssse3_shortcut: pslld $2,%xmm10 pxor %xmm8,%xmm5 xorl %eax,%ebp - movdqa -32(%r11),%xmm8 + movdqa -32(%r14),%xmm8 roll $5,%edx addl %edi,%ecx andl %ebp,%esi @@ -1467,7 +1466,7 @@ _ssse3_shortcut: pslld $2,%xmm8 pxor %xmm9,%xmm6 xorl %ebx,%eax - movdqa -32(%r11),%xmm9 + movdqa -32(%r14),%xmm9 roll $5,%ebp addl %edi,%edx andl %eax,%esi @@ -1528,7 +1527,7 @@ _ssse3_shortcut: pslld $2,%xmm9 pxor %xmm10,%xmm7 xorl %ecx,%ebx - movdqa -32(%r11),%xmm10 + movdqa -32(%r14),%xmm10 roll $5,%eax addl %edi,%ebp andl %ebx,%esi @@ -1639,7 +1638,7 @@ _ssse3_shortcut: pxor %xmm3,%xmm2 addl %esi,%eax xorl %edx,%edi - movdqa 0(%r11),%xmm10 + movdqa 0(%r14),%xmm10 rorl $7,%ecx paddd 
%xmm1,%xmm9 addl %ebx,%eax @@ -1874,7 +1873,7 @@ _ssse3_shortcut: pxor %xmm0,%xmm7 roll $5,%ebx addl %esi,%eax - movdqa 32(%r11),%xmm9 + movdqa 32(%r14),%xmm9 xorl %ecx,%edi paddd %xmm6,%xmm8 xorl %edx,%ecx @@ -2165,8 +2164,8 @@ _ssse3_shortcut: addl %edx,%ecx cmpq %r10,%r9 je .Ldone_ssse3 - movdqa 64(%r11),%xmm6 - movdqa -64(%r11),%xmm9 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -2403,13 +2402,12 @@ _ssse3_shortcut: movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp + movq -40(%r11),%r14 + movq -32(%r11),%r13 + movq -24(%r11),%r12 + movq -16(%r11),%rbp + movq -8(%r11),%rbx + leaq (%r11),%rsp .Lepilogue_ssse3: .byte 0xf3,0xc3 .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 @@ -2417,7 +2415,7 @@ _ssse3_shortcut: .align 16 sha1_block_data_order_avx: _avx_shortcut: - movq %rsp,%rax + movq %rsp,%r11 pushq %rbx pushq %rbp pushq %r12 @@ -2425,7 +2423,6 @@ _avx_shortcut: pushq %r14 leaq -64(%rsp),%rsp vzeroupper - movq %rax,%r14 andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 @@ -2433,7 +2430,7 @@ _avx_shortcut: shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 + leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -2445,8 +2442,8 @@ _avx_shortcut: xorl %edx,%edi andl %edi,%esi - vmovdqa 64(%r11),%xmm6 - vmovdqa -64(%r11),%xmm11 + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 vmovdqu 0(%r9),%xmm0 vmovdqu 16(%r9),%xmm1 vmovdqu 32(%r9),%xmm2 @@ -2571,7 +2568,7 @@ _avx_shortcut: vpxor %xmm10,%xmm5,%xmm5 xorl %eax,%ebp shldl $5,%edx,%edx - vmovdqa -32(%r11),%xmm11 + vmovdqa -32(%r14),%xmm11 addl %edi,%ecx andl %ebp,%esi xorl %eax,%ebp @@ -2784,7 +2781,7 @@ _avx_shortcut: addl %esi,%eax xorl %edx,%edi vpaddd %xmm1,%xmm11,%xmm9 - vmovdqa 0(%r11),%xmm11 + vmovdqa 0(%r14),%xmm11 shrdl $7,%ecx,%ecx addl %ebx,%eax vpxor %xmm8,%xmm2,%xmm2 @@ -3003,7 +3000,7 @@ _avx_shortcut: movl %ebx,%edi xorl %edx,%esi vpaddd %xmm6,%xmm11,%xmm9 - vmovdqa 32(%r11),%xmm11 + vmovdqa 32(%r14),%xmm11 shldl $5,%ebx,%ebx addl %esi,%eax vpxor %xmm8,%xmm7,%xmm7 @@ -3282,8 +3279,8 @@ _avx_shortcut: addl %edx,%ecx cmpq %r10,%r9 je .Ldone_avx - vmovdqa 64(%r11),%xmm6 - vmovdqa -64(%r11),%xmm11 + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 vmovdqu 0(%r9),%xmm0 vmovdqu 16(%r9),%xmm1 vmovdqu 32(%r9),%xmm2 @@ -3519,13 +3516,12 @@ _avx_shortcut: movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp + movq -40(%r11),%r14 + movq -32(%r11),%r13 + movq -24(%r11),%r12 + movq -16(%r11),%rbp + movq -8(%r11),%rbx + leaq (%r11),%rsp .Lepilogue_avx: .byte 0xf3,0xc3 .size sha1_block_data_order_avx,.-sha1_block_data_order_avx diff --git a/linux-x86_64/crypto/sha/sha256-x86_64.S b/linux-x86_64/crypto/sha/sha256-x86_64.S index 445b497e..273b7a5e 100644 --- a/linux-x86_64/crypto/sha/sha256-x86_64.S +++ b/linux-x86_64/crypto/sha/sha256-x86_64.S @@ -19,13 +19,13 @@ sha256_block_data_order: je .Lavx_shortcut testl $512,%r10d jnz .Lssse3_shortcut + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $64+32,%rsp leaq (%rsi,%rdx,4),%rdx @@ -33,7 +33,7 @@ sha256_block_data_order: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,64+24(%rsp) .Lprologue: 
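In the SHA-1/SHA-256/SHA-512 prologues above, the caller's %rsp is now captured in %rax before any registers are pushed (the SSSE3/AVX SHA-1 paths keep it in %r11 and move the constant-pool pointer to %r14 instead), and the epilogues reload %rbx/%rbp/%r12-%r15 at fixed negative offsets from that saved value rather than popping. Saving the entry-time stack pointer, as opposed to the post-push value the old code stashed in %r11, makes the saved slot equal to CFA-8, which is presumably what allows .cfi_escape expressions like the one in x86_64-mont5.S to describe the whole frame from a single saved word.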
movl 0(%rdi),%eax @@ -1698,13 +1698,13 @@ sha256_block_data_order: jb .Lloop movq 64+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue: .byte 0xf3,0xc3 .size sha256_block_data_order,.-sha256_block_data_order @@ -1755,13 +1755,13 @@ K256: .align 64 sha256_block_data_order_ssse3: .Lssse3_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -1769,7 +1769,7 @@ sha256_block_data_order_ssse3: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,64+24(%rsp) .Lprologue_ssse3: movl 0(%rdi),%eax @@ -2836,13 +2836,13 @@ sha256_block_data_order_ssse3: jb .Lloop_ssse3 movq 64+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue_ssse3: .byte 0xf3,0xc3 .size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 @@ -2850,13 +2850,13 @@ sha256_block_data_order_ssse3: .align 64 sha256_block_data_order_avx: .Lavx_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -2864,7 +2864,7 @@ sha256_block_data_order_avx: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,64+24(%rsp) .Lprologue_avx: vzeroupper @@ -3893,13 +3893,13 @@ sha256_block_data_order_avx: movq 64+24(%rsp),%rsi vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue_avx: .byte 0xf3,0xc3 .size sha256_block_data_order_avx,.-sha256_block_data_order_avx diff --git a/linux-x86_64/crypto/sha/sha512-x86_64.S b/linux-x86_64/crypto/sha/sha512-x86_64.S index d65743fd..f272b640 100644 --- a/linux-x86_64/crypto/sha/sha512-x86_64.S +++ b/linux-x86_64/crypto/sha/sha512-x86_64.S @@ -19,13 +19,13 @@ sha512_block_data_order: orl %r9d,%r10d cmpl $1342177792,%r10d je .Lavx_shortcut + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $128+32,%rsp leaq (%rsi,%rdx,8),%rdx @@ -33,7 +33,7 @@ sha512_block_data_order: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) + movq %rax,128+24(%rsp) .Lprologue: movq 0(%rdi),%rax @@ -1698,13 +1698,13 @@ sha512_block_data_order: jb .Lloop movq 128+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue: .byte 0xf3,0xc3 .size sha512_block_data_order,.-sha512_block_data_order @@ -1799,13 +1799,13 @@ K512: .align 64 sha512_block_data_order_xop: .Lxop_shortcut: + movq 
%rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx @@ -1813,7 +1813,7 @@ sha512_block_data_order_xop: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) + movq %rax,128+24(%rsp) .Lprologue_xop: vzeroupper @@ -2868,13 +2868,13 @@ sha512_block_data_order_xop: movq 128+24(%rsp),%rsi vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue_xop: .byte 0xf3,0xc3 .size sha512_block_data_order_xop,.-sha512_block_data_order_xop @@ -2882,13 +2882,13 @@ sha512_block_data_order_xop: .align 64 sha512_block_data_order_avx: .Lavx_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx @@ -2896,7 +2896,7 @@ sha512_block_data_order_avx: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) + movq %rax,128+24(%rsp) .Lprologue_avx: vzeroupper @@ -4015,13 +4015,13 @@ sha512_block_data_order_avx: movq 128+24(%rsp),%rsi vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue_avx: .byte 0xf3,0xc3 .size sha512_block_data_order_avx,.-sha512_block_data_order_avx diff --git a/mac-x86/crypto/bn/x86-mont.S b/mac-x86/crypto/bn/x86-mont.S index 234034b0..5c13ca4d 100644 --- a/mac-x86/crypto/bn/x86-mont.S +++ b/mac-x86/crypto/bn/x86-mont.S @@ -16,39 +16,54 @@ L_bn_mul_mont_begin: jl L000just_leave leal 20(%esp),%esi leal 24(%esp),%edx - movl %esp,%ebp addl $2,%edi negl %edi - leal -32(%esp,%edi,4),%esp + leal -32(%esp,%edi,4),%ebp negl %edi - movl %esp,%eax + movl %ebp,%eax subl %edx,%eax andl $2047,%eax - subl %eax,%esp - xorl %esp,%edx + subl %eax,%ebp + xorl %ebp,%edx andl $2048,%edx xorl $2048,%edx - subl %edx,%esp - andl $-64,%esp + subl %edx,%ebp + andl $-64,%ebp + movl %esp,%eax + subl %ebp,%eax + andl $-4096,%eax + movl %esp,%edx + leal (%ebp,%eax,1),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja L001page_walk + jmp L002page_walk_done +.align 4,0x90 +L001page_walk: + leal -4096(%esp),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja L001page_walk +L002page_walk_done: movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx - movl 12(%esi),%edx + movl 12(%esi),%ebp movl 16(%esi),%esi movl (%esi),%esi movl %eax,4(%esp) movl %ebx,8(%esp) movl %ecx,12(%esp) - movl %edx,16(%esp) + movl %ebp,16(%esp) movl %esi,20(%esp) leal -3(%edi),%ebx - movl %ebp,24(%esp) - call L001PIC_me_up -L001PIC_me_up: + movl %edx,24(%esp) + call L003PIC_me_up +L003PIC_me_up: popl %eax - movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L001PIC_me_up(%eax),%eax + movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L003PIC_me_up(%eax),%eax btl $26,(%eax) - jnc L002non_sse2 + jnc L004non_sse2 movl $-1,%eax movd %eax,%mm7 movl 8(%esp),%esi @@ -72,7 +87,7 @@ L001PIC_me_up: psrlq $32,%mm3 incl %ecx .align 4,0x90 -L0031st: +L0051st: pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 paddq %mm0,%mm2 @@ -87,7 +102,7 @@ L0031st: psrlq $32,%mm3 leal 1(%ecx),%ecx cmpl %ebx,%ecx - jl L0031st + jl L0051st pmuludq %mm4,%mm0 
pmuludq %mm5,%mm1 paddq %mm0,%mm2 @@ -101,7 +116,7 @@ L0031st: paddq %mm2,%mm3 movq %mm3,32(%esp,%ebx,4) incl %edx -L004outer: +L006outer: xorl %ecx,%ecx movd (%edi,%edx,4),%mm4 movd (%esi),%mm5 @@ -123,7 +138,7 @@ L004outer: paddq %mm6,%mm2 incl %ecx decl %ebx -L005inner: +L007inner: pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 paddq %mm0,%mm2 @@ -140,7 +155,7 @@ L005inner: paddq %mm6,%mm2 decl %ebx leal 1(%ecx),%ecx - jnz L005inner + jnz L007inner movl %ecx,%ebx pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 @@ -158,11 +173,11 @@ L005inner: movq %mm3,32(%esp,%ebx,4) leal 1(%edx),%edx cmpl %ebx,%edx - jle L004outer + jle L006outer emms - jmp L006common_tail + jmp L008common_tail .align 4,0x90 -L002non_sse2: +L004non_sse2: movl 8(%esp),%esi leal 1(%ebx),%ebp movl 12(%esp),%edi @@ -173,12 +188,12 @@ L002non_sse2: leal 4(%edi,%ebx,4),%eax orl %edx,%ebp movl (%edi),%edi - jz L007bn_sqr_mont + jz L009bn_sqr_mont movl %eax,28(%esp) movl (%esi),%eax xorl %edx,%edx .align 4,0x90 -L008mull: +L010mull: movl %edx,%ebp mull %edi addl %eax,%ebp @@ -187,7 +202,7 @@ L008mull: movl (%esi,%ecx,4),%eax cmpl %ebx,%ecx movl %ebp,28(%esp,%ecx,4) - jl L008mull + jl L010mull movl %edx,%ebp mull %edi movl 20(%esp),%edi @@ -205,9 +220,9 @@ L008mull: movl 4(%esi),%eax adcl $0,%edx incl %ecx - jmp L0092ndmadd + jmp L0112ndmadd .align 4,0x90 -L0101stmadd: +L0121stmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -218,7 +233,7 @@ L0101stmadd: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,28(%esp,%ecx,4) - jl L0101stmadd + jl L0121stmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%eax @@ -241,7 +256,7 @@ L0101stmadd: adcl $0,%edx movl $1,%ecx .align 4,0x90 -L0092ndmadd: +L0112ndmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -252,7 +267,7 @@ L0092ndmadd: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,24(%esp,%ecx,4) - jl L0092ndmadd + jl L0112ndmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%ebp @@ -268,16 +283,16 @@ L0092ndmadd: movl %edx,32(%esp,%ebx,4) cmpl 28(%esp),%ecx movl %eax,36(%esp,%ebx,4) - je L006common_tail + je L008common_tail movl (%ecx),%edi movl 8(%esp),%esi movl %ecx,12(%esp) xorl %ecx,%ecx xorl %edx,%edx movl (%esi),%eax - jmp L0101stmadd + jmp L0121stmadd .align 4,0x90 -L007bn_sqr_mont: +L009bn_sqr_mont: movl %ebx,(%esp) movl %ecx,12(%esp) movl %edi,%eax @@ -288,7 +303,7 @@ L007bn_sqr_mont: andl $1,%ebx incl %ecx .align 4,0x90 -L011sqr: +L013sqr: movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -300,7 +315,7 @@ L011sqr: cmpl (%esp),%ecx movl %eax,%ebx movl %ebp,28(%esp,%ecx,4) - jl L011sqr + jl L013sqr movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -324,7 +339,7 @@ L011sqr: movl 4(%esi),%eax movl $1,%ecx .align 4,0x90 -L0123rdmadd: +L0143rdmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -343,7 +358,7 @@ L0123rdmadd: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,24(%esp,%ecx,4) - jl L0123rdmadd + jl L0143rdmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%ebp @@ -359,7 +374,7 @@ L0123rdmadd: movl %edx,32(%esp,%ebx,4) cmpl %ebx,%ecx movl %eax,36(%esp,%ebx,4) - je L006common_tail + je L008common_tail movl 4(%esi,%ecx,4),%edi leal 1(%ecx),%ecx movl %edi,%eax @@ -371,12 +386,12 @@ L0123rdmadd: xorl %ebp,%ebp cmpl %ebx,%ecx leal 1(%ecx),%ecx - je L013sqrlast + je L015sqrlast movl %edx,%ebx shrl $1,%edx andl $1,%ebx .align 4,0x90 -L014sqradd: +L016sqradd: movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -392,13 +407,13 @@ L014sqradd: cmpl (%esp),%ecx movl %ebp,28(%esp,%ecx,4) movl %eax,%ebx - jle L014sqradd + jle L016sqradd movl %edx,%ebp addl %edx,%edx shrl $31,%ebp addl %ebx,%edx adcl $0,%ebp 
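The final-copy rewrite just below (L015sub/L018copy here, with matching changes in the x86-64 L$sub/L$copy and L$copy4x loops later in this diff) changes how the result of the conditional final subtraction is published. The old loop XOR-masked every word of the two candidate buffers; the new code performs the selection once, in constant time, on the pointers: AND the scratch pointer with the borrow mask, AND the destination pointer with its complement, OR the two together, then copy straight from whichever buffer won while wiping the scratch words. A C sketch of the pointer select (hypothetical names; the mask is 0 or all-ones, exactly as produced by the trailing sbb):

    #include <stdint.h>

    /* Constant-time choice between two equal-size buffers: returns tmp
     * when mask is all-ones (the subtraction borrowed, so the original
     * value is the answer) and dst when mask is zero, with no
     * data-dependent branch. Mirrors:
     * andl %eax,%esi / notl %eax / andl %eax,%ebp / orl %ebp,%esi */
    static const uint32_t *ct_pick(const uint32_t *tmp, const uint32_t *dst,
                                   uintptr_t mask) {
        return (const uint32_t *)(((uintptr_t)tmp & mask) |
                                  ((uintptr_t)dst & ~mask));
    }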
-L013sqrlast: +L015sqrlast: movl 20(%esp),%edi movl 16(%esp),%esi imull 32(%esp),%edi @@ -413,9 +428,9 @@ L013sqrlast: adcl $0,%edx movl $1,%ecx movl 4(%esi),%eax - jmp L0123rdmadd + jmp L0143rdmadd .align 4,0x90 -L006common_tail: +L008common_tail: movl 16(%esp),%ebp movl 4(%esp),%edi leal 32(%esp),%esi @@ -423,25 +438,26 @@ L006common_tail: movl %ebx,%ecx xorl %edx,%edx .align 4,0x90 -L015sub: +L017sub: sbbl (%ebp,%edx,4),%eax movl %eax,(%edi,%edx,4) decl %ecx movl 4(%esi,%edx,4),%eax leal 1(%edx),%edx - jge L015sub + jge L017sub sbbl $0,%eax + andl %eax,%esi + notl %eax + movl %edi,%ebp + andl %eax,%ebp + orl %ebp,%esi .align 4,0x90 -L016copy: - movl (%esi,%ebx,4),%edx - movl (%edi,%ebx,4),%ebp - xorl %ebp,%edx - andl %eax,%edx - xorl %ebp,%edx - movl %ecx,(%esi,%ebx,4) - movl %edx,(%edi,%ebx,4) +L018copy: + movl (%esi,%ebx,4),%eax + movl %eax,(%edi,%ebx,4) + movl %ecx,32(%esp,%ebx,4) decl %ebx - jge L016copy + jge L018copy movl 24(%esp),%esp movl $1,%eax L000just_leave: diff --git a/mac-x86_64/crypto/aes/aes-x86_64.S b/mac-x86_64/crypto/aes/aes-x86_64.S index b5d188a0..52df2ae3 100644 --- a/mac-x86_64/crypto/aes/aes-x86_64.S +++ b/mac-x86_64/crypto/aes/aes-x86_64.S @@ -332,6 +332,7 @@ L$enc_compact_done: .private_extern _asm_AES_encrypt _asm_AES_encrypt: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 @@ -340,7 +341,6 @@ _asm_AES_encrypt: pushq %r15 - movq %rsp,%r10 leaq -63(%rdx),%rcx andq $-64,%rsp subq %rsp,%rcx @@ -350,7 +350,7 @@ _asm_AES_encrypt: subq $32,%rsp movq %rsi,16(%rsp) - movq %r10,24(%rsp) + movq %rax,24(%rsp) L$enc_prologue: movq %rdx,%r15 @@ -382,13 +382,13 @@ L$enc_prologue: movl %ecx,8(%r9) movl %edx,12(%r9) - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$enc_epilogue: .byte 0xf3,0xc3 @@ -778,6 +778,7 @@ L$dec_compact_done: .private_extern _asm_AES_decrypt _asm_AES_decrypt: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 @@ -786,7 +787,6 @@ _asm_AES_decrypt: pushq %r15 - movq %rsp,%r10 leaq -63(%rdx),%rcx andq $-64,%rsp subq %rsp,%rcx @@ -796,7 +796,7 @@ _asm_AES_decrypt: subq $32,%rsp movq %rsi,16(%rsp) - movq %r10,24(%rsp) + movq %rax,24(%rsp) L$dec_prologue: movq %rdx,%r15 @@ -830,13 +830,13 @@ L$dec_prologue: movl %ecx,8(%r9) movl %edx,12(%r9) - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$dec_epilogue: .byte 0xf3,0xc3 @@ -1312,10 +1312,9 @@ L$cbc_prologue: movl %r9d,%r9d leaq L$AES_Te(%rip),%r14 + leaq L$AES_Td(%rip),%r10 cmpq $0,%r9 - jne L$cbc_picked_te - leaq L$AES_Td(%rip),%r14 -L$cbc_picked_te: + cmoveq %r10,%r14 movl _OPENSSL_ia32cap_P(%rip),%r10d cmpq $512,%rdx diff --git a/mac-x86_64/crypto/aes/aesni-x86_64.S b/mac-x86_64/crypto/aes/aesni-x86_64.S index 3d98fa12..4e3b7d06 100644 --- a/mac-x86_64/crypto/aes/aesni-x86_64.S +++ b/mac-x86_64/crypto/aes/aesni-x86_64.S @@ -1031,11 +1031,10 @@ L$oop_enc1_7: .p2align 4 L$ctr32_bulk: - leaq (%rsp),%rax + leaq (%rsp),%r11 pushq %rbp subq $128,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp @@ -1044,7 +1043,7 @@ L$ctr32_bulk: movdqu (%rcx),%xmm0 movl 12(%r8),%r8d pxor %xmm0,%xmm2 - movl 12(%rcx),%r11d + movl 12(%rcx),%ebp movdqa 
%xmm2,0(%rsp) bswapl %r8d movdqa %xmm2,%xmm3 @@ -1060,8 +1059,8 @@ L$ctr32_bulk: leaq 2(%r8),%rdx bswapl %eax bswapl %edx - xorl %r11d,%eax - xorl %r11d,%edx + xorl %ebp,%eax + xorl %ebp,%edx .byte 102,15,58,34,216,3 leaq 3(%r8),%rax movdqa %xmm3,16(%rsp) @@ -1070,25 +1069,25 @@ L$ctr32_bulk: movq %r10,%rdx leaq 4(%r8),%r10 movdqa %xmm4,32(%rsp) - xorl %r11d,%eax + xorl %ebp,%eax bswapl %r10d .byte 102,15,58,34,232,3 - xorl %r11d,%r10d + xorl %ebp,%r10d movdqa %xmm5,48(%rsp) leaq 5(%r8),%r9 movl %r10d,64+12(%rsp) bswapl %r9d leaq 6(%r8),%r10 movl 240(%rcx),%eax - xorl %r11d,%r9d + xorl %ebp,%r9d bswapl %r10d movl %r9d,80+12(%rsp) - xorl %r11d,%r10d + xorl %ebp,%r10d leaq 7(%r8),%r9 movl %r10d,96+12(%rsp) bswapl %r9d movl _OPENSSL_ia32cap_P+4(%rip),%r10d - xorl %r11d,%r9d + xorl %ebp,%r9d andl $71303168,%r10d movl %r9d,112+12(%rsp) @@ -1112,7 +1111,7 @@ L$ctr32_bulk: L$ctr32_6x: shll $4,%eax movl $48,%r10d - bswapl %r11d + bswapl %ebp leaq 32(%rcx,%rax,1),%rcx subq %rax,%r10 jmp L$ctr32_loop6 @@ -1123,32 +1122,32 @@ L$ctr32_loop6: movups -48(%rcx,%r10,1),%xmm0 .byte 102,15,56,220,209 movl %r8d,%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,217 .byte 0x0f,0x38,0xf1,0x44,0x24,12 leal 1(%r8),%eax .byte 102,15,56,220,225 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,28 .byte 102,15,56,220,233 leal 2(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,241 .byte 0x0f,0x38,0xf1,0x44,0x24,44 leal 3(%r8),%eax .byte 102,15,56,220,249 movups -32(%rcx,%r10,1),%xmm1 - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,208 .byte 0x0f,0x38,0xf1,0x44,0x24,60 leal 4(%r8),%eax .byte 102,15,56,220,216 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,76 .byte 102,15,56,220,224 leal 5(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,232 .byte 0x0f,0x38,0xf1,0x44,0x24,92 movq %r10,%rax @@ -1209,7 +1208,7 @@ L$ctr32_loop8: bswapl %r9d movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 - xorl %r11d,%r9d + xorl %ebp,%r9d nop .byte 102,15,56,220,233 movl %r9d,0+12(%rsp) @@ -1222,7 +1221,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1236,7 +1235,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1250,7 +1249,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1264,7 +1263,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1278,7 +1277,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1292,7 +1291,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1307,7 +1306,7 @@ L$ctr32_loop8: .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 - xorl %r11d,%r9d + xorl %ebp,%r9d movdqu 0(%rdi),%xmm10 .byte 102,15,56,220,232 movl %r9d,112+12(%rsp) @@ -1542,7 +1541,7 @@ L$ctr32_loop3: L$ctr32_done: xorps %xmm0,%xmm0 - xorl %r11d,%r11d + xorl %ebp,%ebp 
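Across the AES-NI CTR and XTS paths above, %r11 is repurposed to hold the caller's stack pointer for the whole function (the epilogues become movq -8(%r11),%rbp / leaq (%r11),%rsp), so the values that previously lived in %r11, namely the key byte-swap word in CTR mode and the key-schedule base pointer in XTS and CBC, migrate to %ebp/%rbp, which is saved on entry. The long runs of %r11d-to-%ebp renames in these hunks all follow from that single reassignment, and the setnc/shlq pair later in the CBC bulk-decrypt loop becomes an equivalent branch-free adcq $0,%rbp / andq $128,%rbp that selects the same 0-or-128 input offset.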
pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 @@ -1566,8 +1565,8 @@ L$ctr32_done: pxor %xmm14,%xmm14 movaps %xmm0,112(%rsp) pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + leaq (%r11),%rsp L$ctr32_epilogue: .byte 0xf3,0xc3 @@ -1576,11 +1575,10 @@ L$ctr32_epilogue: .p2align 4 _aesni_xts_encrypt: - leaq (%rsp),%rax + leaq (%rsp),%r11 pushq %rbp subq $112,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -1596,7 +1594,7 @@ L$oop_enc1_8: jnz L$oop_enc1_8 .byte 102,15,56,221,209 movups (%rcx),%xmm0 - movq %rcx,%r11 + movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -1652,9 +1650,9 @@ L$oop_enc1_8: jc L$xts_enc_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq L$xts_magic(%rip),%r8 jmp L$xts_enc_grandloop @@ -1679,7 +1677,7 @@ L$xts_enc_grandloop: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 102,15,56,220,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -1688,7 +1686,7 @@ L$xts_enc_grandloop: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,220,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 102,15,56,220,208 @@ -1703,7 +1701,7 @@ L$xts_enc_grandloop: movdqa %xmm14,64(%rsp) .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) pshufd $0x5f,%xmm15,%xmm9 jmp L$xts_enc_loop6 @@ -1735,7 +1733,7 @@ L$xts_enc_loop6: psrad $31,%xmm14 .byte 102,15,56,220,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 @@ -1803,10 +1801,10 @@ L$xts_enc_loop6: .byte 102,15,56,220,225 .byte 102,15,56,220,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,220,241 .byte 102,15,56,220,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,221,84,36,0 @@ -1833,7 +1831,7 @@ L$xts_enc_loop6: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax L$xts_enc_short: @@ -1989,7 +1987,7 @@ L$xts_enc_steal: jnz L$xts_enc_steal subq %r9,%rsi - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups -16(%rsi),%xmm2 @@ -2032,8 +2030,8 @@ L$xts_enc_ret: movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + leaq (%r11),%rsp L$xts_enc_epilogue: .byte 0xf3,0xc3 @@ -2042,11 +2040,10 @@ L$xts_enc_epilogue: .p2align 4 _aesni_xts_decrypt: - leaq (%rsp),%rax + leaq (%rsp),%r11 pushq %rbp subq $112,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -2068,7 +2065,7 @@ L$oop_enc1_11: subq %rax,%rdx movups (%rcx),%xmm0 - movq %rcx,%r11 + movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -2124,9 +2121,9 @@ L$oop_enc1_11: jc L$xts_dec_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq L$xts_magic(%rip),%r8 jmp L$xts_dec_grandloop @@ -2151,7 +2148,7 @@ L$xts_dec_grandloop: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 102,15,56,222,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -2160,7 +2157,7 @@ L$xts_dec_grandloop: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,222,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 
102,15,56,222,208 @@ -2175,7 +2172,7 @@ L$xts_dec_grandloop: movdqa %xmm14,64(%rsp) .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) pshufd $0x5f,%xmm15,%xmm9 jmp L$xts_dec_loop6 @@ -2207,7 +2204,7 @@ L$xts_dec_loop6: psrad $31,%xmm14 .byte 102,15,56,222,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 @@ -2275,10 +2272,10 @@ L$xts_dec_loop6: .byte 102,15,56,222,225 .byte 102,15,56,222,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,222,241 .byte 102,15,56,222,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,223,84,36,0 @@ -2305,7 +2302,7 @@ L$xts_dec_loop6: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax L$xts_dec_short: @@ -2462,7 +2459,7 @@ L$xts_dec_done: jz L$xts_dec_ret L$xts_dec_done2: movq %r9,%rdx - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups (%rdi),%xmm2 @@ -2492,7 +2489,7 @@ L$xts_dec_steal: jnz L$xts_dec_steal subq %r9,%rsi - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups (%rsi),%xmm2 @@ -2535,11 +2532,827 @@ L$xts_dec_ret: movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + leaq (%r11),%rsp L$xts_dec_epilogue: .byte 0xf3,0xc3 +.globl _aesni_ocb_encrypt +.private_extern _aesni_ocb_encrypt + +.p2align 5 +_aesni_ocb_encrypt: + leaq (%rsp),%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + movq 8(%rax),%rbx + movq 8+8(%rax),%rbp + + movl 240(%rcx),%r10d + movq %rcx,%r11 + shll $4,%r10d + movups (%rcx),%xmm9 + movups 16(%rcx,%r10,1),%xmm1 + + movdqu (%r9),%xmm15 + pxor %xmm1,%xmm9 + pxor %xmm1,%xmm15 + + movl $16+32,%eax + leaq 32(%r11,%r10,1),%rcx + movups 16(%r11),%xmm1 + subq %r10,%rax + movq %rax,%r10 + + movdqu (%rbx),%xmm10 + movdqu (%rbp),%xmm8 + + testq $1,%r8 + jnz L$ocb_enc_odd + + bsfq %r8,%r12 + addq $1,%r8 + shlq $4,%r12 + movdqu (%rbx,%r12,1),%xmm7 + movdqu (%rdi),%xmm2 + leaq 16(%rdi),%rdi + + call __ocb_encrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,(%rsi) + leaq 16(%rsi),%rsi + subq $1,%rdx + jz L$ocb_enc_done + +L$ocb_enc_odd: + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + leaq 6(%r8),%r8 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + shlq $4,%r12 + shlq $4,%r13 + shlq $4,%r14 + + subq $6,%rdx + jc L$ocb_enc_short + jmp L$ocb_enc_grandloop + +.p2align 5 +L$ocb_enc_grandloop: + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + movdqu 48(%rdi),%xmm5 + movdqu 64(%rdi),%xmm6 + movdqu 80(%rdi),%xmm7 + leaq 96(%rdi),%rdi + + call __ocb_encrypt6 + + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + movups %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + subq $6,%rdx + jnc L$ocb_enc_grandloop + +L$ocb_enc_short: + addq $6,%rdx + jz L$ocb_enc_done + + movdqu 0(%rdi),%xmm2 + cmpq $2,%rdx + jb L$ocb_enc_one + movdqu 16(%rdi),%xmm3 + je L$ocb_enc_two + + movdqu 32(%rdi),%xmm4 + cmpq $4,%rdx + jb L$ocb_enc_three + movdqu 48(%rdi),%xmm5 + je L$ocb_enc_four + + movdqu 64(%rdi),%xmm6 + pxor %xmm7,%xmm7 + + call __ocb_encrypt6 + + movdqa %xmm14,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + + jmp L$ocb_enc_done + +.p2align 4 +L$ocb_enc_one: + movdqa %xmm10,%xmm7 + + call __ocb_encrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,0(%rsi) + jmp 
L$ocb_enc_done + +.p2align 4 +L$ocb_enc_two: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + call __ocb_encrypt4 + + movdqa %xmm11,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + + jmp L$ocb_enc_done + +.p2align 4 +L$ocb_enc_three: + pxor %xmm5,%xmm5 + + call __ocb_encrypt4 + + movdqa %xmm12,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + + jmp L$ocb_enc_done + +.p2align 4 +L$ocb_enc_four: + call __ocb_encrypt4 + + movdqa %xmm13,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + +L$ocb_enc_done: + pxor %xmm0,%xmm15 + movdqu %xmm8,(%rbp) + movdqu %xmm15,(%r9) + + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + leaq 40(%rsp),%rax + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +L$ocb_enc_epilogue: + .byte 0xf3,0xc3 + + + +.p2align 5 +__ocb_encrypt6: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + movdqa %xmm10,%xmm14 + pxor %xmm15,%xmm10 + movdqu (%rbx,%r14,1),%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm2,%xmm8 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm3,%xmm8 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm4,%xmm8 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm14 + pxor %xmm5,%xmm8 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm15 + pxor %xmm6,%xmm8 + pxor %xmm14,%xmm6 + pxor %xmm7,%xmm8 + pxor %xmm15,%xmm7 + movups 32(%r11),%xmm0 + + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + addq $6,%r8 + pxor %xmm9,%xmm10 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 +.byte 102,15,56,220,241 + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm14 +.byte 102,15,56,220,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm15 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups 64(%r11),%xmm0 + shlq $4,%r12 + shlq $4,%r13 + jmp L$ocb_enc_loop6 + +.p2align 5 +L$ocb_enc_loop6: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_enc_loop6 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups 16(%r11),%xmm1 + shlq $4,%r14 + +.byte 102,65,15,56,221,210 + movdqu (%rbx),%xmm10 + movq %r10,%rax +.byte 102,65,15,56,221,219 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 +.byte 102,65,15,56,221,246 +.byte 102,65,15,56,221,255 + .byte 0xf3,0xc3 + + + +.p2align 5 +__ocb_encrypt4: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + pxor %xmm15,%xmm10 + pxor %xmm10,%xmm11 + pxor %xmm2,%xmm8 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm3,%xmm8 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm4,%xmm8 + 
pxor %xmm12,%xmm4 + pxor %xmm5,%xmm8 + pxor %xmm13,%xmm5 + movups 32(%r11),%xmm0 + + pxor %xmm9,%xmm10 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 + pxor %xmm9,%xmm13 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 48(%r11),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups 64(%r11),%xmm0 + jmp L$ocb_enc_loop4 + +.p2align 5 +L$ocb_enc_loop4: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_enc_loop4 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,65,15,56,221,210 +.byte 102,65,15,56,221,219 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 + .byte 0xf3,0xc3 + + + +.p2align 5 +__ocb_encrypt1: + pxor %xmm15,%xmm7 + pxor %xmm9,%xmm7 + pxor %xmm2,%xmm8 + pxor %xmm7,%xmm2 + movups 32(%r11),%xmm0 + +.byte 102,15,56,220,209 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm7 + +.byte 102,15,56,220,208 + movups 64(%r11),%xmm0 + jmp L$ocb_enc_loop1 + +.p2align 5 +L$ocb_enc_loop1: +.byte 102,15,56,220,209 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_enc_loop1 + +.byte 102,15,56,220,209 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,15,56,221,215 + .byte 0xf3,0xc3 + + +.globl _aesni_ocb_decrypt +.private_extern _aesni_ocb_decrypt + +.p2align 5 +_aesni_ocb_decrypt: + leaq (%rsp),%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + movq 8(%rax),%rbx + movq 8+8(%rax),%rbp + + movl 240(%rcx),%r10d + movq %rcx,%r11 + shll $4,%r10d + movups (%rcx),%xmm9 + movups 16(%rcx,%r10,1),%xmm1 + + movdqu (%r9),%xmm15 + pxor %xmm1,%xmm9 + pxor %xmm1,%xmm15 + + movl $16+32,%eax + leaq 32(%r11,%r10,1),%rcx + movups 16(%r11),%xmm1 + subq %r10,%rax + movq %rax,%r10 + + movdqu (%rbx),%xmm10 + movdqu (%rbp),%xmm8 + + testq $1,%r8 + jnz L$ocb_dec_odd + + bsfq %r8,%r12 + addq $1,%r8 + shlq $4,%r12 + movdqu (%rbx,%r12,1),%xmm7 + movdqu (%rdi),%xmm2 + leaq 16(%rdi),%rdi + + call __ocb_decrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm8 + leaq 16(%rsi),%rsi + subq $1,%rdx + jz L$ocb_dec_done + +L$ocb_dec_odd: + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + leaq 6(%r8),%r8 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + shlq $4,%r12 + shlq $4,%r13 + shlq $4,%r14 + + subq $6,%rdx + jc L$ocb_dec_short + jmp L$ocb_dec_grandloop + +.p2align 5 +L$ocb_dec_grandloop: + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + movdqu 48(%rdi),%xmm5 + movdqu 64(%rdi),%xmm6 + movdqu 80(%rdi),%xmm7 + leaq 96(%rdi),%rdi + + call __ocb_decrypt6 + + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm8 + movups %xmm7,80(%rsi) + pxor %xmm7,%xmm8 + leaq 96(%rsi),%rsi + subq $6,%rdx + jnc L$ocb_dec_grandloop + +L$ocb_dec_short: + addq $6,%rdx + jz L$ocb_dec_done + + movdqu 0(%rdi),%xmm2 + cmpq $2,%rdx + jb L$ocb_dec_one + movdqu 16(%rdi),%xmm3 + je L$ocb_dec_two + + movdqu 32(%rdi),%xmm4 + cmpq $4,%rdx + jb L$ocb_dec_three + movdqu 48(%rdi),%xmm5 + je L$ocb_dec_four + + movdqu 
64(%rdi),%xmm6 + pxor %xmm7,%xmm7 + + call __ocb_decrypt6 + + movdqa %xmm14,%xmm15 + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm8 + + jmp L$ocb_dec_done + +.p2align 4 +L$ocb_dec_one: + movdqa %xmm10,%xmm7 + + call __ocb_decrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + jmp L$ocb_dec_done + +.p2align 4 +L$ocb_dec_two: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + call __ocb_decrypt4 + + movdqa %xmm11,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + movups %xmm3,16(%rsi) + xorps %xmm3,%xmm8 + + jmp L$ocb_dec_done + +.p2align 4 +L$ocb_dec_three: + pxor %xmm5,%xmm5 + + call __ocb_decrypt4 + + movdqa %xmm12,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + movups %xmm3,16(%rsi) + xorps %xmm3,%xmm8 + movups %xmm4,32(%rsi) + xorps %xmm4,%xmm8 + + jmp L$ocb_dec_done + +.p2align 4 +L$ocb_dec_four: + call __ocb_decrypt4 + + movdqa %xmm13,%xmm15 + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + +L$ocb_dec_done: + pxor %xmm0,%xmm15 + movdqu %xmm8,(%rbp) + movdqu %xmm15,(%r9) + + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + leaq 40(%rsp),%rax + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +L$ocb_dec_epilogue: + .byte 0xf3,0xc3 + + + +.p2align 5 +__ocb_decrypt6: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + movdqa %xmm10,%xmm14 + pxor %xmm15,%xmm10 + movdqu (%rbx,%r14,1),%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm14 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm15 + pxor %xmm14,%xmm6 + pxor %xmm15,%xmm7 + movups 32(%r11),%xmm0 + + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + addq $6,%r8 + pxor %xmm9,%xmm10 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 +.byte 102,15,56,222,241 + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm14 +.byte 102,15,56,222,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm15 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups 64(%r11),%xmm0 + shlq $4,%r12 + shlq $4,%r13 + jmp L$ocb_dec_loop6 + +.p2align 5 +L$ocb_dec_loop6: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_dec_loop6 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups 16(%r11),%xmm1 + shlq $4,%r14 + +.byte 
102,65,15,56,223,210 + movdqu (%rbx),%xmm10 + movq %r10,%rax +.byte 102,65,15,56,223,219 +.byte 102,65,15,56,223,228 +.byte 102,65,15,56,223,237 +.byte 102,65,15,56,223,246 +.byte 102,65,15,56,223,255 + .byte 0xf3,0xc3 + + + +.p2align 5 +__ocb_decrypt4: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + pxor %xmm15,%xmm10 + pxor %xmm10,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm5 + movups 32(%r11),%xmm0 + + pxor %xmm9,%xmm10 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 + pxor %xmm9,%xmm13 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 48(%r11),%xmm1 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups 64(%r11),%xmm0 + jmp L$ocb_dec_loop4 + +.p2align 5 +L$ocb_dec_loop4: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_dec_loop4 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,65,15,56,223,210 +.byte 102,65,15,56,223,219 +.byte 102,65,15,56,223,228 +.byte 102,65,15,56,223,237 + .byte 0xf3,0xc3 + + + +.p2align 5 +__ocb_decrypt1: + pxor %xmm15,%xmm7 + pxor %xmm9,%xmm7 + pxor %xmm7,%xmm2 + movups 32(%r11),%xmm0 + +.byte 102,15,56,222,209 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm7 + +.byte 102,15,56,222,208 + movups 64(%r11),%xmm0 + jmp L$ocb_dec_loop1 + +.p2align 5 +L$ocb_dec_loop1: +.byte 102,15,56,222,209 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_dec_loop1 + +.byte 102,15,56,222,209 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,15,56,223,215 + .byte 0xf3,0xc3 + .globl _aesni_cbc_encrypt .private_extern _aesni_cbc_encrypt @@ -2637,11 +3450,11 @@ L$oop_dec1_16: jmp L$cbc_ret .p2align 4 L$cbc_decrypt_bulk: - leaq (%rsp),%rax + leaq (%rsp),%r11 pushq %rbp subq $16,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp + movq %rcx,%rbp movups (%r8),%xmm10 movl %r10d,%eax cmpq $0x50,%rdx @@ -2681,7 +3494,7 @@ L$cbc_dec_loop8_enter: pxor %xmm0,%xmm3 movups 16-112(%rcx),%xmm1 pxor %xmm0,%xmm4 - xorq %r11,%r11 + movq $-1,%rbp cmpq $0x70,%rdx pxor %xmm0,%xmm5 pxor %xmm0,%xmm6 @@ -2697,10 +3510,10 @@ L$cbc_dec_loop8_enter: .byte 102,15,56,222,241 .byte 102,15,56,222,249 .byte 102,68,15,56,222,193 - setnc %r11b - shlq $7,%r11 + adcq $0,%rbp + andq $128,%rbp .byte 102,68,15,56,222,201 - addq %rdi,%r11 + addq %rdi,%rbp movups 48-112(%rcx),%xmm1 .byte 102,15,56,222,208 .byte 102,15,56,222,216 @@ -2838,18 +3651,18 @@ L$cbc_dec_done: movdqu 112(%rdi),%xmm0 .byte 102,65,15,56,223,228 leaq 128(%rdi),%rdi - movdqu 0(%r11),%xmm11 + movdqu 0(%rbp),%xmm11 .byte 102,65,15,56,223,237 .byte 102,65,15,56,223,246 - movdqu 16(%r11),%xmm12 - movdqu 32(%r11),%xmm13 + movdqu 16(%rbp),%xmm12 + movdqu 32(%rbp),%xmm13 .byte 102,65,15,56,223,255 .byte 102,68,15,56,223,193 - movdqu 48(%r11),%xmm14 - movdqu 64(%r11),%xmm15 + movdqu 48(%rbp),%xmm14 + movdqu 64(%rbp),%xmm15 .byte 102,69,15,56,223,202 movdqa %xmm0,%xmm10 - movdqu 80(%r11),%xmm1 + movdqu 80(%rbp),%xmm1 movups -112(%rcx),%xmm0 movups %xmm2,(%rsi) @@ -2968,7 +3781,7 @@ L$cbc_dec_loop6_enter: pxor %xmm13,%xmm5 
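The new _aesni_ocb_encrypt and _aesni_ocb_decrypt bodies above derive each block's offset from the running block counter in %r8: for block i the code takes bsfq of the incremented counter, which yields the number of trailing zero bits ntz(i), scales it by 16, and uses it to index the table of precomputed L values at %rbx, folding the fetched entry into the offset chain. That is the OCB offset recursion Offset_i = Offset_{i-1} XOR L[ntz(i)]. A sketch of one step in C (hypothetical helper names; assumes the L table is an array of 16-byte entries, matching the shlq $4 scaling; __builtin_ctzll is the GCC/Clang builtin):

    #include <stdint.h>

    /* One step of the OCB offset chain: offset ^= L[ntz(i)], where i is
     * the 1-based block index (i != 0, so ctzll is well defined). This
     * mirrors leaq 1(%r8),%r12 / bsfq %r12,%r12 / shlq $4,%r12. */
    static void ocb_next_offset(uint8_t offset[16],
                                const uint8_t (*L)[16], uint64_t i) {
        unsigned ntz = (unsigned)__builtin_ctzll(i);   /* trailing zeros */
        for (int b = 0; b < 16; b++)
            offset[b] ^= L[ntz][b];
    }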
movdqu %xmm4,32(%rsi) pxor %xmm14,%xmm6 - movq %r11,%rcx + movq %rbp,%rcx movdqu %xmm5,48(%rsi) pxor %xmm15,%xmm7 movl %r10d,%eax @@ -3121,8 +3934,8 @@ L$cbc_dec_tail_partial: L$cbc_dec_ret: xorps %xmm0,%xmm0 pxor %xmm1,%xmm1 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + leaq (%r11),%rsp L$cbc_ret: .byte 0xf3,0xc3 diff --git a/mac-x86_64/crypto/aes/bsaes-x86_64.S b/mac-x86_64/crypto/aes/bsaes-x86_64.S index ad802e3d..6e679c18 100644 --- a/mac-x86_64/crypto/aes/bsaes-x86_64.S +++ b/mac-x86_64/crypto/aes/bsaes-x86_64.S @@ -1302,15 +1302,14 @@ L$cbc_dec_bzero: cmpq %rax,%rbp ja L$cbc_dec_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbx + movq -8(%rax),%rbp + leaq (%rax),%rsp L$cbc_dec_epilogue: .byte 0xf3,0xc3 @@ -1503,15 +1502,14 @@ L$ctr_enc_bzero: cmpq %rax,%rbp ja L$ctr_enc_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbx + movq -8(%rax),%rbp + leaq (%rax),%rsp L$ctr_enc_epilogue: .byte 0xf3,0xc3 @@ -1955,15 +1953,14 @@ L$xts_enc_bzero: cmpq %rax,%rbp ja L$xts_enc_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbx + movq -8(%rax),%rbp + leaq (%rax),%rsp L$xts_enc_epilogue: .byte 0xf3,0xc3 @@ -2434,15 +2431,14 @@ L$xts_dec_bzero: cmpq %rax,%rbp ja L$xts_dec_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbx + movq -8(%rax),%rbp + leaq (%rax),%rsp L$xts_dec_epilogue: .byte 0xf3,0xc3 diff --git a/mac-x86_64/crypto/bn/x86_64-mont.S b/mac-x86_64/crypto/bn/x86_64-mont.S index 51e5d199..41a09267 100644 --- a/mac-x86_64/crypto/bn/x86_64-mont.S +++ b/mac-x86_64/crypto/bn/x86_64-mont.S @@ -8,6 +8,10 @@ .p2align 4 _bn_mul_mont: + + movl %r9d,%r9d + movq %rsp,%rax + testl $3,%r9d jnz L$mul_enter cmpl $8,%r9d @@ -21,20 +25,50 @@ _bn_mul_mont: .p2align 4 L$mul_enter: pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movl %r9d,%r9d - leaq 2(%r9),%r10 + + negq %r9 movq %rsp,%r11 - negq %r10 - leaq (%rsp,%r10,8),%rsp - andq $-1024,%rsp + leaq -16(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + + + + + + + + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul_page_walk + jmp L$mul_page_walk_done + +.p2align 4 +L$mul_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul_page_walk +L$mul_page_walk_done: + + movq %rax,8(%rsp,%r9,8) - movq %r11,8(%rsp,%r9,8) L$mul_body: movq %rdx,%r12 movq (%r8),%r8 @@ -186,51 +220,86 @@ L$sub: sbbq (%rcx,%r14,8),%rax sbbq $0,%rax xorq %r14,%r14 + andq %rax,%rsi + notq %rax + movq 
%rdi,%rcx + andq %rax,%rcx movq %r9,%r15 + orq %rcx,%rsi .p2align 4 L$copy: - movq (%rsp,%r14,8),%rsi - movq (%rdi,%r14,8),%rcx - xorq %rcx,%rsi - andq %rax,%rsi - xorq %rcx,%rsi + movq (%rsi,%r14,8),%rax movq %r14,(%rsp,%r14,8) - movq %rsi,(%rdi,%r14,8) + movq %rax,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz L$copy movq 8(%rsp,%r9,8),%rsi + movq $1,%rax - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$mul_epilogue: .byte 0xf3,0xc3 + .p2align 4 bn_mul4x_mont: + + movl %r9d,%r9d + movq %rsp,%rax + L$mul4x_enter: pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movl %r9d,%r9d - leaq 4(%r9),%r10 + + negq %r9 movq %rsp,%r11 - negq %r10 - leaq (%rsp,%r10,8),%rsp - andq $-1024,%rsp + leaq -32(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul4x_page_walk + jmp L$mul4x_page_walk_done + +L$mul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul4x_page_walk +L$mul4x_page_walk_done: + + movq %rax,8(%rsp,%r9,8) - movq %r11,8(%rsp,%r9,8) L$mul4x_body: movq %rdi,16(%rsp,%r9,8) movq %rdx,%r12 @@ -530,9 +599,11 @@ L$inner4x: cmpq %r9,%r14 jb L$outer4x movq 16(%rsp,%r9,8),%rdi + leaq -4(%r9),%r15 movq 0(%rsp),%rax + pxor %xmm0,%xmm0 movq 8(%rsp),%rdx - shrq $2,%r9 + shrq $2,%r15 leaq (%rsp),%rsi xorq %r14,%r14 @@ -540,7 +611,6 @@ L$inner4x: movq 16(%rsi),%rbx movq 24(%rsi),%rbp sbbq 8(%rcx),%rdx - leaq -1(%r9),%r15 jmp L$sub4x .p2align 4 L$sub4x: @@ -568,62 +638,79 @@ L$sub4x: movq %rbx,16(%rdi,%r14,8) sbbq $0,%rax - movq %rax,%xmm0 - punpcklqdq %xmm0,%xmm0 movq %rbp,24(%rdi,%r14,8) xorq %r14,%r14 - - movq %r9,%r15 - pxor %xmm5,%xmm5 + andq %rax,%rsi + notq %rax + movq %rdi,%rcx + andq %rax,%rcx + leaq -4(%r9),%r15 + orq %rcx,%rsi + shrq $2,%r15 + + movdqu (%rsi),%xmm1 + movdqa %xmm0,(%rsp) + movdqu %xmm1,(%rdi) jmp L$copy4x .p2align 4 L$copy4x: - movdqu (%rsp,%r14,1),%xmm2 - movdqu 16(%rsp,%r14,1),%xmm4 - movdqu (%rdi,%r14,1),%xmm1 - movdqu 16(%rdi,%r14,1),%xmm3 - pxor %xmm1,%xmm2 - pxor %xmm3,%xmm4 - pand %xmm0,%xmm2 - pand %xmm0,%xmm4 - pxor %xmm1,%xmm2 - pxor %xmm3,%xmm4 - movdqu %xmm2,(%rdi,%r14,1) - movdqu %xmm4,16(%rdi,%r14,1) - movdqa %xmm5,(%rsp,%r14,1) - movdqa %xmm5,16(%rsp,%r14,1) - + movdqu 16(%rsi,%r14,1),%xmm2 + movdqu 32(%rsi,%r14,1),%xmm1 + movdqa %xmm0,16(%rsp,%r14,1) + movdqu %xmm2,16(%rdi,%r14,1) + movdqa %xmm0,32(%rsp,%r14,1) + movdqu %xmm1,32(%rdi,%r14,1) leaq 32(%r14),%r14 decq %r15 jnz L$copy4x - shlq $2,%r9 + movdqu 16(%rsi,%r14,1),%xmm2 + movdqa %xmm0,16(%rsp,%r14,1) + movdqu %xmm2,16(%rdi,%r14,1) movq 8(%rsp,%r9,8),%rsi + movq $1,%rax - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$mul4x_epilogue: .byte 0xf3,0xc3 + .p2align 5 bn_sqr8x_mont: -L$sqr8x_enter: + movq %rsp,%rax + +L$sqr8x_enter: pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +L$sqr8x_prologue: + movl %r9d,%r10d shll $3,%r9d shlq $3+2,%r10 @@ -635,30 +722,49 @@ L$sqr8x_enter: leaq -64(%rsp,%r9,2),%r11 + 
movq %rsp,%rbp movq (%r8),%r8 subq %rsi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$sqr8x_sp_alt - subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + subq %r11,%rbp + leaq -64(%rbp,%r9,2),%rbp jmp L$sqr8x_sp_done .p2align 5 L$sqr8x_sp_alt: leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq -64(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 - subq %r11,%rsp + subq %r11,%rbp L$sqr8x_sp_done: - andq $-64,%rsp + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$sqr8x_page_walk + jmp L$sqr8x_page_walk_done + +.p2align 4 +L$sqr8x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$sqr8x_page_walk +L$sqr8x_page_walk_done: + movq %r9,%r10 negq %r9 movq %r8,32(%rsp) movq %rax,40(%rsp) + L$sqr8x_body: .byte 102,72,15,110,209 @@ -705,6 +811,7 @@ L$sqr8x_sub: pxor %xmm0,%xmm0 pshufd $0,%xmm1,%xmm1 movq 40(%rsp),%rsi + jmp L$sqr8x_cond_copy .p2align 5 @@ -734,15 +841,23 @@ L$sqr8x_cond_copy: movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$sqr8x_epilogue: .byte 0xf3,0xc3 + .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 4 #endif diff --git a/mac-x86_64/crypto/bn/x86_64-mont5.S b/mac-x86_64/crypto/bn/x86_64-mont5.S index a154cc8d..24b56de2 100644 --- a/mac-x86_64/crypto/bn/x86_64-mont5.S +++ b/mac-x86_64/crypto/bn/x86_64-mont5.S @@ -8,30 +8,64 @@ .p2align 6 _bn_mul_mont_gather5: + + movl %r9d,%r9d + movq %rsp,%rax + testl $7,%r9d jnz L$mul_enter jmp L$mul4x_enter .p2align 4 L$mul_enter: - movl %r9d,%r9d - movq %rsp,%rax movd 8(%rsp),%xmm5 - leaq L$inc(%rip),%r10 pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - leaq 2(%r9),%r11 - negq %r11 - leaq -264(%rsp,%r11,8),%rsp - andq $-1024,%rsp + negq %r9 + movq %rsp,%r11 + leaq -280(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + + + + + + + + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul_page_walk + jmp L$mul_page_walk_done + +L$mul_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul_page_walk +L$mul_page_walk_done: + + leaq L$inc(%rip),%r10 movq %rax,8(%rsp,%r9,8) + L$mul_body: + leaq 128(%rdx),%r12 movdqa 0(%r10),%xmm0 movdqa 16(%r10),%xmm1 @@ -370,46 +404,65 @@ L$sub: sbbq (%rcx,%r14,8),%rax sbbq $0,%rax xorq %r14,%r14 + andq %rax,%rsi + notq %rax + movq %rdi,%rcx + andq %rax,%rcx movq %r9,%r15 + orq %rcx,%rsi .p2align 4 L$copy: - movq (%rsp,%r14,8),%rsi - movq (%rdi,%r14,8),%rcx - xorq %rcx,%rsi - andq %rax,%rsi - xorq %rcx,%rsi + movq (%rsi,%r14,8),%rax movq %r14,(%rsp,%r14,8) - movq %rsi,(%rdi,%r14,8) + movq %rax,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz L$copy movq 8(%rsp,%r9,8),%rsi + movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$mul_epilogue: .byte 0xf3,0xc3 + .p2align 5 bn_mul4x_mont_gather5: -L$mul4x_enter: + .byte 0x67 movq %rsp,%rax + +L$mul4x_enter: pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +L$mul4x_prologue: + .byte 0x67 shll $3,%r9d leaq (%r9,%r9,2),%r10 @@ -425,46 +478,73 @@ L$mul4x_enter: leaq 
-320(%rsp,%r9,2),%r11 + movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$mul4xsp_alt - subq %r11,%rsp - leaq -320(%rsp,%r9,2),%rsp + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp jmp L$mul4xsp_done .p2align 5 L$mul4xsp_alt: leaq 4096-320(,%r9,2),%r10 - leaq -320(%rsp,%r9,2),%rsp + leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 - subq %r11,%rsp + subq %r11,%rbp L$mul4xsp_done: - andq $-64,%rsp + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mul4x_page_walk + jmp L$mul4x_page_walk_done + +L$mul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mul4x_page_walk +L$mul4x_page_walk_done: + negq %r9 movq %rax,40(%rsp) + L$mul4x_body: call mul4x_internal movq 40(%rsp),%rsi + movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$mul4x_epilogue: .byte 0xf3,0xc3 + .p2align 5 mul4x_internal: shlq $5,%r9 @@ -994,14 +1074,23 @@ L$inner4x: .p2align 5 _bn_power5: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +L$power5_prologue: + shll $3,%r9d leal (%r9,%r9,2),%r10d negq %r9 @@ -1015,24 +1104,41 @@ _bn_power5: leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$pwr_sp_alt - subq %r11,%rsp - leaq -320(%rsp,%r9,2),%rsp + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp jmp L$pwr_sp_done .p2align 5 L$pwr_sp_alt: leaq 4096-320(,%r9,2),%r10 - leaq -320(%rsp,%r9,2),%rsp + leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 - subq %r11,%rsp + subq %r11,%rbp L$pwr_sp_done: - andq $-64,%rsp + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwr_page_walk + jmp L$pwr_page_walk_done + +L$pwr_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwr_page_walk +L$pwr_page_walk_done: + movq %r9,%r10 negq %r9 @@ -1047,6 +1153,7 @@ L$pwr_sp_done: movq %r8,32(%rsp) movq %rax,40(%rsp) + L$power5_body: .byte 102,72,15,110,207 .byte 102,72,15,110,209 @@ -1073,18 +1180,27 @@ L$power5_body: call mul4x_internal movq 40(%rsp),%rsi + movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$power5_epilogue: .byte 0xf3,0xc3 + .globl _bn_sqr8x_internal .private_extern _bn_sqr8x_internal .private_extern _bn_sqr8x_internal @@ -1935,15 +2051,24 @@ _bn_from_montgomery: .p2align 5 bn_from_mont8x: + .byte 0x67 movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +L$from_prologue: + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 @@ -1957,24 +2082,41 @@ bn_from_mont8x: leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$from_sp_alt - subq %r11,%rsp - leaq -320(%rsp,%r9,2),%rsp + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp jmp L$from_sp_done .p2align 5 L$from_sp_alt: leaq 4096-320(,%r9,2),%r10 - leaq -320(%rsp,%r9,2),%rsp + leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 - subq %r11,%rsp + subq %r11,%rbp L$from_sp_done: - andq $-64,%rsp + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$from_page_walk + jmp L$from_page_walk_done + +L$from_page_walk: + leaq 
-4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$from_page_walk +L$from_page_walk_done: + movq %r9,%r10 negq %r9 @@ -1989,6 +2131,7 @@ L$from_sp_done: movq %r8,32(%rsp) movq %rax,40(%rsp) + L$from_body: movq %r9,%r11 leaq 48(%rsp),%rax @@ -2024,11 +2167,12 @@ L$mul_by_1: pxor %xmm0,%xmm0 leaq 48(%rsp),%rax - movq 40(%rsp),%rsi jmp L$from_mont_zero .p2align 5 L$from_mont_zero: + movq 40(%rsp),%rsi + movdqa %xmm0,0(%rax) movdqa %xmm0,16(%rax) movdqa %xmm0,32(%rax) @@ -2039,15 +2183,23 @@ L$from_mont_zero: movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$from_epilogue: .byte 0xf3,0xc3 + .globl _bn_scatter5 .private_extern _bn_scatter5 diff --git a/mac-x86_64/crypto/chacha/chacha-x86_64.S b/mac-x86_64/crypto/chacha/chacha-x86_64.S index c3554c8d..51c0caa7 100644 --- a/mac-x86_64/crypto/chacha/chacha-x86_64.S +++ b/mac-x86_64/crypto/chacha/chacha-x86_64.S @@ -22,6 +22,15 @@ L$rot24: .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe L$sigma: .byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 +.p2align 6 +L$zeroz: +.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 +L$fourz: +.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 +L$incz: +.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +L$sixteen: +.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .globl _ChaCha20_ctr32 .private_extern _ChaCha20_ctr32 @@ -41,6 +50,7 @@ _ChaCha20_ctr32: pushq %r14 pushq %r15 subq $64+24,%rsp +L$ctr32_body: movdqu (%rcx),%xmm1 @@ -278,13 +288,14 @@ L$oop_tail: jnz L$oop_tail L$done: - addq $64+24,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx + leaq 64+24+48(%rsp),%rsi + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$no_data: .byte 0xf3,0xc3 @@ -292,18 +303,12 @@ L$no_data: .p2align 5 ChaCha20_ssse3: L$ChaCha20_ssse3: + movq %rsp,%r9 cmpq $128,%rdx ja L$ChaCha20_4x L$do_sse3_after_all: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - subq $64+24,%rsp + subq $64+8,%rsp movdqa L$sigma(%rip),%xmm0 movdqu (%rcx),%xmm1 movdqu 16(%rcx),%xmm2 @@ -315,7 +320,7 @@ L$do_sse3_after_all: movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) - movl $10,%ebp + movq $10,%r8 jmp L$oop_ssse3 .p2align 5 @@ -325,7 +330,7 @@ L$oop_outer_ssse3: movdqa 16(%rsp),%xmm1 movdqa 32(%rsp),%xmm2 paddd 48(%rsp),%xmm3 - movl $10,%ebp + movq $10,%r8 movdqa %xmm3,48(%rsp) jmp L$oop_ssse3 @@ -374,7 +379,7 @@ L$oop_ssse3: pshufd $78,%xmm2,%xmm2 pshufd $147,%xmm1,%xmm1 pshufd $57,%xmm3,%xmm3 - decl %ebp + decq %r8 jnz L$oop_ssse3 paddd 0(%rsp),%xmm0 paddd 16(%rsp),%xmm1 @@ -411,31 +416,27 @@ L$tail_ssse3: movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) - xorq %rbx,%rbx + xorq %r8,%r8 L$oop_tail_ssse3: - movzbl (%rsi,%rbx,1),%eax - movzbl (%rsp,%rbx,1),%ecx - leaq 1(%rbx),%rbx + movzbl (%rsi,%r8,1),%eax + movzbl (%rsp,%r8,1),%ecx + leaq 1(%r8),%r8 xorl %ecx,%eax - movb %al,-1(%rdi,%rbx,1) + movb %al,-1(%rdi,%r8,1) decq %rdx jnz L$oop_tail_ssse3 L$done_ssse3: - addq $64+24,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx + leaq (%r9),%rsp +L$ssse3_epilogue: .byte 0xf3,0xc3 .p2align 5 ChaCha20_4x: 
L$ChaCha20_4x: + movq %rsp,%r9 movq %r10,%r11 shrq $32,%r10 testq $32,%r10 @@ -448,8 +449,7 @@ L$ChaCha20_4x: je L$do_sse3_after_all L$proceed4x: - leaq -120(%rsp),%r11 - subq $0x148+0,%rsp + subq $0x140+8,%rsp movdqa L$sigma(%rip),%xmm11 movdqu (%rcx),%xmm15 movdqu 16(%rcx),%xmm7 @@ -976,18 +976,18 @@ L$oop_tail4x: jnz L$oop_tail4x L$done4x: - addq $0x148+0,%rsp + leaq (%r9),%rsp +L$4x_epilogue: .byte 0xf3,0xc3 .p2align 5 ChaCha20_8x: L$ChaCha20_8x: - movq %rsp,%r10 + movq %rsp,%r9 subq $0x280+8,%rsp andq $-32,%rsp vzeroupper - movq %r10,640(%rsp) @@ -1578,7 +1578,8 @@ L$oop_tail8x: L$done8x: vzeroall - movq 640(%rsp),%rsp + leaq (%r9),%rsp +L$8x_epilogue: .byte 0xf3,0xc3 #endif diff --git a/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S b/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S index 03cd8725..62d114d9 100644 --- a/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S +++ b/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S @@ -44,7 +44,7 @@ chacha20_poly1305_constants: .p2align 6 poly_hash_ad_internal: -.cfi_startproc + xorq %r10,%r10 xorq %r11,%r11 xorq %r12,%r12 @@ -207,7 +207,7 @@ hash_ad_tail_loop: 1: .byte 0xf3,0xc3 -.cfi_endproc + .globl _chacha20_poly1305_open @@ -215,31 +215,31 @@ hash_ad_tail_loop: .p2align 6 _chacha20_poly1305_open: -.cfi_startproc + pushq %rbp -.cfi_adjust_cfa_offset 8 + pushq %rbx -.cfi_adjust_cfa_offset 8 + pushq %r12 -.cfi_adjust_cfa_offset 8 + pushq %r13 -.cfi_adjust_cfa_offset 8 + pushq %r14 -.cfi_adjust_cfa_offset 8 + pushq %r15 -.cfi_adjust_cfa_offset 8 + pushq %r9 -.cfi_adjust_cfa_offset 8 + subq $288 + 32,%rsp -.cfi_adjust_cfa_offset 288 + 32 -.cfi_offset rbp, -16 -.cfi_offset rbx, -24 -.cfi_offset r12, -32 -.cfi_offset r13, -40 -.cfi_offset r14, -48 -.cfi_offset r15, -56 + + + + + + + leaq 32(%rsp),%rbp andq $-32,%rbp movq %rdx,8+32(%rbp) @@ -1834,26 +1834,26 @@ open_sse_finalize: adcq 8+16(%rbp),%r11 addq $288 + 32,%rsp -.cfi_adjust_cfa_offset -(288 + 32) + popq %r9 -.cfi_adjust_cfa_offset -8 + movq %r10,(%r9) movq %r11,8(%r9) popq %r15 -.cfi_adjust_cfa_offset -8 + popq %r14 -.cfi_adjust_cfa_offset -8 + popq %r13 -.cfi_adjust_cfa_offset -8 + popq %r12 -.cfi_adjust_cfa_offset -8 + popq %rbx -.cfi_adjust_cfa_offset -8 + popq %rbp -.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 -.cfi_adjust_cfa_offset (8 * 6) + 288 + 32 + open_sse_128: movdqu .chacha20_consts(%rip),%xmm0 @@ -2086,7 +2086,7 @@ open_sse_128: jmp 1b jmp open_sse_tail_16 -.cfi_endproc + @@ -2096,31 +2096,31 @@ open_sse_128: .p2align 6 _chacha20_poly1305_seal: -.cfi_startproc + pushq %rbp -.cfi_adjust_cfa_offset 8 + pushq %rbx -.cfi_adjust_cfa_offset 8 + pushq %r12 -.cfi_adjust_cfa_offset 8 + pushq %r13 -.cfi_adjust_cfa_offset 8 + pushq %r14 -.cfi_adjust_cfa_offset 8 + pushq %r15 -.cfi_adjust_cfa_offset 8 + pushq %r9 -.cfi_adjust_cfa_offset 8 + subq $288 + 32,%rsp -.cfi_adjust_cfa_offset 288 + 32 -.cfi_offset rbp, -16 -.cfi_offset rbx, -24 -.cfi_offset r12, -32 -.cfi_offset r13, -40 -.cfi_offset r14, -48 -.cfi_offset r15, -56 + + + + + + + leaq 32(%rsp),%rbp andq $-32,%rbp movq %rdx,8+32(%rbp) @@ -3717,26 +3717,26 @@ seal_sse_finalize: adcq 8+16(%rbp),%r11 addq $288 + 32,%rsp -.cfi_adjust_cfa_offset -(288 + 32) + popq %r9 -.cfi_adjust_cfa_offset -8 + movq %r10,0(%r9) movq %r11,8(%r9) popq %r15 -.cfi_adjust_cfa_offset -8 + popq %r14 -.cfi_adjust_cfa_offset -8 + popq %r13 -.cfi_adjust_cfa_offset -8 + popq %r12 -.cfi_adjust_cfa_offset -8 + popq %rbx -.cfi_adjust_cfa_offset -8 + popq %rbp -.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 -.cfi_adjust_cfa_offset (8 * 6) + 288 + 32 + 
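The Montgomery-multiplication hunks above (the new L$sqr8x_page_walk, L$mul_page_walk, L$mul4x_page_walk, L$pwr_page_walk and L$from_page_walk loops, plus the .L001page_walk loop in x86-mont.S) all follow one pattern: the final stack pointer is computed into a scratch register first, and %rsp is then walked down to it one 4096-byte page at a time with a probe load from each page, so a large stack reservation can never skip over the OS guard page. A minimal C sketch of the idea, assuming 4096-byte pages; the names are illustrative, not BoringSSL API:

#include <stddef.h>

/* Probe every page between the current stack top `hi` and the new,
 * much lower bottom `lo`, mirroring the asm's movq (%rsp),%r10 probe. */
static void page_walk(volatile const unsigned char *hi,
                      const unsigned char *lo) {
    while (hi - lo > 4096) {
        hi -= 4096;
        (void)*hi;              /* touch one page per iteration */
    }
}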
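The L$copy hunk in x86_64-mont5.S above also changes shape: instead of re-masking every word inside the loop (the old xor/and/xor sequence), the new code builds an all-ones-or-all-zeros mask from the final sbb once, selects the copy source pointer with it, and then runs a plain copy loop. A sketch of that branch-free select, assuming 64-bit pointers (illustrative names only, not the library's interface):

#include <stdint.h>

/* mask is 0 or ~0 (the sbb borrow); pick between two sources without
 * a secret-dependent branch, as the new L$copy setup does once
 * before the loop. */
static const uint64_t *select_src(uint64_t mask, const uint64_t *if_ones,
                                  const uint64_t *if_zeros) {
    return (const uint64_t *)(((uintptr_t)if_ones & (uintptr_t)mask) |
                              ((uintptr_t)if_zeros & ~(uintptr_t)mask));
}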
seal_sse_128: movdqu .chacha20_consts(%rip),%xmm0 @@ -8783,5 +8783,5 @@ seal_avx2_short_tail: 1: vzeroupper jmp seal_sse_tail_16 -.cfi_endproc + #endif diff --git a/mac-x86_64/crypto/modes/ghash-x86_64.S b/mac-x86_64/crypto/modes/ghash-x86_64.S index 334f83ff..814d7961 100644 --- a/mac-x86_64/crypto/modes/ghash-x86_64.S +++ b/mac-x86_64/crypto/modes/ghash-x86_64.S @@ -10,6 +10,10 @@ _gcm_gmult_4bit: pushq %rbx pushq %rbp pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $280,%rsp L$gmult_prologue: movzbq 15(%rdi),%r8 @@ -86,8 +90,9 @@ L$break1: movq %r8,8(%rdi) movq %r9,(%rdi) - movq 16(%rsp),%rbx - leaq 24(%rsp),%rsp + leaq 280+48(%rsp),%rsi + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$gmult_epilogue: .byte 0xf3,0xc3 @@ -647,14 +652,14 @@ L$outer_loop: movq %r8,8(%rdi) movq %r9,(%rdi) - leaq 280(%rsp),%rsi - movq 0(%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + leaq 280+48(%rsp),%rsi + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq 0(%rsi),%rsp L$ghash_epilogue: .byte 0xf3,0xc3 diff --git a/mac-x86_64/crypto/sha/sha1-x86_64.S b/mac-x86_64/crypto/sha/sha1-x86_64.S index 0509d451..cf45d8ab 100644 --- a/mac-x86_64/crypto/sha/sha1-x86_64.S +++ b/mac-x86_64/crypto/sha/sha1-x86_64.S @@ -1240,14 +1240,13 @@ L$epilogue: .p2align 4 sha1_block_data_order_ssse3: _ssse3_shortcut: - movq %rsp,%rax + movq %rsp,%r11 pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 leaq -64(%rsp),%rsp - movq %rax,%r14 andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 @@ -1255,7 +1254,7 @@ _ssse3_shortcut: shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 + leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -1267,8 +1266,8 @@ _ssse3_shortcut: xorl %edx,%edi andl %edi,%esi - movdqa 64(%r11),%xmm6 - movdqa -64(%r11),%xmm9 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -1344,7 +1343,7 @@ L$oop_ssse3: pslld $2,%xmm9 pxor %xmm10,%xmm4 xorl %ebp,%edx - movdqa -64(%r11),%xmm10 + movdqa -64(%r14),%xmm10 roll $5,%ecx addl %edi,%ebx andl %edx,%esi @@ -1405,7 +1404,7 @@ L$oop_ssse3: pslld $2,%xmm10 pxor %xmm8,%xmm5 xorl %eax,%ebp - movdqa -32(%r11),%xmm8 + movdqa -32(%r14),%xmm8 roll $5,%edx addl %edi,%ecx andl %ebp,%esi @@ -1466,7 +1465,7 @@ L$oop_ssse3: pslld $2,%xmm8 pxor %xmm9,%xmm6 xorl %ebx,%eax - movdqa -32(%r11),%xmm9 + movdqa -32(%r14),%xmm9 roll $5,%ebp addl %edi,%edx andl %eax,%esi @@ -1527,7 +1526,7 @@ L$oop_ssse3: pslld $2,%xmm9 pxor %xmm10,%xmm7 xorl %ecx,%ebx - movdqa -32(%r11),%xmm10 + movdqa -32(%r14),%xmm10 roll $5,%eax addl %edi,%ebp andl %ebx,%esi @@ -1638,7 +1637,7 @@ L$oop_ssse3: pxor %xmm3,%xmm2 addl %esi,%eax xorl %edx,%edi - movdqa 0(%r11),%xmm10 + movdqa 0(%r14),%xmm10 rorl $7,%ecx paddd %xmm1,%xmm9 addl %ebx,%eax @@ -1873,7 +1872,7 @@ L$oop_ssse3: pxor %xmm0,%xmm7 roll $5,%ebx addl %esi,%eax - movdqa 32(%r11),%xmm9 + movdqa 32(%r14),%xmm9 xorl %ecx,%edi paddd %xmm6,%xmm8 xorl %edx,%ecx @@ -2164,8 +2163,8 @@ L$oop_ssse3: addl %edx,%ecx cmpq %r10,%r9 je L$done_ssse3 - movdqa 64(%r11),%xmm6 - movdqa -64(%r11),%xmm9 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -2402,13 +2401,12 @@ L$done_ssse3: movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq 
(%rsi),%rsp + movq -40(%r11),%r14 + movq -32(%r11),%r13 + movq -24(%r11),%r12 + movq -16(%r11),%rbp + movq -8(%r11),%rbx + leaq (%r11),%rsp L$epilogue_ssse3: .byte 0xf3,0xc3 @@ -2416,7 +2414,7 @@ L$epilogue_ssse3: .p2align 4 sha1_block_data_order_avx: _avx_shortcut: - movq %rsp,%rax + movq %rsp,%r11 pushq %rbx pushq %rbp pushq %r12 @@ -2424,7 +2422,6 @@ _avx_shortcut: pushq %r14 leaq -64(%rsp),%rsp vzeroupper - movq %rax,%r14 andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 @@ -2432,7 +2429,7 @@ _avx_shortcut: shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 + leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -2444,8 +2441,8 @@ _avx_shortcut: xorl %edx,%edi andl %edi,%esi - vmovdqa 64(%r11),%xmm6 - vmovdqa -64(%r11),%xmm11 + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 vmovdqu 0(%r9),%xmm0 vmovdqu 16(%r9),%xmm1 vmovdqu 32(%r9),%xmm2 @@ -2570,7 +2567,7 @@ L$oop_avx: vpxor %xmm10,%xmm5,%xmm5 xorl %eax,%ebp shldl $5,%edx,%edx - vmovdqa -32(%r11),%xmm11 + vmovdqa -32(%r14),%xmm11 addl %edi,%ecx andl %ebp,%esi xorl %eax,%ebp @@ -2783,7 +2780,7 @@ L$oop_avx: addl %esi,%eax xorl %edx,%edi vpaddd %xmm1,%xmm11,%xmm9 - vmovdqa 0(%r11),%xmm11 + vmovdqa 0(%r14),%xmm11 shrdl $7,%ecx,%ecx addl %ebx,%eax vpxor %xmm8,%xmm2,%xmm2 @@ -3002,7 +2999,7 @@ L$oop_avx: movl %ebx,%edi xorl %edx,%esi vpaddd %xmm6,%xmm11,%xmm9 - vmovdqa 32(%r11),%xmm11 + vmovdqa 32(%r14),%xmm11 shldl $5,%ebx,%ebx addl %esi,%eax vpxor %xmm8,%xmm7,%xmm7 @@ -3281,8 +3278,8 @@ L$oop_avx: addl %edx,%ecx cmpq %r10,%r9 je L$done_avx - vmovdqa 64(%r11),%xmm6 - vmovdqa -64(%r11),%xmm11 + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 vmovdqu 0(%r9),%xmm0 vmovdqu 16(%r9),%xmm1 vmovdqu 32(%r9),%xmm2 @@ -3518,13 +3515,12 @@ L$done_avx: movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp + movq -40(%r11),%r14 + movq -32(%r11),%r13 + movq -24(%r11),%r12 + movq -16(%r11),%rbp + movq -8(%r11),%rbx + leaq (%r11),%rsp L$epilogue_avx: .byte 0xf3,0xc3 diff --git a/mac-x86_64/crypto/sha/sha256-x86_64.S b/mac-x86_64/crypto/sha/sha256-x86_64.S index 0146ff5c..f00ef6da 100644 --- a/mac-x86_64/crypto/sha/sha256-x86_64.S +++ b/mac-x86_64/crypto/sha/sha256-x86_64.S @@ -18,13 +18,13 @@ _sha256_block_data_order: je L$avx_shortcut testl $512,%r10d jnz L$ssse3_shortcut + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $64+32,%rsp leaq (%rsi,%rdx,4),%rdx @@ -32,7 +32,7 @@ _sha256_block_data_order: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,64+24(%rsp) L$prologue: movl 0(%rdi),%eax @@ -1697,13 +1697,13 @@ L$rounds_16_xx: jb L$loop movq 64+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$epilogue: .byte 0xf3,0xc3 @@ -1754,13 +1754,13 @@ K256: .p2align 6 sha256_block_data_order_ssse3: L$ssse3_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -1768,7 +1768,7 @@ L$ssse3_shortcut: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,64+24(%rsp) L$prologue_ssse3: 
movl 0(%rdi),%eax @@ -2835,13 +2835,13 @@ L$ssse3_00_47: jb L$loop_ssse3 movq 64+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$epilogue_ssse3: .byte 0xf3,0xc3 @@ -2849,13 +2849,13 @@ L$epilogue_ssse3: .p2align 6 sha256_block_data_order_avx: L$avx_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -2863,7 +2863,7 @@ L$avx_shortcut: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,64+24(%rsp) L$prologue_avx: vzeroupper @@ -3892,13 +3892,13 @@ L$avx_00_47: movq 64+24(%rsp),%rsi vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$epilogue_avx: .byte 0xf3,0xc3 diff --git a/mac-x86_64/crypto/sha/sha512-x86_64.S b/mac-x86_64/crypto/sha/sha512-x86_64.S index aeabd3f4..eabcb3af 100644 --- a/mac-x86_64/crypto/sha/sha512-x86_64.S +++ b/mac-x86_64/crypto/sha/sha512-x86_64.S @@ -18,13 +18,13 @@ _sha512_block_data_order: orl %r9d,%r10d cmpl $1342177792,%r10d je L$avx_shortcut + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $128+32,%rsp leaq (%rsi,%rdx,8),%rdx @@ -32,7 +32,7 @@ _sha512_block_data_order: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) + movq %rax,128+24(%rsp) L$prologue: movq 0(%rdi),%rax @@ -1697,13 +1697,13 @@ L$rounds_16_xx: jb L$loop movq 128+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$epilogue: .byte 0xf3,0xc3 @@ -1798,13 +1798,13 @@ K512: .p2align 6 sha512_block_data_order_xop: L$xop_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx @@ -1812,7 +1812,7 @@ L$xop_shortcut: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) + movq %rax,128+24(%rsp) L$prologue_xop: vzeroupper @@ -2867,13 +2867,13 @@ L$xop_00_47: movq 128+24(%rsp),%rsi vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$epilogue_xop: .byte 0xf3,0xc3 @@ -2881,13 +2881,13 @@ L$epilogue_xop: .p2align 6 sha512_block_data_order_avx: L$avx_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx @@ -2895,7 +2895,7 @@ L$avx_shortcut: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) + movq %rax,128+24(%rsp) L$prologue_avx: vzeroupper @@ 
-4014,13 +4014,13 @@ L$avx_00_47: movq 128+24(%rsp),%rsi vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$epilogue_avx: .byte 0xf3,0xc3 @@ -51,6 +51,7 @@ cc_defaults { "src/crypto/asn1/tasn_new.c", "src/crypto/asn1/tasn_typ.c", "src/crypto/asn1/tasn_utl.c", + "src/crypto/asn1/time_support.c", "src/crypto/asn1/x_bignum.c", "src/crypto/asn1/x_long.c", "src/crypto/base64/base64.c", @@ -202,7 +203,6 @@ cc_defaults { "src/crypto/thread_none.c", "src/crypto/thread_pthread.c", "src/crypto/thread_win.c", - "src/crypto/time_support.c", "src/crypto/x509/a_digest.c", "src/crypto/x509/a_sign.c", "src/crypto/x509/a_strex.c", @@ -466,8 +466,12 @@ cc_defaults { cc_defaults { name: "boringssl_crypto_test_sources", srcs: [ + "src/crypto/chacha/chacha_test.cc", "src/crypto/dh/dh_test.cc", "src/crypto/dsa/dsa_test.cc", + "src/crypto/ec/ec_test.cc", + "src/crypto/err/err_test.cc", + "src/crypto/rsa/rsa_test.cc", "src/crypto/test/gtest_main.cc", ], } @@ -489,7 +493,6 @@ cc_defaults { "src/crypto/bio/bio_test.cc", "src/crypto/bn/bn_test.cc", "src/crypto/bytestring/bytestring_test.cc", - "src/crypto/chacha/chacha_test.cc", "src/crypto/cipher/aead_test.cc", "src/crypto/cipher/cipher_test.cc", "src/crypto/cmac/cmac_test.cc", @@ -498,14 +501,12 @@ cc_defaults { "src/crypto/curve25519/spake25519_test.cc", "src/crypto/curve25519/x25519_test.cc", "src/crypto/digest/digest_test.cc", - "src/crypto/ec/ec_test.cc", "src/crypto/ec/example_mul.c", "src/crypto/ec/p256-x86_64_test.cc", "src/crypto/ecdh/ecdh_test.cc", "src/crypto/ecdsa/ecdsa_sign_test.cc", "src/crypto/ecdsa/ecdsa_test.cc", "src/crypto/ecdsa/ecdsa_verify_test.cc", - "src/crypto/err/err_test.cc", "src/crypto/evp/evp_extra_test.cc", "src/crypto/evp/evp_test.cc", "src/crypto/evp/pbkdf_test.cc", @@ -519,7 +520,6 @@ cc_defaults { "src/crypto/poly1305/poly1305_test.cc", "src/crypto/pool/pool_test.cc", "src/crypto/refcount_test.cc", - "src/crypto/rsa/rsa_test.cc", "src/crypto/thread_test.c", "src/crypto/x509/pkcs7_test.c", "src/crypto/x509/x509_test.cc", @@ -49,6 +49,7 @@ crypto_sources := \ src/crypto/asn1/tasn_new.c\ src/crypto/asn1/tasn_typ.c\ src/crypto/asn1/tasn_utl.c\ + src/crypto/asn1/time_support.c\ src/crypto/asn1/x_bignum.c\ src/crypto/asn1/x_long.c\ src/crypto/base64/base64.c\ @@ -200,7 +201,6 @@ crypto_sources := \ src/crypto/thread_none.c\ src/crypto/thread_pthread.c\ src/crypto/thread_win.c\ - src/crypto/time_support.c\ src/crypto/x509/a_digest.c\ src/crypto/x509/a_sign.c\ src/crypto/x509/a_strex.c\ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e15df7a5..2abf6166 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -240,10 +240,6 @@ endif() # googletest has a very straightforward build. add_library(gtest third_party/googletest/src/gtest-all.cc) target_include_directories(gtest PRIVATE third_party/googletest) -if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") - # TODO(davidben): Make googletest pass -Wmissing-declarations. 
- set_target_properties(gtest PROPERTIES COMPILE_FLAGS "-Wno-missing-declarations") -endif() include_directories(third_party/googletest/include) diff --git a/src/crypto/CMakeLists.txt b/src/crypto/CMakeLists.txt index bbc68d00..fbfc4b27 100644 --- a/src/crypto/CMakeLists.txt +++ b/src/crypto/CMakeLists.txt @@ -129,7 +129,6 @@ add_library( thread_none.c thread_pthread.c thread_win.c - time_support.c $<TARGET_OBJECTS:stack> $<TARGET_OBJECTS:lhash> @@ -212,9 +211,12 @@ add_dependencies(all_tests refcount_test) add_executable( crypto_test + chacha/chacha_test.cc dh/dh_test.cc dsa/dsa_test.cc + ec/ec_test.cc err/err_test.cc + rsa/rsa_test.cc $<TARGET_OBJECTS:gtest_main> $<TARGET_OBJECTS:test_support> diff --git a/src/crypto/aes/asm/aes-586.pl b/src/crypto/aes/asm/aes-586.pl index 9e6e1cc0..45c19fb1 100755 --- a/src/crypto/aes/asm/aes-586.pl +++ b/src/crypto/aes/asm/aes-586.pl @@ -116,7 +116,7 @@ # words every cache-line is *guaranteed* to be accessed within ~50 # cycles window. Why just SSE? Because it's needed on hyper-threading # CPU! Which is also why it's prefetched with 64 byte stride. Best -# part is that it has no negative effect on performance:-) +# part is that it has no negative effect on performance:-) # # Version 4.3 implements switch between compact and non-compact block # functions in AES_cbc_encrypt depending on how much data was asked @@ -578,7 +578,7 @@ sub enctransform() # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ # | mm4 | mm0 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ -# | s3 | s2 | s1 | s0 | +# | s3 | s2 | s1 | s0 | # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ # |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0| # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ @@ -798,7 +798,7 @@ sub encstep() if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] - else { &mov ($tmp,$s[3]); + else { &mov ($tmp,$s[3]); &shr ($tmp,24) } &xor ($out,&DWP(1,$te,$tmp,8)); if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } @@ -1551,7 +1551,7 @@ sub sse_deccompact() &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4 &pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1); &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4 - &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16) + &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16) &pxor ("mm3","mm3"); &pxor ("mm7","mm7"); &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5"); @@ -2021,7 +2021,7 @@ sub declast() { # stack frame layout # -4(%esp) # return address 0(%esp) -# 0(%esp) # s0 backing store 4(%esp) +# 0(%esp) # s0 backing store 4(%esp) # 4(%esp) # s1 backing store 8(%esp) # 8(%esp) # s2 backing store 12(%esp) # 12(%esp) # s3 backing store 16(%esp) @@ -2731,7 +2731,7 @@ sub enckey() &mov (&DWP(80,"edi"),10); # setup number of rounds &xor ("eax","eax"); &jmp (&label("exit")); - + &set_label("12rounds"); &mov ("eax",&DWP(0,"esi")); # copy first 6 dwords &mov ("ebx",&DWP(4,"esi")); diff --git a/src/crypto/aes/asm/aes-x86_64.pl b/src/crypto/aes/asm/aes-x86_64.pl index ed489af1..abf957cc 100644..100755 --- a/src/crypto/aes/asm/aes-x86_64.pl +++ b/src/crypto/aes/asm/aes-x86_64.pl @@ -590,6 +590,7 @@ $code.=<<___; .type asm_AES_encrypt,\@function,3 .hidden asm_AES_encrypt asm_AES_encrypt: + mov %rsp,%rax push %rbx push %rbp push %r12 @@ -598,7 +599,6 @@ asm_AES_encrypt: push %r15 # allocate frame "above" key schedule - mov %rsp,%r10 lea -63(%rdx),%rcx # %rdx is key argument and \$-64,%rsp sub %rsp,%rcx @@ -608,7 +608,7 @@ asm_AES_encrypt: sub \$32,%rsp mov %rsi,16(%rsp) # 
save out - mov %r10,24(%rsp) # save real stack pointer + mov %rax,24(%rsp) # save original stack pointer .Lenc_prologue: mov %rdx,$key @@ -640,13 +640,13 @@ asm_AES_encrypt: mov $s2,8($out) mov $s3,12($out) - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lenc_epilogue: ret .size asm_AES_encrypt,.-asm_AES_encrypt @@ -1186,6 +1186,7 @@ $code.=<<___; .type asm_AES_decrypt,\@function,3 .hidden asm_AES_decrypt asm_AES_decrypt: + mov %rsp,%rax push %rbx push %rbp push %r12 @@ -1194,7 +1195,6 @@ asm_AES_decrypt: push %r15 # allocate frame "above" key schedule - mov %rsp,%r10 lea -63(%rdx),%rcx # %rdx is key argument and \$-64,%rsp sub %rsp,%rcx @@ -1204,7 +1204,7 @@ asm_AES_decrypt: sub \$32,%rsp mov %rsi,16(%rsp) # save out - mov %r10,24(%rsp) # save real stack pointer + mov %rax,24(%rsp) # save original stack pointer .Ldec_prologue: mov %rdx,$key @@ -1238,13 +1238,13 @@ asm_AES_decrypt: mov $s2,8($out) mov $s3,12($out) - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Ldec_epilogue: ret .size asm_AES_decrypt,.-asm_AES_decrypt @@ -1286,7 +1286,7 @@ $code.=<<___; asm_AES_set_encrypt_key: push %rbx push %rbp - push %r12 # redundant, but allows to share + push %r12 # redundant, but allows to share push %r13 # exception handler... push %r14 push %r15 @@ -1412,7 +1412,7 @@ $code.=<<___; xor %rax,%rax jmp .Lexit -.L14rounds: +.L14rounds: mov 0(%rsi),%rax # copy first 8 dwords mov 8(%rsi),%rbx mov 16(%rsi),%rcx @@ -1660,10 +1660,9 @@ asm_AES_cbc_encrypt: mov %r9d,%r9d # clear upper half of enc lea .LAES_Te(%rip),$sbox + lea .LAES_Td(%rip),%r10 cmp \$0,%r9 - jne .Lcbc_picked_te - lea .LAES_Td(%rip),$sbox -.Lcbc_picked_te: + cmoveq %r10,$sbox mov OPENSSL_ia32cap_P(%rip),%r10d cmp \$$speed_limit,%rdx @@ -2565,7 +2564,6 @@ block_se_handler: jae .Lin_block_prologue mov 24(%rax),%rax # pull saved real stack pointer - lea 48(%rax),%rax # adjust... mov -8(%rax),%rbx mov -16(%rax),%rbp diff --git a/src/crypto/aes/asm/aesni-x86.pl b/src/crypto/aes/asm/aesni-x86.pl index 4ef84bc2..e494dd16 100644 --- a/src/crypto/aes/asm/aesni-x86.pl +++ b/src/crypto/aes/asm/aesni-x86.pl @@ -51,7 +51,9 @@ # Westmere 3.77/1.37 1.37 1.52 1.27 # * Bridge 5.07/0.98 0.99 1.09 0.91 # Haswell 4.44/0.80 0.97 1.03 0.72 +# Skylake 2.68/0.65 0.65 0.66 0.64 # Silvermont 5.77/3.56 3.67 4.03 3.46 +# Goldmont 3.84/1.39 1.39 1.63 1.31 # Bulldozer 5.80/0.98 1.05 1.24 0.93 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script @@ -1040,7 +1042,7 @@ if ($PREFIX eq "aesni") { &set_label("ctr32_one_shortcut",16); &movups ($inout0,&QWP(0,$rounds_)); # load ivec &mov ($rounds,&DWP(240,$key)); - + &set_label("ctr32_one"); if ($inline) { &aesni_inline_generate1("enc"); } diff --git a/src/crypto/aes/asm/aesni-x86_64.pl b/src/crypto/aes/asm/aesni-x86_64.pl index 55d5f30a..8ae6dbfa 100644 --- a/src/crypto/aes/asm/aesni-x86_64.pl +++ b/src/crypto/aes/asm/aesni-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). 
You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -27,7 +34,7 @@ # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 -# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 +# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 # @@ -111,7 +118,7 @@ # performance is achieved by interleaving instructions working on # independent blocks. In which case asymptotic limit for such modes # can be obtained by dividing above mentioned numbers by AES -# instructions' interleave factor. Westmere can execute at most 3 +# instructions' interleave factor. Westmere can execute at most 3 # instructions at a time, meaning that optimal interleave factor is 3, # and that's where the "magic" number of 1.25 come from. "Optimal # interleave factor" means that increase of interleave factor does @@ -157,16 +164,23 @@ # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like # in CTR mode AES instruction interleave factor was chosen to be 6x. +# November 2015 +# +# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was +# chosen to be 6x. + ###################################################################### # Current large-block performance in cycles per byte processed with # 128-bit key (less is better). # -# CBC en-/decrypt CTR XTS ECB +# CBC en-/decrypt CTR XTS ECB OCB # Westmere 3.77/1.25 1.25 1.25 1.26 -# * Bridge 5.07/0.74 0.75 0.90 0.85 -# Haswell 4.44/0.63 0.63 0.73 0.63 -# Silvermont 5.75/3.54 3.56 4.12 3.87(*) -# Bulldozer 5.77/0.70 0.72 0.90 0.70 +# * Bridge 5.07/0.74 0.75 0.90 0.85 0.98 +# Haswell 4.44/0.63 0.63 0.73 0.63 0.70 +# Skylake 2.62/0.63 0.63 0.63 0.63 +# Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11 +# Goldmont 3.82/1.26 1.26 1.29 1.29 1.50 +# Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95 # # (*) Atom Silvermont ECB result is suboptimal because of penalties # incurred by operations on %xmm8-15. As ECB is not considered @@ -299,7 +313,7 @@ ___ # on 2x subroutine on Atom Silvermont account. For processors that # can schedule aes[enc|dec] every cycle optimal interleave factor # equals to corresponding instructions latency. 8x is optimal for -# * Bridge and "super-optimal" for other Intel CPUs... +# * Bridge and "super-optimal" for other Intel CPUs... sub aesni_generate2 { my $dir=shift; @@ -1158,7 +1172,7 @@ ___ # with zero-round key xor. 
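The interleave commentary above (the "magic" 1.25 cycles-per-byte bound and the per-CPU interleave factors) comes down to one point: an aesenc result is not needed until the next round, so several independent blocks can share the AES pipeline. A minimal intrinsics sketch of 4-way interleaving, assuming an already-expanded key schedule rk[0..nr] (hypothetical helper, compile with -maes):

#include <wmmintrin.h>   /* AES-NI intrinsics */

/* Four independent blocks walk the rounds together, hiding each
 * aesenc's latency behind the other three blocks' work. */
static void aes_encrypt4(__m128i b[4], const __m128i *rk, int nr) {
    for (int j = 0; j < 4; j++) b[j] = _mm_xor_si128(b[j], rk[0]);
    for (int i = 1; i < nr; i++)
        for (int j = 0; j < 4; j++)
            b[j] = _mm_aesenc_si128(b[j], rk[i]);
    for (int j = 0; j < 4; j++) b[j] = _mm_aesenclast_si128(b[j], rk[nr]);
}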
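The November 2015 note above covers the new aesni_ocb_encrypt/aesni_ocb_decrypt routines added further down in this file. Their comments reference OCB's bookkeeping: the per-block offset advances as Offset_i = Offset_{i-1} ^ L[ntz(i)] (the asm gets ntz via bsf on the block number and scales it by 16 to index the L_ table), and a running checksum XORs together all plaintext blocks. A minimal sketch of one step, assuming GCC/Clang's __builtin_ctzll; the names are illustrative, not the routines' real interface:

#include <stdint.h>

typedef struct { uint8_t b[16]; } block128;

static void xor_block(block128 *dst, const block128 *src) {
    for (int i = 0; i < 16; i++) dst->b[i] ^= src->b[i];
}

/* One OCB step for 1-based block number n: advance the offset by
 * L[ntz(n)] and fold the plaintext block into the checksum. */
static void ocb_step(block128 *offset, block128 *checksum,
                     const block128 *plaintext, uint64_t n,
                     const block128 L[]) {
    xor_block(offset, &L[__builtin_ctzll(n)]);
    xor_block(checksum, plaintext);
}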
{ my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); -my ($key0,$ctr)=("${key_}d","${ivp}d"); +my ($key0,$ctr)=("%ebp","${ivp}d"); my $frame_size = 0x80 + ($win64?160:0); $code.=<<___; @@ -1187,26 +1201,25 @@ $code.=<<___; .align 16 .Lctr32_bulk: - lea (%rsp),%rax + lea (%rsp),$key_ # use $key_ as frame pointer push %rbp sub \$$frame_size,%rsp and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); - movaps %xmm6,-0xa8(%rax) # offload everything - movaps %xmm7,-0x98(%rax) - movaps %xmm8,-0x88(%rax) - movaps %xmm9,-0x78(%rax) - movaps %xmm10,-0x68(%rax) - movaps %xmm11,-0x58(%rax) - movaps %xmm12,-0x48(%rax) - movaps %xmm13,-0x38(%rax) - movaps %xmm14,-0x28(%rax) - movaps %xmm15,-0x18(%rax) + movaps %xmm6,-0xa8($key_) # offload everything + movaps %xmm7,-0x98($key_) + movaps %xmm8,-0x88($key_) + movaps %xmm9,-0x78($key_) + movaps %xmm10,-0x68($key_) + movaps %xmm11,-0x58($key_) + movaps %xmm12,-0x48($key_) + movaps %xmm13,-0x38($key_) + movaps %xmm14,-0x28($key_) + movaps %xmm15,-0x18($key_) .Lctr32_body: ___ $code.=<<___; - lea -8(%rax),%rbp # 8 16-byte words on top of stack are counter values # xor-ed with zero-round key @@ -1258,7 +1271,7 @@ $code.=<<___; lea 7($ctr),%r9 mov %r10d,0x60+12(%rsp) bswap %r9d - mov OPENSSL_ia32cap_P+4(%rip),%r10d + mov OPENSSL_ia32cap_P+4(%rip),%r10d xor $key0,%r9d and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE mov %r9d,0x70+12(%rsp) @@ -1538,7 +1551,7 @@ $code.=<<___; .Lctr32_tail: # note that at this point $inout0..5 are populated with - # counter values xor-ed with 0-round key + # counter values xor-ed with 0-round key lea 16($key),$key cmp \$4,$len jb .Lctr32_loop3 @@ -1678,26 +1691,26 @@ $code.=<<___ if (!$win64); pxor %xmm15,%xmm15 ___ $code.=<<___ if ($win64); - movaps -0xa0(%rbp),%xmm6 - movaps %xmm0,-0xa0(%rbp) # clear stack - movaps -0x90(%rbp),%xmm7 - movaps %xmm0,-0x90(%rbp) - movaps -0x80(%rbp),%xmm8 - movaps %xmm0,-0x80(%rbp) - movaps -0x70(%rbp),%xmm9 - movaps %xmm0,-0x70(%rbp) - movaps -0x60(%rbp),%xmm10 - movaps %xmm0,-0x60(%rbp) - movaps -0x50(%rbp),%xmm11 - movaps %xmm0,-0x50(%rbp) - movaps -0x40(%rbp),%xmm12 - movaps %xmm0,-0x40(%rbp) - movaps -0x30(%rbp),%xmm13 - movaps %xmm0,-0x30(%rbp) - movaps -0x20(%rbp),%xmm14 - movaps %xmm0,-0x20(%rbp) - movaps -0x10(%rbp),%xmm15 - movaps %xmm0,-0x10(%rbp) + movaps -0xa8($key_),%xmm6 + movaps %xmm0,-0xa8($key_) # clear stack + movaps -0x98($key_),%xmm7 + movaps %xmm0,-0x98($key_) + movaps -0x88($key_),%xmm8 + movaps %xmm0,-0x88($key_) + movaps -0x78($key_),%xmm9 + movaps %xmm0,-0x78($key_) + movaps -0x68($key_),%xmm10 + movaps %xmm0,-0x68($key_) + movaps -0x58($key_),%xmm11 + movaps %xmm0,-0x58($key_) + movaps -0x48($key_),%xmm12 + movaps %xmm0,-0x48($key_) + movaps -0x38($key_),%xmm13 + movaps %xmm0,-0x38($key_) + movaps -0x28($key_),%xmm14 + movaps %xmm0,-0x28($key_) + movaps -0x18($key_),%xmm15 + movaps %xmm0,-0x18($key_) movaps %xmm0,0x00(%rsp) movaps %xmm0,0x10(%rsp) movaps %xmm0,0x20(%rsp) @@ -1708,8 +1721,8 @@ $code.=<<___ if ($win64); movaps %xmm0,0x70(%rsp) ___ $code.=<<___; - lea (%rbp),%rsp - pop %rbp + mov -8($key_),%rbp + lea ($key_),%rsp .Lctr32_epilogue: ret .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks @@ -1726,32 +1739,32 @@ my @tweak=map("%xmm$_",(10..15)); my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); my $frame_size = 0x70 + ($win64?160:0); +my $key_ = "%rbp"; # override so that we can use %r11 as FP $code.=<<___; .globl aesni_xts_encrypt .type 
aesni_xts_encrypt,\@function,6 .align 16 aesni_xts_encrypt: - lea (%rsp),%rax + lea (%rsp),%r11 # frame pointer push %rbp sub \$$frame_size,%rsp and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); - movaps %xmm6,-0xa8(%rax) # offload everything - movaps %xmm7,-0x98(%rax) - movaps %xmm8,-0x88(%rax) - movaps %xmm9,-0x78(%rax) - movaps %xmm10,-0x68(%rax) - movaps %xmm11,-0x58(%rax) - movaps %xmm12,-0x48(%rax) - movaps %xmm13,-0x38(%rax) - movaps %xmm14,-0x28(%rax) - movaps %xmm15,-0x18(%rax) + movaps %xmm6,-0xa8(%r11) # offload everything + movaps %xmm7,-0x98(%r11) + movaps %xmm8,-0x88(%r11) + movaps %xmm9,-0x78(%r11) + movaps %xmm10,-0x68(%r11) + movaps %xmm11,-0x58(%r11) + movaps %xmm12,-0x48(%r11) + movaps %xmm13,-0x38(%r11) + movaps %xmm14,-0x28(%r11) + movaps %xmm15,-0x18(%r11) .Lxts_enc_body: ___ $code.=<<___; - lea -8(%rax),%rbp movups ($ivp),$inout0 # load clear-text tweak mov 240(%r8),$rounds # key2->rounds mov 240($key),$rnds_ # key1->rounds @@ -2169,26 +2182,26 @@ $code.=<<___ if (!$win64); pxor %xmm15,%xmm15 ___ $code.=<<___ if ($win64); - movaps -0xa0(%rbp),%xmm6 - movaps %xmm0,-0xa0(%rbp) # clear stack - movaps -0x90(%rbp),%xmm7 - movaps %xmm0,-0x90(%rbp) - movaps -0x80(%rbp),%xmm8 - movaps %xmm0,-0x80(%rbp) - movaps -0x70(%rbp),%xmm9 - movaps %xmm0,-0x70(%rbp) - movaps -0x60(%rbp),%xmm10 - movaps %xmm0,-0x60(%rbp) - movaps -0x50(%rbp),%xmm11 - movaps %xmm0,-0x50(%rbp) - movaps -0x40(%rbp),%xmm12 - movaps %xmm0,-0x40(%rbp) - movaps -0x30(%rbp),%xmm13 - movaps %xmm0,-0x30(%rbp) - movaps -0x20(%rbp),%xmm14 - movaps %xmm0,-0x20(%rbp) - movaps -0x10(%rbp),%xmm15 - movaps %xmm0,-0x10(%rbp) + movaps -0xa8(%r11),%xmm6 + movaps %xmm0,-0xa8(%r11) # clear stack + movaps -0x98(%r11),%xmm7 + movaps %xmm0,-0x98(%r11) + movaps -0x88(%r11),%xmm8 + movaps %xmm0,-0x88(%r11) + movaps -0x78(%r11),%xmm9 + movaps %xmm0,-0x78(%r11) + movaps -0x68(%r11),%xmm10 + movaps %xmm0,-0x68(%r11) + movaps -0x58(%r11),%xmm11 + movaps %xmm0,-0x58(%r11) + movaps -0x48(%r11),%xmm12 + movaps %xmm0,-0x48(%r11) + movaps -0x38(%r11),%xmm13 + movaps %xmm0,-0x38(%r11) + movaps -0x28(%r11),%xmm14 + movaps %xmm0,-0x28(%r11) + movaps -0x18(%r11),%xmm15 + movaps %xmm0,-0x18(%r11) movaps %xmm0,0x00(%rsp) movaps %xmm0,0x10(%rsp) movaps %xmm0,0x20(%rsp) @@ -2198,8 +2211,8 @@ $code.=<<___ if ($win64); movaps %xmm0,0x60(%rsp) ___ $code.=<<___; - lea (%rbp),%rsp - pop %rbp + mov -8(%r11),%rbp + lea (%r11),%rsp .Lxts_enc_epilogue: ret .size aesni_xts_encrypt,.-aesni_xts_encrypt @@ -2210,26 +2223,25 @@ $code.=<<___; .type aesni_xts_decrypt,\@function,6 .align 16 aesni_xts_decrypt: - lea (%rsp),%rax + lea (%rsp),%r11 # frame pointer push %rbp sub \$$frame_size,%rsp and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); - movaps %xmm6,-0xa8(%rax) # offload everything - movaps %xmm7,-0x98(%rax) - movaps %xmm8,-0x88(%rax) - movaps %xmm9,-0x78(%rax) - movaps %xmm10,-0x68(%rax) - movaps %xmm11,-0x58(%rax) - movaps %xmm12,-0x48(%rax) - movaps %xmm13,-0x38(%rax) - movaps %xmm14,-0x28(%rax) - movaps %xmm15,-0x18(%rax) + movaps %xmm6,-0xa8(%r11) # offload everything + movaps %xmm7,-0x98(%r11) + movaps %xmm8,-0x88(%r11) + movaps %xmm9,-0x78(%r11) + movaps %xmm10,-0x68(%r11) + movaps %xmm11,-0x58(%r11) + movaps %xmm12,-0x48(%r11) + movaps %xmm13,-0x38(%r11) + movaps %xmm14,-0x28(%r11) + movaps %xmm15,-0x18(%r11) .Lxts_dec_body: ___ $code.=<<___; - lea -8(%rax),%rbp movups ($ivp),$inout0 # load clear-text tweak mov 240($key2),$rounds # key2->rounds mov 240($key),$rnds_ 
# key1->rounds @@ -2673,26 +2685,26 @@ $code.=<<___ if (!$win64); pxor %xmm15,%xmm15 ___ $code.=<<___ if ($win64); - movaps -0xa0(%rbp),%xmm6 - movaps %xmm0,-0xa0(%rbp) # clear stack - movaps -0x90(%rbp),%xmm7 - movaps %xmm0,-0x90(%rbp) - movaps -0x80(%rbp),%xmm8 - movaps %xmm0,-0x80(%rbp) - movaps -0x70(%rbp),%xmm9 - movaps %xmm0,-0x70(%rbp) - movaps -0x60(%rbp),%xmm10 - movaps %xmm0,-0x60(%rbp) - movaps -0x50(%rbp),%xmm11 - movaps %xmm0,-0x50(%rbp) - movaps -0x40(%rbp),%xmm12 - movaps %xmm0,-0x40(%rbp) - movaps -0x30(%rbp),%xmm13 - movaps %xmm0,-0x30(%rbp) - movaps -0x20(%rbp),%xmm14 - movaps %xmm0,-0x20(%rbp) - movaps -0x10(%rbp),%xmm15 - movaps %xmm0,-0x10(%rbp) + movaps -0xa8(%r11),%xmm6 + movaps %xmm0,-0xa8(%r11) # clear stack + movaps -0x98(%r11),%xmm7 + movaps %xmm0,-0x98(%r11) + movaps -0x88(%r11),%xmm8 + movaps %xmm0,-0x88(%r11) + movaps -0x78(%r11),%xmm9 + movaps %xmm0,-0x78(%r11) + movaps -0x68(%r11),%xmm10 + movaps %xmm0,-0x68(%r11) + movaps -0x58(%r11),%xmm11 + movaps %xmm0,-0x58(%r11) + movaps -0x48(%r11),%xmm12 + movaps %xmm0,-0x48(%r11) + movaps -0x38(%r11),%xmm13 + movaps %xmm0,-0x38(%r11) + movaps -0x28(%r11),%xmm14 + movaps %xmm0,-0x28(%r11) + movaps -0x18(%r11),%xmm15 + movaps %xmm0,-0x18(%r11) movaps %xmm0,0x00(%rsp) movaps %xmm0,0x10(%rsp) movaps %xmm0,0x20(%rsp) @@ -2702,12 +2714,933 @@ $code.=<<___ if ($win64); movaps %xmm0,0x60(%rsp) ___ $code.=<<___; - lea (%rbp),%rsp - pop %rbp + mov -8(%r11),%rbp + lea (%r11),%rsp .Lxts_dec_epilogue: ret .size aesni_xts_decrypt,.-aesni_xts_decrypt ___ +} + +###################################################################### +# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks, +# const AES_KEY *key, unsigned int start_block_num, +# unsigned char offset_i[16], const unsigned char L_[][16], +# unsigned char checksum[16]); +# +{ +my @offset=map("%xmm$_",(10..15)); +my ($checksum,$rndkey0l)=("%xmm8","%xmm9"); +my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments +my ($L_p,$checksum_p) = ("%rbx","%rbp"); +my ($i1,$i3,$i5) = ("%r12","%r13","%r14"); +my $seventh_arg = $win64 ? 56 : 8; +my $blocks = $len; + +$code.=<<___; +.globl aesni_ocb_encrypt +.type aesni_ocb_encrypt,\@function,6 +.align 32 +aesni_ocb_encrypt: + lea (%rsp),%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 +___ +$code.=<<___ if ($win64); + lea -0xa0(%rsp),%rsp + movaps %xmm6,0x00(%rsp) # offload everything + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,0x60(%rsp) + movaps %xmm13,0x70(%rsp) + movaps %xmm14,0x80(%rsp) + movaps %xmm15,0x90(%rsp) +.Locb_enc_body: +___ +$code.=<<___; + mov $seventh_arg(%rax),$L_p # 7th argument + mov $seventh_arg+8(%rax),$checksum_p# 8th argument + + mov 240($key),$rnds_ + mov $key,$key_ + shl \$4,$rnds_ + $movkey ($key),$rndkey0l # round[0] + $movkey 16($key,$rnds_),$rndkey1 # round[last] + + movdqu ($offset_p),@offset[5] # load last offset_i + pxor $rndkey1,$rndkey0l # round[0] ^ round[last] + pxor $rndkey1,@offset[5] # offset_i ^ round[last] + + mov \$16+32,$rounds + lea 32($key_,$rnds_),$key + $movkey 16($key_),$rndkey1 # round[1] + sub %r10,%rax # twisted $rounds + mov %rax,%r10 # backup twisted $rounds + + movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks + movdqu ($checksum_p),$checksum # load checksum + + test \$1,$block_num # is first block number odd? 
+ jnz .Locb_enc_odd + + bsf $block_num,$i1 + add \$1,$block_num + shl \$4,$i1 + movdqu ($L_p,$i1),$inout5 # borrow + movdqu ($inp),$inout0 + lea 16($inp),$inp + + call __ocb_encrypt1 + + movdqa $inout5,@offset[5] + movups $inout0,($out) + lea 16($out),$out + sub \$1,$blocks + jz .Locb_enc_done + +.Locb_enc_odd: + lea 1($block_num),$i1 # even-numbered blocks + lea 3($block_num),$i3 + lea 5($block_num),$i5 + lea 6($block_num),$block_num + bsf $i1,$i1 # ntz(block) + bsf $i3,$i3 + bsf $i5,$i5 + shl \$4,$i1 # ntz(block) -> table offset + shl \$4,$i3 + shl \$4,$i5 + + sub \$6,$blocks + jc .Locb_enc_short + jmp .Locb_enc_grandloop + +.align 32 +.Locb_enc_grandloop: + movdqu `16*0`($inp),$inout0 # load input + movdqu `16*1`($inp),$inout1 + movdqu `16*2`($inp),$inout2 + movdqu `16*3`($inp),$inout3 + movdqu `16*4`($inp),$inout4 + movdqu `16*5`($inp),$inout5 + lea `16*6`($inp),$inp + + call __ocb_encrypt6 + + movups $inout0,`16*0`($out) # store output + movups $inout1,`16*1`($out) + movups $inout2,`16*2`($out) + movups $inout3,`16*3`($out) + movups $inout4,`16*4`($out) + movups $inout5,`16*5`($out) + lea `16*6`($out),$out + sub \$6,$blocks + jnc .Locb_enc_grandloop + +.Locb_enc_short: + add \$6,$blocks + jz .Locb_enc_done + + movdqu `16*0`($inp),$inout0 + cmp \$2,$blocks + jb .Locb_enc_one + movdqu `16*1`($inp),$inout1 + je .Locb_enc_two + + movdqu `16*2`($inp),$inout2 + cmp \$4,$blocks + jb .Locb_enc_three + movdqu `16*3`($inp),$inout3 + je .Locb_enc_four + + movdqu `16*4`($inp),$inout4 + pxor $inout5,$inout5 + + call __ocb_encrypt6 + + movdqa @offset[4],@offset[5] + movups $inout0,`16*0`($out) + movups $inout1,`16*1`($out) + movups $inout2,`16*2`($out) + movups $inout3,`16*3`($out) + movups $inout4,`16*4`($out) + + jmp .Locb_enc_done + +.align 16 +.Locb_enc_one: + movdqa @offset[0],$inout5 # borrow + + call __ocb_encrypt1 + + movdqa $inout5,@offset[5] + movups $inout0,`16*0`($out) + jmp .Locb_enc_done + +.align 16 +.Locb_enc_two: + pxor $inout2,$inout2 + pxor $inout3,$inout3 + + call __ocb_encrypt4 + + movdqa @offset[1],@offset[5] + movups $inout0,`16*0`($out) + movups $inout1,`16*1`($out) + + jmp .Locb_enc_done + +.align 16 +.Locb_enc_three: + pxor $inout3,$inout3 + + call __ocb_encrypt4 + + movdqa @offset[2],@offset[5] + movups $inout0,`16*0`($out) + movups $inout1,`16*1`($out) + movups $inout2,`16*2`($out) + + jmp .Locb_enc_done + +.align 16 +.Locb_enc_four: + call __ocb_encrypt4 + + movdqa @offset[3],@offset[5] + movups $inout0,`16*0`($out) + movups $inout1,`16*1`($out) + movups $inout2,`16*2`($out) + movups $inout3,`16*3`($out) + +.Locb_enc_done: + pxor $rndkey0,@offset[5] # "remove" round[last] + movdqu $checksum,($checksum_p) # store checksum + movdqu @offset[5],($offset_p) # store last offset_i + + xorps %xmm0,%xmm0 # clear register bank + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +$code.=<<___ if (!$win64); + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + lea 0x28(%rsp),%rax +___ +$code.=<<___ if ($win64); + movaps 0x00(%rsp),%xmm6 + movaps %xmm0,0x00(%rsp) # clear stack + movaps 0x10(%rsp),%xmm7 + movaps %xmm0,0x10(%rsp) + movaps 0x20(%rsp),%xmm8 + movaps %xmm0,0x20(%rsp) + movaps 0x30(%rsp),%xmm9 + movaps %xmm0,0x30(%rsp) + movaps 0x40(%rsp),%xmm10 + movaps %xmm0,0x40(%rsp) + movaps 0x50(%rsp),%xmm11 + movaps %xmm0,0x50(%rsp) + movaps 0x60(%rsp),%xmm12 + movaps %xmm0,0x60(%rsp) + 
movaps 0x70(%rsp),%xmm13 + movaps %xmm0,0x70(%rsp) + movaps 0x80(%rsp),%xmm14 + movaps %xmm0,0x80(%rsp) + movaps 0x90(%rsp),%xmm15 + movaps %xmm0,0x90(%rsp) + lea 0xa0+0x28(%rsp),%rax +.Locb_enc_pop: +___ +$code.=<<___; + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Locb_enc_epilogue: + ret +.size aesni_ocb_encrypt,.-aesni_ocb_encrypt + +.type __ocb_encrypt6,\@abi-omnipotent +.align 32 +__ocb_encrypt6: + pxor $rndkey0l,@offset[5] # offset_i ^ round[0] + movdqu ($L_p,$i1),@offset[1] + movdqa @offset[0],@offset[2] + movdqu ($L_p,$i3),@offset[3] + movdqa @offset[0],@offset[4] + pxor @offset[5],@offset[0] + movdqu ($L_p,$i5),@offset[5] + pxor @offset[0],@offset[1] + pxor $inout0,$checksum # accumulate checksum + pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i + pxor @offset[1],@offset[2] + pxor $inout1,$checksum + pxor @offset[1],$inout1 + pxor @offset[2],@offset[3] + pxor $inout2,$checksum + pxor @offset[2],$inout2 + pxor @offset[3],@offset[4] + pxor $inout3,$checksum + pxor @offset[3],$inout3 + pxor @offset[4],@offset[5] + pxor $inout4,$checksum + pxor @offset[4],$inout4 + pxor $inout5,$checksum + pxor @offset[5],$inout5 + $movkey 32($key_),$rndkey0 + + lea 1($block_num),$i1 # even-numbered blocks + lea 3($block_num),$i3 + lea 5($block_num),$i5 + add \$6,$block_num + pxor $rndkey0l,@offset[0] # offset_i ^ round[last] + bsf $i1,$i1 # ntz(block) + bsf $i3,$i3 + bsf $i5,$i5 + + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + pxor $rndkey0l,@offset[1] + pxor $rndkey0l,@offset[2] + aesenc $rndkey1,$inout4 + pxor $rndkey0l,@offset[3] + pxor $rndkey0l,@offset[4] + aesenc $rndkey1,$inout5 + $movkey 48($key_),$rndkey1 + pxor $rndkey0l,@offset[5] + + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey 64($key_),$rndkey0 + shl \$4,$i1 # ntz(block) -> table offset + shl \$4,$i3 + jmp .Locb_enc_loop6 + +.align 32 +.Locb_enc_loop6: + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_enc_loop6 + + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + $movkey 16($key_),$rndkey1 + shl \$4,$i5 + + aesenclast @offset[0],$inout0 + movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks + mov %r10,%rax # restore twisted rounds + aesenclast @offset[1],$inout1 + aesenclast @offset[2],$inout2 + aesenclast @offset[3],$inout3 + aesenclast @offset[4],$inout4 + aesenclast @offset[5],$inout5 + ret +.size __ocb_encrypt6,.-__ocb_encrypt6 + +.type __ocb_encrypt4,\@abi-omnipotent +.align 32 +__ocb_encrypt4: + pxor $rndkey0l,@offset[5] # offset_i ^ round[0] + movdqu ($L_p,$i1),@offset[1] + movdqa @offset[0],@offset[2] + movdqu ($L_p,$i3),@offset[3] + pxor @offset[5],@offset[0] + pxor @offset[0],@offset[1] + pxor $inout0,$checksum # accumulate checksum + pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i + pxor @offset[1],@offset[2] + pxor $inout1,$checksum + pxor @offset[1],$inout1 + pxor @offset[2],@offset[3] + pxor 
$inout2,$checksum + pxor @offset[2],$inout2 + pxor $inout3,$checksum + pxor @offset[3],$inout3 + $movkey 32($key_),$rndkey0 + + pxor $rndkey0l,@offset[0] # offset_i ^ round[last] + pxor $rndkey0l,@offset[1] + pxor $rndkey0l,@offset[2] + pxor $rndkey0l,@offset[3] + + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + $movkey 48($key_),$rndkey1 + + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + $movkey 64($key_),$rndkey0 + jmp .Locb_enc_loop4 + +.align 32 +.Locb_enc_loop4: + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_enc_loop4 + + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + $movkey 16($key_),$rndkey1 + mov %r10,%rax # restore twisted rounds + + aesenclast @offset[0],$inout0 + aesenclast @offset[1],$inout1 + aesenclast @offset[2],$inout2 + aesenclast @offset[3],$inout3 + ret +.size __ocb_encrypt4,.-__ocb_encrypt4 + +.type __ocb_encrypt1,\@abi-omnipotent +.align 32 +__ocb_encrypt1: + pxor @offset[5],$inout5 # offset_i + pxor $rndkey0l,$inout5 # offset_i ^ round[0] + pxor $inout0,$checksum # accumulate checksum + pxor $inout5,$inout0 # input ^ round[0] ^ offset_i + $movkey 32($key_),$rndkey0 + + aesenc $rndkey1,$inout0 + $movkey 48($key_),$rndkey1 + pxor $rndkey0l,$inout5 # offset_i ^ round[last] + + aesenc $rndkey0,$inout0 + $movkey 64($key_),$rndkey0 + jmp .Locb_enc_loop1 + +.align 32 +.Locb_enc_loop1: + aesenc $rndkey1,$inout0 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesenc $rndkey0,$inout0 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_enc_loop1 + + aesenc $rndkey1,$inout0 + $movkey 16($key_),$rndkey1 # redundant in tail + mov %r10,%rax # restore twisted rounds + + aesenclast $inout5,$inout0 + ret +.size __ocb_encrypt1,.-__ocb_encrypt1 + +.globl aesni_ocb_decrypt +.type aesni_ocb_decrypt,\@function,6 +.align 32 +aesni_ocb_decrypt: + lea (%rsp),%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 +___ +$code.=<<___ if ($win64); + lea -0xa0(%rsp),%rsp + movaps %xmm6,0x00(%rsp) # offload everything + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,0x60(%rsp) + movaps %xmm13,0x70(%rsp) + movaps %xmm14,0x80(%rsp) + movaps %xmm15,0x90(%rsp) +.Locb_dec_body: +___ +$code.=<<___; + mov $seventh_arg(%rax),$L_p # 7th argument + mov $seventh_arg+8(%rax),$checksum_p# 8th argument + + mov 240($key),$rnds_ + mov $key,$key_ + shl \$4,$rnds_ + $movkey ($key),$rndkey0l # round[0] + $movkey 16($key,$rnds_),$rndkey1 # round[last] + + movdqu ($offset_p),@offset[5] # load last offset_i + pxor $rndkey1,$rndkey0l # round[0] ^ round[last] + pxor $rndkey1,@offset[5] # offset_i ^ round[last] + + mov \$16+32,$rounds + lea 32($key_,$rnds_),$key + $movkey 16($key_),$rndkey1 # round[1] + sub %r10,%rax # twisted $rounds + mov %rax,%r10 # backup twisted $rounds + + movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks + movdqu ($checksum_p),$checksum # load checksum + + test \$1,$block_num # is first block number odd? 
+ jnz .Locb_dec_odd + + bsf $block_num,$i1 + add \$1,$block_num + shl \$4,$i1 + movdqu ($L_p,$i1),$inout5 # borrow + movdqu ($inp),$inout0 + lea 16($inp),$inp + + call __ocb_decrypt1 + + movdqa $inout5,@offset[5] + movups $inout0,($out) + xorps $inout0,$checksum # accumulate checksum + lea 16($out),$out + sub \$1,$blocks + jz .Locb_dec_done + +.Locb_dec_odd: + lea 1($block_num),$i1 # even-numbered blocks + lea 3($block_num),$i3 + lea 5($block_num),$i5 + lea 6($block_num),$block_num + bsf $i1,$i1 # ntz(block) + bsf $i3,$i3 + bsf $i5,$i5 + shl \$4,$i1 # ntz(block) -> table offset + shl \$4,$i3 + shl \$4,$i5 + + sub \$6,$blocks + jc .Locb_dec_short + jmp .Locb_dec_grandloop + +.align 32 +.Locb_dec_grandloop: + movdqu `16*0`($inp),$inout0 # load input + movdqu `16*1`($inp),$inout1 + movdqu `16*2`($inp),$inout2 + movdqu `16*3`($inp),$inout3 + movdqu `16*4`($inp),$inout4 + movdqu `16*5`($inp),$inout5 + lea `16*6`($inp),$inp + + call __ocb_decrypt6 + + movups $inout0,`16*0`($out) # store output + pxor $inout0,$checksum # accumulate checksum + movups $inout1,`16*1`($out) + pxor $inout1,$checksum + movups $inout2,`16*2`($out) + pxor $inout2,$checksum + movups $inout3,`16*3`($out) + pxor $inout3,$checksum + movups $inout4,`16*4`($out) + pxor $inout4,$checksum + movups $inout5,`16*5`($out) + pxor $inout5,$checksum + lea `16*6`($out),$out + sub \$6,$blocks + jnc .Locb_dec_grandloop + +.Locb_dec_short: + add \$6,$blocks + jz .Locb_dec_done + + movdqu `16*0`($inp),$inout0 + cmp \$2,$blocks + jb .Locb_dec_one + movdqu `16*1`($inp),$inout1 + je .Locb_dec_two + + movdqu `16*2`($inp),$inout2 + cmp \$4,$blocks + jb .Locb_dec_three + movdqu `16*3`($inp),$inout3 + je .Locb_dec_four + + movdqu `16*4`($inp),$inout4 + pxor $inout5,$inout5 + + call __ocb_decrypt6 + + movdqa @offset[4],@offset[5] + movups $inout0,`16*0`($out) # store output + pxor $inout0,$checksum # accumulate checksum + movups $inout1,`16*1`($out) + pxor $inout1,$checksum + movups $inout2,`16*2`($out) + pxor $inout2,$checksum + movups $inout3,`16*3`($out) + pxor $inout3,$checksum + movups $inout4,`16*4`($out) + pxor $inout4,$checksum + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_one: + movdqa @offset[0],$inout5 # borrow + + call __ocb_decrypt1 + + movdqa $inout5,@offset[5] + movups $inout0,`16*0`($out) # store output + xorps $inout0,$checksum # accumulate checksum + jmp .Locb_dec_done + +.align 16 +.Locb_dec_two: + pxor $inout2,$inout2 + pxor $inout3,$inout3 + + call __ocb_decrypt4 + + movdqa @offset[1],@offset[5] + movups $inout0,`16*0`($out) # store output + xorps $inout0,$checksum # accumulate checksum + movups $inout1,`16*1`($out) + xorps $inout1,$checksum + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_three: + pxor $inout3,$inout3 + + call __ocb_decrypt4 + + movdqa @offset[2],@offset[5] + movups $inout0,`16*0`($out) # store output + xorps $inout0,$checksum # accumulate checksum + movups $inout1,`16*1`($out) + xorps $inout1,$checksum + movups $inout2,`16*2`($out) + xorps $inout2,$checksum + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_four: + call __ocb_decrypt4 + + movdqa @offset[3],@offset[5] + movups $inout0,`16*0`($out) # store output + pxor $inout0,$checksum # accumulate checksum + movups $inout1,`16*1`($out) + pxor $inout1,$checksum + movups $inout2,`16*2`($out) + pxor $inout2,$checksum + movups $inout3,`16*3`($out) + pxor $inout3,$checksum + +.Locb_dec_done: + pxor $rndkey0,@offset[5] # "remove" round[last] + movdqu $checksum,($checksum_p) # store checksum + movdqu @offset[5],($offset_p) # store last offset_i + + xorps %xmm0,%xmm0 # 
clear register bank + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +$code.=<<___ if (!$win64); + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + lea 0x28(%rsp),%rax +___ +$code.=<<___ if ($win64); + movaps 0x00(%rsp),%xmm6 + movaps %xmm0,0x00(%rsp) # clear stack + movaps 0x10(%rsp),%xmm7 + movaps %xmm0,0x10(%rsp) + movaps 0x20(%rsp),%xmm8 + movaps %xmm0,0x20(%rsp) + movaps 0x30(%rsp),%xmm9 + movaps %xmm0,0x30(%rsp) + movaps 0x40(%rsp),%xmm10 + movaps %xmm0,0x40(%rsp) + movaps 0x50(%rsp),%xmm11 + movaps %xmm0,0x50(%rsp) + movaps 0x60(%rsp),%xmm12 + movaps %xmm0,0x60(%rsp) + movaps 0x70(%rsp),%xmm13 + movaps %xmm0,0x70(%rsp) + movaps 0x80(%rsp),%xmm14 + movaps %xmm0,0x80(%rsp) + movaps 0x90(%rsp),%xmm15 + movaps %xmm0,0x90(%rsp) + lea 0xa0+0x28(%rsp),%rax +.Locb_dec_pop: +___ +$code.=<<___; + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp +.Locb_dec_epilogue: + ret +.size aesni_ocb_decrypt,.-aesni_ocb_decrypt + +.type __ocb_decrypt6,\@abi-omnipotent +.align 32 +__ocb_decrypt6: + pxor $rndkey0l,@offset[5] # offset_i ^ round[0] + movdqu ($L_p,$i1),@offset[1] + movdqa @offset[0],@offset[2] + movdqu ($L_p,$i3),@offset[3] + movdqa @offset[0],@offset[4] + pxor @offset[5],@offset[0] + movdqu ($L_p,$i5),@offset[5] + pxor @offset[0],@offset[1] + pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i + pxor @offset[1],@offset[2] + pxor @offset[1],$inout1 + pxor @offset[2],@offset[3] + pxor @offset[2],$inout2 + pxor @offset[3],@offset[4] + pxor @offset[3],$inout3 + pxor @offset[4],@offset[5] + pxor @offset[4],$inout4 + pxor @offset[5],$inout5 + $movkey 32($key_),$rndkey0 + + lea 1($block_num),$i1 # even-numbered blocks + lea 3($block_num),$i3 + lea 5($block_num),$i5 + add \$6,$block_num + pxor $rndkey0l,@offset[0] # offset_i ^ round[last] + bsf $i1,$i1 # ntz(block) + bsf $i3,$i3 + bsf $i5,$i5 + + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + pxor $rndkey0l,@offset[1] + pxor $rndkey0l,@offset[2] + aesdec $rndkey1,$inout4 + pxor $rndkey0l,@offset[3] + pxor $rndkey0l,@offset[4] + aesdec $rndkey1,$inout5 + $movkey 48($key_),$rndkey1 + pxor $rndkey0l,@offset[5] + + aesdec $rndkey0,$inout0 + aesdec $rndkey0,$inout1 + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + aesdec $rndkey0,$inout4 + aesdec $rndkey0,$inout5 + $movkey 64($key_),$rndkey0 + shl \$4,$i1 # ntz(block) -> table offset + shl \$4,$i3 + jmp .Locb_dec_loop6 + +.align 32 +.Locb_dec_loop6: + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesdec $rndkey0,$inout0 + aesdec $rndkey0,$inout1 + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + aesdec $rndkey0,$inout4 + aesdec $rndkey0,$inout5 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_dec_loop6 + + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 + $movkey 16($key_),$rndkey1 + shl \$4,$i5 + + aesdeclast @offset[0],$inout0 + movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks + mov %r10,%rax # restore twisted rounds + aesdeclast @offset[1],$inout1 + aesdeclast @offset[2],$inout2 + aesdeclast 
@offset[3],$inout3 + aesdeclast @offset[4],$inout4 + aesdeclast @offset[5],$inout5 + ret +.size __ocb_decrypt6,.-__ocb_decrypt6 + +.type __ocb_decrypt4,\@abi-omnipotent +.align 32 +__ocb_decrypt4: + pxor $rndkey0l,@offset[5] # offset_i ^ round[0] + movdqu ($L_p,$i1),@offset[1] + movdqa @offset[0],@offset[2] + movdqu ($L_p,$i3),@offset[3] + pxor @offset[5],@offset[0] + pxor @offset[0],@offset[1] + pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i + pxor @offset[1],@offset[2] + pxor @offset[1],$inout1 + pxor @offset[2],@offset[3] + pxor @offset[2],$inout2 + pxor @offset[3],$inout3 + $movkey 32($key_),$rndkey0 + + pxor $rndkey0l,@offset[0] # offset_i ^ round[last] + pxor $rndkey0l,@offset[1] + pxor $rndkey0l,@offset[2] + pxor $rndkey0l,@offset[3] + + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + $movkey 48($key_),$rndkey1 + + aesdec $rndkey0,$inout0 + aesdec $rndkey0,$inout1 + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + $movkey 64($key_),$rndkey0 + jmp .Locb_dec_loop4 + +.align 32 +.Locb_dec_loop4: + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesdec $rndkey0,$inout0 + aesdec $rndkey0,$inout1 + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_dec_loop4 + + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + $movkey 16($key_),$rndkey1 + mov %r10,%rax # restore twisted rounds + + aesdeclast @offset[0],$inout0 + aesdeclast @offset[1],$inout1 + aesdeclast @offset[2],$inout2 + aesdeclast @offset[3],$inout3 + ret +.size __ocb_decrypt4,.-__ocb_decrypt4 + +.type __ocb_decrypt1,\@abi-omnipotent +.align 32 +__ocb_decrypt1: + pxor @offset[5],$inout5 # offset_i + pxor $rndkey0l,$inout5 # offset_i ^ round[0] + pxor $inout5,$inout0 # input ^ round[0] ^ offset_i + $movkey 32($key_),$rndkey0 + + aesdec $rndkey1,$inout0 + $movkey 48($key_),$rndkey1 + pxor $rndkey0l,$inout5 # offset_i ^ round[last] + + aesdec $rndkey0,$inout0 + $movkey 64($key_),$rndkey0 + jmp .Locb_dec_loop1 + +.align 32 +.Locb_dec_loop1: + aesdec $rndkey1,$inout0 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + + aesdec $rndkey0,$inout0 + $movkey -16($key,%rax),$rndkey0 + jnz .Locb_dec_loop1 + + aesdec $rndkey1,$inout0 + $movkey 16($key_),$rndkey1 # redundant in tail + mov %r10,%rax # restore twisted rounds + + aesdeclast $inout5,$inout0 + ret +.size __ocb_decrypt1,.-__ocb_decrypt1 +___ } }} ######################################################################## @@ -2717,7 +3650,6 @@ ___ { my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15)); -my $inp_=$key_; $code.=<<___; .globl ${PREFIX}_cbc_encrypt @@ -2799,7 +3731,7 @@ $code.=<<___; jmp .Lcbc_ret .align 16 .Lcbc_decrypt_bulk: - lea (%rsp),%rax + lea (%rsp),%r11 # frame pointer push %rbp sub \$$frame_size,%rsp and \$-16,%rsp # Linux kernel stack can be incorrectly seeded @@ -2817,8 +3749,11 @@ $code.=<<___ if ($win64); movaps %xmm15,0xa0(%rsp) .Lcbc_decrypt_body: ___ + +my $inp_=$key_="%rbp"; # reassign $key_ + $code.=<<___; - lea -8(%rax),%rbp + mov $key,$key_ # [re-]backup $key [after reassignment] movups ($ivp),$iv mov $rnds_,$rounds cmp \$0x50,$len @@ -2858,7 +3793,7 @@ $code.=<<___; pxor $rndkey0,$inout1 $movkey 0x10-0x70($key),$rndkey1 pxor $rndkey0,$inout2 - xor $inp_,$inp_ + mov \$-1,$inp_ cmp \$0x70,$len # is there at least 0x60 bytes ahead? 
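Stepping back to the OCB routines that close just above: the bsf/shl/movdqu sequences there compute the per-block offsets of OCB (RFC 7253). Each block i contributes L[ntz(i)], where ntz(i) is the number of trailing zero bits of the 1-based block number (bsf) and L is a table of 16-byte entries, hence the shl $4 to turn an index into a byte offset. A minimal C sketch of that offset schedule, with a hypothetical helper name, a 64-bit word standing in for the 128-bit XMM value, and the GCC/Clang builtin standing in for bsf:

    #include <stdint.h>

    /* Offset_i = Offset_{i-1} XOR L[ntz(i)] -- the update behind the
     * "bsf ...; shl $4; movdqu ($L_p,...)" sequences above.  A 64-bit
     * lane stands in for the 128-bit XMM offset value. */
    static uint64_t next_ocb_offset(uint64_t offset, const uint64_t *L,
                                    uint64_t block_num /* 1-based */) {
      unsigned ntz = (unsigned)__builtin_ctzll(block_num);  /* bsf */
      return offset ^ L[ntz];                               /* pxor */
    }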
pxor $rndkey0,$inout3 pxor $rndkey0,$inout4 @@ -2874,8 +3809,8 @@ $code.=<<___; aesdec $rndkey1,$inout4 aesdec $rndkey1,$inout5 aesdec $rndkey1,$inout6 - setnc ${inp_}b - shl \$7,$inp_ + adc \$0,$inp_ + and \$128,$inp_ aesdec $rndkey1,$inout7 add $inp,$inp_ $movkey 0x30-0x70($key),$rndkey1 @@ -3239,8 +4174,8 @@ $code.=<<___ if ($win64); movaps %xmm0,0xa0(%rsp) ___ $code.=<<___; - lea (%rbp),%rsp - pop %rbp + mov -8(%r11),%rbp + lea (%r11),%rsp .Lcbc_ret: ret .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt @@ -3307,7 +4242,7 @@ ___ # Vinodh Gopal <vinodh.gopal@intel.com> # Kahraman Akdemir # -# Agressively optimized in respect to aeskeygenassist's critical path +# Aggressively optimized in respect to aeskeygenassist's critical path # and is contained in %xmm0-5 to meet Win64 ABI requirement. # # int ${PREFIX}_set_encrypt_key(const unsigned char *inp, @@ -3811,14 +4746,76 @@ ctr_xts_se_handler: cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail - mov 160($context),%rax # pull context->Rbp - lea -0xa0(%rax),%rsi # %xmm save area + mov 208($context),%rax # pull context->R11 + + lea -0xa8(%rax),%rsi # %xmm save area lea 512($context),%rdi # & context.Xmm6 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq - jmp .Lcommon_rbp_tail + mov -8(%rax),%rbp # restore saved %rbp + mov %rbp,160($context) # restore context->Rbp + jmp .Lcommon_seh_tail .size ctr_xts_se_handler,.-ctr_xts_se_handler + +.type ocb_se_handler,\@abi-omnipotent +.align 16 +ocb_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue lable + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 8(%r11),%r10d # HandlerData[2] + lea (%rsi,%r10),%r10 + cmp %r10,%rbx # context->Rip>=pop label + jae .Locb_no_xmm + + mov 152($context),%rax # pull context->Rsp + + lea (%rax),%rsi # %xmm save area + lea 512($context),%rdi # & context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0xa0+0x28(%rax),%rax + +.Locb_no_xmm: + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + + jmp .Lcommon_seh_tail +.size ocb_se_handler,.-ocb_se_handler ___ $code.=<<___; .type cbc_se_handler,\@abi-omnipotent @@ -3842,9 +4839,13 @@ cbc_se_handler: cmp %r10,%rbx # context->Rip<"prologue" label jb .Lcommon_seh_tail + mov 120($context),%rax # pull context->Rax + lea .Lcbc_decrypt_body(%rip),%r10 cmp %r10,%rbx # context->Rip<cbc_decrypt_body - jb .Lrestore_cbc_rax + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp lea .Lcbc_ret(%rip),%r10 cmp %r10,%rbx # context->Rip>="epilogue" label @@ -3855,15 +4856,10 @@ cbc_se_handler: mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq -.Lcommon_rbp_tail: - mov 160($context),%rax # pull context->Rbp - mov 
(%rax),%rbp # restore saved %rbp - lea 8(%rax),%rax # adjust stack pointer - mov %rbp,160($context) # restore context->Rbp - jmp .Lcommon_seh_tail + mov 208($context),%rax # pull context->R11 -.Lrestore_cbc_rax: - mov 120($context),%rax + mov -8(%rax),%rbp # restore saved %rbp + mov %rbp,160($context) # restore context->Rbp .Lcommon_seh_tail: mov 8(%rax),%rdi @@ -3932,6 +4928,14 @@ $code.=<<___ if ($PREFIX eq "aesni"); .rva .LSEH_begin_aesni_xts_decrypt .rva .LSEH_end_aesni_xts_decrypt .rva .LSEH_info_xts_dec + + .rva .LSEH_begin_aesni_ocb_encrypt + .rva .LSEH_end_aesni_ocb_encrypt + .rva .LSEH_info_ocb_enc + + .rva .LSEH_begin_aesni_ocb_decrypt + .rva .LSEH_end_aesni_ocb_decrypt + .rva .LSEH_info_ocb_dec ___ $code.=<<___; .rva .LSEH_begin_${PREFIX}_cbc_encrypt @@ -3973,6 +4977,18 @@ $code.=<<___ if ($PREFIX eq "aesni"); .byte 9,0,0,0 .rva ctr_xts_se_handler .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] +.LSEH_info_ocb_enc: + .byte 9,0,0,0 + .rva ocb_se_handler + .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[] + .rva .Locb_enc_pop + .long 0 +.LSEH_info_ocb_dec: + .byte 9,0,0,0 + .rva ocb_se_handler + .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[] + .rva .Locb_dec_pop + .long 0 ___ $code.=<<___; .LSEH_info_cbc: diff --git a/src/crypto/aes/asm/aesv8-armx.pl b/src/crypto/aes/asm/aesv8-armx.pl index f6d0dabd..23ed77c1 100644 --- a/src/crypto/aes/asm/aesv8-armx.pl +++ b/src/crypto/aes/asm/aesv8-armx.pl @@ -957,21 +957,21 @@ if ($flavour =~ /64/) { ######## 64-bit code $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o && sprintf "vtbl.8 d%d,{q%d},d%d\n\t". - "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; + "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; } sub unvdup32 { my $arg=shift; $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && - sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; + sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; } sub unvmov32 { my $arg=shift; $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o && - sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3; + sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3; } foreach(split("\n",$code)) { diff --git a/src/crypto/aes/asm/bsaes-armv7.pl b/src/crypto/aes/asm/bsaes-armv7.pl index 37613e2c..d645de4c 100644 --- a/src/crypto/aes/asm/bsaes-armv7.pl +++ b/src/crypto/aes/asm/bsaes-armv7.pl @@ -84,7 +84,7 @@ my @s=@_[12..15]; sub InBasisChange { # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb -# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb +# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb my @b=@_[0..7]; $code.=<<___; veor @b[2], @b[2], @b[1] diff --git a/src/crypto/aes/asm/bsaes-x86_64.pl b/src/crypto/aes/asm/bsaes-x86_64.pl index 8258f2f4..9a8055ef 100644 --- a/src/crypto/aes/asm/bsaes-x86_64.pl +++ b/src/crypto/aes/asm/bsaes-x86_64.pl @@ -41,6 +41,7 @@ # Nehalem(**) 7.63 6.88 +11% # Atom 17.1 16.4 +4% # Silvermont - 12.9 +# Goldmont - 8.85 # # (*) Comparison is not completely fair, because "this" is ECB, # i.e. no extra processing such as counter values calculation @@ -80,6 +81,7 @@ # Nehalem 7.80 # Atom 17.9 # Silvermont 14.0 +# Goldmont 10.2 # # November 2011. 
# @@ -122,7 +124,7 @@ my @s=@_[12..15]; sub InBasisChange { # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb -# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb +# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb my @b=@_[0..7]; $code.=<<___; pxor @b[6], @b[5] @@ -372,7 +374,7 @@ $code.=<<___; pxor @s[0], @t[3] pxor @s[1], @t[2] pxor @s[2], @t[1] - pxor @s[3], @t[0] + pxor @s[3], @t[0] #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 @@ -1325,7 +1327,7 @@ $code.=<<___; cmp %rax, %rbp jb .Lecb_enc_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -1338,17 +1340,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lecb_enc_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov -16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lecb_enc_epilogue: ret .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks @@ -1527,7 +1529,7 @@ $code.=<<___; cmp %rax, %rbp jb .Lecb_dec_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -1540,17 +1542,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lecb_dec_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov -16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lecb_dec_epilogue: ret .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks @@ -1817,7 +1819,7 @@ $code.=<<___; cmp %rax, %rbp ja .Lcbc_dec_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -1830,17 +1832,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lcbc_dec_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov -16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lcbc_dec_epilogue: ret .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt @@ -2049,7 +2051,7 @@ $code.=<<___; cmp %rax, %rbp ja .Lctr_enc_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -2062,17 +2064,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lctr_enc_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov 
-16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lctr_enc_epilogue: ret .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks @@ -2439,7 +2441,7 @@ $code.=<<___; cmp %rax, %rbp ja .Lxts_enc_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -2452,17 +2454,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lxts_enc_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov -16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lxts_enc_epilogue: ret .size bsaes_xts_encrypt,.-bsaes_xts_encrypt @@ -2846,7 +2848,7 @@ $code.=<<___; cmp %rax, %rbp ja .Lxts_dec_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -2859,17 +2861,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lxts_dec_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov -16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lxts_dec_epilogue: ret .size bsaes_xts_decrypt,.-bsaes_xts_decrypt @@ -2965,31 +2967,34 @@ se_handler: mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<prologue label - jb .Lin_prologue - - mov 152($context),%rax # pull context->Rsp + cmp %r10,%rbx # context->Rip<=prologue label + jbe .Lin_prologue mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue + mov 8(%r11),%r10d # HandlerData[2] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=tail label + jae .Lin_tail + mov 160($context),%rax # pull context->Rbp lea 0x40(%rax),%rsi # %xmm save area lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq - lea 0xa0(%rax),%rax # adjust stack pointer - - mov 0x70(%rax),%rbp - mov 0x68(%rax),%rbx - mov 0x60(%rax),%r12 - mov 0x58(%rax),%r13 - mov 0x50(%rax),%r14 - mov 0x48(%rax),%r15 - lea 0x78(%rax),%rax # adjust stack pointer + lea 0xa0+0x78(%rax),%rax # adjust stack pointer + +.Lin_tail: + mov -48(%rax),%rbp + mov -40(%rax),%rbx + mov -32(%rax),%r12 + mov -24(%rax),%r13 + mov -16(%rax),%r14 + mov -8(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 @@ -3070,28 +3075,40 @@ $code.=<<___ if ($ecb); .byte 9,0,0,0 .rva se_handler .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] + .rva .Lecb_enc_tail + .long 0 .Lecb_dec_info: .byte 9,0,0,0 .rva se_handler .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] + .rva .Lecb_dec_tail + .long 0 ___ $code.=<<___; .Lcbc_dec_info: .byte 9,0,0,0 .rva se_handler .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] + .rva .Lcbc_dec_tail + .long 0 .Lctr_enc_info: .byte 9,0,0,0 .rva 
se_handler .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] + .rva .Lctr_enc_tail + .long 0 .Lxts_enc_info: .byte 9,0,0,0 .rva se_handler .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] + .rva .Lxts_enc_tail + .long 0 .Lxts_dec_info: .byte 9,0,0,0 .rva se_handler .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] + .rva .Lxts_dec_tail + .long 0 ___ } diff --git a/src/crypto/aes/asm/vpaes-x86.pl b/src/crypto/aes/asm/vpaes-x86.pl index 4fcd5615..ebf90e7e 100644 --- a/src/crypto/aes/asm/vpaes-x86.pl +++ b/src/crypto/aes/asm/vpaes-x86.pl @@ -438,7 +438,7 @@ $k_dsbo=0x2c0; # decryption sbox final output ## &set_label("schedule_192",16); &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned) - &call ("_vpaes_schedule_transform"); # input transform + &call ("_vpaes_schedule_transform"); # input transform &movdqa ("xmm6","xmm0"); # save short part &pxor ("xmm4","xmm4"); # clear 4 &movhlps("xmm6","xmm4"); # clobber low side with zeros @@ -469,7 +469,7 @@ $k_dsbo=0x2c0; # decryption sbox final output ## &set_label("schedule_256",16); &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned) - &call ("_vpaes_schedule_transform"); # input transform + &call ("_vpaes_schedule_transform"); # input transform &mov ($round,7); &set_label("loop_schedule_256"); @@ -480,7 +480,7 @@ $k_dsbo=0x2c0; # decryption sbox final output &call ("_vpaes_schedule_round"); &dec ($round); &jz (&label("schedule_mangle_last")); - &call ("_vpaes_schedule_mangle"); + &call ("_vpaes_schedule_mangle"); # low round. swap xmm7 and xmm6 &pshufd ("xmm0","xmm0",0xFF); @@ -603,7 +603,7 @@ $k_dsbo=0x2c0; # decryption sbox final output # subbyte &movdqa ("xmm4",&QWP($k_s0F,$const)); &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j - &movdqa ("xmm1","xmm4"); + &movdqa ("xmm1","xmm4"); &pandn ("xmm1","xmm0"); &psrld ("xmm1",4); # 1 = i &pand ("xmm0","xmm4"); # 0 = k diff --git a/src/crypto/aes/asm/vpaes-x86_64.pl b/src/crypto/aes/asm/vpaes-x86_64.pl index 3f99e368..7a24e0d6 100644 --- a/src/crypto/aes/asm/vpaes-x86_64.pl +++ b/src/crypto/aes/asm/vpaes-x86_64.pl @@ -31,6 +31,7 @@ # Nehalem 29.6/40.3/14.6 10.0/11.8 # Atom 57.3/74.2/32.1 60.9/77.2(***) # Silvermont 52.7/64.0/19.5 48.8/60.8(***) +# Goldmont 38.9/49.0/17.8 10.6/12.6 # # (*) "Hyper-threading" in the context refers rather to cache shared # among multiple cores, than to specifically Intel HTT. As vast @@ -164,7 +165,7 @@ _vpaes_encrypt_core: pshufb %xmm1, %xmm0 ret .size _vpaes_encrypt_core,.-_vpaes_encrypt_core - + ## ## Decryption core ## @@ -325,7 +326,7 @@ _vpaes_schedule_core: ## .Lschedule_128: mov \$10, %esi - + .Loop_schedule_128: call _vpaes_schedule_round dec %rsi @@ -359,7 +360,7 @@ _vpaes_schedule_core: .Loop_schedule_192: call _vpaes_schedule_round - palignr \$8,%xmm6,%xmm0 + palignr \$8,%xmm6,%xmm0 call _vpaes_schedule_mangle # save key n call _vpaes_schedule_192_smear call _vpaes_schedule_mangle # save key n+1 @@ -385,7 +386,7 @@ _vpaes_schedule_core: movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) call _vpaes_schedule_transform # input transform mov \$7, %esi - + .Loop_schedule_256: call _vpaes_schedule_mangle # output low result movdqa %xmm0, %xmm6 # save cur_lo in xmm6 @@ -394,7 +395,7 @@ _vpaes_schedule_core: call _vpaes_schedule_round dec %rsi jz .Lschedule_mangle_last - call _vpaes_schedule_mangle + call _vpaes_schedule_mangle # low round. 
swap xmm7 and xmm6 pshufd \$0xFF, %xmm0, %xmm0 @@ -402,10 +403,10 @@ _vpaes_schedule_core: movdqa %xmm6, %xmm7 call _vpaes_schedule_low_round movdqa %xmm5, %xmm7 - + jmp .Loop_schedule_256 - + ## ## .aes_schedule_mangle_last ## @@ -504,9 +505,9 @@ _vpaes_schedule_round: # rotate pshufd \$0xFF, %xmm0, %xmm0 palignr \$1, %xmm0, %xmm0 - + # fall through... - + # low round: same as high round, but no rotation and no rcon. _vpaes_schedule_low_round: # smear xmm7 @@ -545,7 +546,7 @@ _vpaes_schedule_low_round: pxor %xmm4, %xmm0 # 0 = sbox output # add in smeared stuff - pxor %xmm7, %xmm0 + pxor %xmm7, %xmm0 movdqa %xmm0, %xmm7 ret .size _vpaes_schedule_round,.-_vpaes_schedule_round diff --git a/src/crypto/asn1/CMakeLists.txt b/src/crypto/asn1/CMakeLists.txt index 25d8ba22..cd1ee8c2 100644 --- a/src/crypto/asn1/CMakeLists.txt +++ b/src/crypto/asn1/CMakeLists.txt @@ -35,6 +35,7 @@ add_library( tasn_new.c tasn_typ.c tasn_utl.c + time_support.c x_bignum.c x_long.c ) diff --git a/src/crypto/asn1/a_gentm.c b/src/crypto/asn1/a_gentm.c index 2f298689..d130cdf8 100644 --- a/src/crypto/asn1/a_gentm.c +++ b/src/crypto/asn1/a_gentm.c @@ -61,7 +61,6 @@ #include <openssl/err.h> #include <openssl/mem.h> -#include <openssl/time_support.h> #include "asn1_locl.h" diff --git a/src/crypto/asn1/a_time.c b/src/crypto/asn1/a_time.c index a12b38ff..4b584297 100644 --- a/src/crypto/asn1/a_time.c +++ b/src/crypto/asn1/a_time.c @@ -63,7 +63,6 @@ #include <openssl/buf.h> #include <openssl/err.h> #include <openssl/mem.h> -#include <openssl/time_support.h> #include "asn1_locl.h" diff --git a/src/crypto/asn1/a_utctm.c b/src/crypto/asn1/a_utctm.c index 3b9d2570..193b83f8 100644 --- a/src/crypto/asn1/a_utctm.c +++ b/src/crypto/asn1/a_utctm.c @@ -61,7 +61,6 @@ #include <openssl/err.h> #include <openssl/mem.h> -#include <openssl/time_support.h> #include "asn1_locl.h" diff --git a/src/crypto/asn1/asn1_locl.h b/src/crypto/asn1/asn1_locl.h index 982bfd60..ce8146bf 100644 --- a/src/crypto/asn1/asn1_locl.h +++ b/src/crypto/asn1/asn1_locl.h @@ -57,7 +57,42 @@ * */ +#ifndef OPENSSL_HEADER_ASN1_ASN1_LOCL_H +#define OPENSSL_HEADER_ASN1_ASN1_LOCL_H + +#include <time.h> + +#include <openssl/asn1.h> + +#if defined(__cplusplus) +extern "C" { +#endif + + +/* Wrapper functions for time functions. */ + +/* OPENSSL_gmtime wraps |gmtime_r|. See the manual page for that function. */ +struct tm *OPENSSL_gmtime(const time_t *timer, struct tm *result); + +/* OPENSSL_gmtime_adj updates |tm| by adding |offset_day| days and |offset_sec| + * seconds. */ +int OPENSSL_gmtime_adj(struct tm *tm, int offset_day, long offset_sec); + +/* OPENSSL_gmtime_diff calculates the difference between |from| and |to| and + * outputs the difference as a number of days and seconds in |*out_days| and + * |*out_secs|. 
*/ +int OPENSSL_gmtime_diff(int *out_days, int *out_secs, const struct tm *from, + const struct tm *to); + + /* Internal ASN1 structures and functions: not for application use */ int asn1_utctime_to_tm(struct tm *tm, const ASN1_UTCTIME *d); int asn1_generalizedtime_to_tm(struct tm *tm, const ASN1_GENERALIZEDTIME *d); + + +#if defined(__cplusplus) +} /* extern C */ +#endif + +#endif /* OPENSSL_HEADER_ASN1_ASN1_LOCL_H */ diff --git a/src/crypto/asn1/tasn_dec.c b/src/crypto/asn1/tasn_dec.c index 40778a84..bf008af1 100644 --- a/src/crypto/asn1/tasn_dec.c +++ b/src/crypto/asn1/tasn_dec.c @@ -180,6 +180,7 @@ int ASN1_item_ex_d2i(ASN1_VALUE **pval, const unsigned char **in, long len, int ret = 0; ASN1_VALUE **pchptr, *ptmpval; int combine = aclass & ASN1_TFLG_COMBINE; + aclass &= ~ASN1_TFLG_COMBINE; if (!pval) return 0; if (aux && aux->asn1_cb) @@ -667,6 +668,7 @@ static int asn1_template_noexp_d2i(ASN1_VALUE **val, } len -= p - q; if (!sk_ASN1_VALUE_push((STACK_OF(ASN1_VALUE) *)*val, skfield)) { + ASN1_item_ex_free(&skfield, ASN1_ITEM_ptr(tt->item)); OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); goto err; } diff --git a/src/crypto/asn1/tasn_new.c b/src/crypto/asn1/tasn_new.c index 053b732b..10cf954f 100644 --- a/src/crypto/asn1/tasn_new.c +++ b/src/crypto/asn1/tasn_new.c @@ -160,7 +160,7 @@ static int asn1_item_ex_combine_new(ASN1_VALUE **pval, const ASN1_ITEM *it, } asn1_set_choice_selector(pval, -1, it); if (asn1_cb && !asn1_cb(ASN1_OP_NEW_POST, pval, it, NULL)) - goto auxerr; + goto auxerr2; break; case ASN1_ITYPE_NDEF_SEQUENCE: @@ -188,10 +188,10 @@ static int asn1_item_ex_combine_new(ASN1_VALUE **pval, const ASN1_ITEM *it, for (i = 0, tt = it->templates; i < it->tcount; tt++, i++) { pseqval = asn1_get_field_ptr(pval, tt); if (!ASN1_template_new(pseqval, tt)) - goto memerr; + goto memerr2; } if (asn1_cb && !asn1_cb(ASN1_OP_NEW_POST, pval, it, NULL)) - goto auxerr; + goto auxerr2; break; } #ifdef CRYPTO_MDEBUG @@ -200,18 +200,20 @@ static int asn1_item_ex_combine_new(ASN1_VALUE **pval, const ASN1_ITEM *it, #endif return 1; + memerr2: + ASN1_item_ex_free(pval, it); memerr: OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - ASN1_item_ex_free(pval, it); #ifdef CRYPTO_MDEBUG if (it->sname) CRYPTO_pop_info(); #endif return 0; + auxerr2: + ASN1_item_ex_free(pval, it); auxerr: OPENSSL_PUT_ERROR(ASN1, ASN1_R_AUX_ERROR); - ASN1_item_ex_free(pval, it); #ifdef CRYPTO_MDEBUG if (it->sname) CRYPTO_pop_info(); diff --git a/src/crypto/time_support.c b/src/crypto/asn1/time_support.c index ae0f4963..194dc3a7 100644 --- a/src/crypto/time_support.c +++ b/src/crypto/asn1/time_support.c @@ -59,7 +59,7 @@ #define _POSIX_C_SOURCE 201410L /* for gmtime_r */ #endif -#include <openssl/time_support.h> +#include "asn1_locl.h" #include <time.h> diff --git a/src/crypto/bn/asm/armv4-mont.pl b/src/crypto/bn/asm/armv4-mont.pl index cad59551..d7298d2d 100644 --- a/src/crypto/bn/asm/armv4-mont.pl +++ b/src/crypto/bn/asm/armv4-mont.pl @@ -16,7 +16,7 @@ # [depending on key length, less for longer keys] on ARM920T, and # +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code # base and compiler generated code with in-lined umull and even umlal -# instructions. The latter means that this code didn't really have an +# instructions. The latter means that this code didn't really have an # "advantage" of utilizing some "secret" instruction. 
# # The code is interoperable with Thumb ISA and is rather compact, less diff --git a/src/crypto/bn/asm/bn-586.pl b/src/crypto/bn/asm/bn-586.pl index 096bb9c9..ccc94519 100644 --- a/src/crypto/bn/asm/bn-586.pl +++ b/src/crypto/bn/asm/bn-586.pl @@ -47,7 +47,7 @@ sub bn_mul_add_words &movd("mm0",&wparam(3)); # mm0 = w &pxor("mm1","mm1"); # mm1 = carry_in &jmp(&label("maw_sse2_entry")); - + &set_label("maw_sse2_unrolled",16); &movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0] &paddq("mm1","mm3"); # mm1 = carry_in + r[0] @@ -668,20 +668,20 @@ sub bn_sub_part_words &adc($c,0); &mov(&DWP($i*4,$r,"",0),$tmp1); # *r } - + &comment(""); &add($b,32); &add($r,32); &sub($num,8); &jnz(&label("pw_neg_loop")); - + &set_label("pw_neg_finish",0); &mov($tmp2,&wparam(4)); # get dl &mov($num,0); &sub($num,$tmp2); &and($num,7); &jz(&label("pw_end")); - + for ($i=0; $i<7; $i++) { &comment("dl<0 Tail Round $i"); @@ -698,9 +698,9 @@ sub bn_sub_part_words } &jmp(&label("pw_end")); - + &set_label("pw_pos",0); - + &and($num,0xfffffff8); # num / 8 &jz(&label("pw_pos_finish")); @@ -715,18 +715,18 @@ sub bn_sub_part_words &mov(&DWP($i*4,$r,"",0),$tmp1); # *r &jnc(&label("pw_nc".$i)); } - + &comment(""); &add($a,32); &add($r,32); &sub($num,8); &jnz(&label("pw_pos_loop")); - + &set_label("pw_pos_finish",0); &mov($num,&wparam(4)); # get dl &and($num,7); &jz(&label("pw_end")); - + for ($i=0; $i<7; $i++) { &comment("dl>0 Tail Round $i"); @@ -747,17 +747,17 @@ sub bn_sub_part_words &mov(&DWP($i*4,$r,"",0),$tmp1); # *r &set_label("pw_nc".$i,0); } - + &comment(""); &add($a,32); &add($r,32); &sub($num,8); &jnz(&label("pw_nc_loop")); - + &mov($num,&wparam(4)); # get dl &and($num,7); &jz(&label("pw_nc_end")); - + for ($i=0; $i<7; $i++) { &mov($tmp1,&DWP($i*4,$a,"",0)); # *a diff --git a/src/crypto/bn/asm/co-586.pl b/src/crypto/bn/asm/co-586.pl index ec3ea343..c63e5622 100644 --- a/src/crypto/bn/asm/co-586.pl +++ b/src/crypto/bn/asm/co-586.pl @@ -41,7 +41,7 @@ sub mul_add_c &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b ### &adc($c2,0); - # is pos > 1, it means it is the last loop + # is pos > 1, it means it is the last loop &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[]; &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a } @@ -70,7 +70,7 @@ sub sqr_add_c &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb); ### &adc($c2,0); - # is pos > 1, it means it is the last loop + # is pos > 1, it means it is the last loop &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[]; &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b } @@ -121,7 +121,7 @@ sub bn_mul_comba $c2="ebp"; $a="esi"; $b="edi"; - + $as=0; $ae=0; $bs=0; @@ -136,9 +136,9 @@ sub bn_mul_comba &push("ebx"); &xor($c0,$c0); - &mov("eax",&DWP(0,$a,"",0)); # load the first word + &mov("eax",&DWP(0,$a,"",0)); # load the first word &xor($c1,$c1); - &mov("edx",&DWP(0,$b,"",0)); # load the first second + &mov("edx",&DWP(0,$b,"",0)); # load the first second for ($i=0; $i<$tot; $i++) { @@ -146,7 +146,7 @@ sub bn_mul_comba $bi=$bs; $end=$be+1; - &comment("################## Calculate word $i"); + &comment("################## Calculate word $i"); for ($j=$bs; $j<$end; $j++) { diff --git a/src/crypto/bn/asm/rsaz-avx2.pl b/src/crypto/bn/asm/rsaz-avx2.pl index b8e830e2..5562d691 100755 --- a/src/crypto/bn/asm/rsaz-avx2.pl +++ b/src/crypto/bn/asm/rsaz-avx2.pl @@ -145,13 +145,21 @@ $code.=<<___; .type rsaz_1024_sqr_avx2,\@function,5 .align 64 rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2 +.cfi_startproc lea (%rsp), %rax 
+.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 vzeroupper ___ $code.=<<___ if ($win64); @@ -170,6 +178,7 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov %rax,%rbp +.cfi_def_cfa_register %rbp mov %rdx, $np # reassigned argument sub \$$FrameSize, %rsp mov $np, $tmp @@ -359,7 +368,7 @@ $code.=<<___; vpaddq $TEMP1, $ACC1, $ACC1 vpmuludq 32*7-128($aap), $B2, $ACC2 vpbroadcastq 32*5-128($tpa), $B2 - vpaddq 32*11-448($tp1), $ACC2, $ACC2 + vpaddq 32*11-448($tp1), $ACC2, $ACC2 vmovdqu $ACC6, 32*6-192($tp0) vmovdqu $ACC7, 32*7-192($tp0) @@ -418,7 +427,7 @@ $code.=<<___; vmovdqu $ACC7, 32*16-448($tp1) lea 8($tp1), $tp1 - dec $i + dec $i jnz .LOOP_SQR_1024 ___ $ZERO = $ACC9; @@ -763,7 +772,7 @@ $code.=<<___; vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 vpaddq $TEMP3, $ACC7, $ACC7 vpaddq $TEMP4, $ACC8, $ACC8 - + vpsrlq \$29, $ACC4, $TEMP1 vpand $AND_MASK, $ACC4, $ACC4 vpsrlq \$29, $ACC5, $TEMP2 @@ -802,6 +811,7 @@ $code.=<<___; vzeroall mov %rbp, %rax +.cfi_def_cfa_register %rax ___ $code.=<<___ if ($win64); .Lsqr_1024_in_tail: @@ -818,14 +828,22 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov -48(%rax),%r15 +.cfi_restore %r15 mov -40(%rax),%r14 +.cfi_restore %r14 mov -32(%rax),%r13 +.cfi_restore %r13 mov -24(%rax),%r12 +.cfi_restore %r12 mov -16(%rax),%rbp +.cfi_restore %rbp mov -8(%rax),%rbx +.cfi_restore %rbx lea (%rax),%rsp # restore %rsp +.cfi_def_cfa_register %rsp .Lsqr_1024_epilogue: ret +.cfi_endproc .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 ___ } @@ -878,13 +896,21 @@ $code.=<<___; .type rsaz_1024_mul_avx2,\@function,5 .align 64 rsaz_1024_mul_avx2: +.cfi_startproc lea (%rsp), %rax +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 ___ $code.=<<___ if ($win64); vzeroupper @@ -903,6 +929,7 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov %rax,%rbp +.cfi_def_cfa_register %rbp vzeroall mov %rdx, $bp # reassigned argument sub \$64,%rsp @@ -1429,13 +1456,14 @@ $code.=<<___; vpaddq $TEMP4, $ACC8, $ACC8 vmovdqu $ACC4, 128-128($rp) - vmovdqu $ACC5, 160-128($rp) + vmovdqu $ACC5, 160-128($rp) vmovdqu $ACC6, 192-128($rp) vmovdqu $ACC7, 224-128($rp) vmovdqu $ACC8, 256-128($rp) vzeroupper mov %rbp, %rax +.cfi_def_cfa_register %rax ___ $code.=<<___ if ($win64); .Lmul_1024_in_tail: @@ -1452,14 +1480,22 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov -48(%rax),%r15 +.cfi_restore %r15 mov -40(%rax),%r14 +.cfi_restore %r14 mov -32(%rax),%r13 +.cfi_restore %r13 mov -24(%rax),%r12 +.cfi_restore %r12 mov -16(%rax),%rbp +.cfi_restore %rbp mov -8(%rax),%rbx +.cfi_restore %rbx lea (%rax),%rsp # restore %rsp +.cfi_def_cfa_register %rsp .Lmul_1024_epilogue: ret +.cfi_endproc .size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 ___ } @@ -1578,8 +1614,10 @@ rsaz_1024_scatter5_avx2: .type rsaz_1024_gather5_avx2,\@abi-omnipotent .align 32 rsaz_1024_gather5_avx2: +.cfi_startproc vzeroupper mov %rsp,%r11 +.cfi_def_cfa_register %r11 ___ $code.=<<___ if ($win64); lea -0x88(%rsp),%rax @@ -1717,11 +1755,13 @@ $code.=<<___ if ($win64); movaps -0x38(%r11),%xmm13 movaps -0x28(%r11),%xmm14 movaps -0x18(%r11),%xmm15 -.LSEH_end_rsaz_1024_gather5: ___ $code.=<<___; lea (%r11),%rsp +.cfi_def_cfa_register %rsp ret +.cfi_endproc +.LSEH_end_rsaz_1024_gather5: .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 ___ } diff --git a/src/crypto/bn/asm/x86-mont.pl 
b/src/crypto/bn/asm/x86-mont.pl index 4b5d05db..57fbf10b 100644..100755 --- a/src/crypto/bn/asm/x86-mont.pl +++ b/src/crypto/bn/asm/x86-mont.pl @@ -32,7 +32,7 @@ require "x86asm.pl"; $output = pop; open STDOUT,">$output"; - + &asm_init($ARGV[0],$0); $sse2=0; @@ -66,33 +66,57 @@ $frame=32; # size of above frame rounded up to 16n &lea ("esi",&wparam(0)); # put aside pointer to argument block &lea ("edx",&wparam(1)); # load ap - &mov ("ebp","esp"); # saved stack pointer! &add ("edi",2); # extra two words on top of tp &neg ("edi"); - &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2)) + &lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2)) &neg ("edi"); # minimize cache contention by arraning 2K window between stack # pointer and ap argument [np is also position sensitive vector, # but it's assumed to be near ap, as it's allocated at ~same # time]. - &mov ("eax","esp"); + &mov ("eax","ebp"); &sub ("eax","edx"); &and ("eax",2047); - &sub ("esp","eax"); # this aligns sp and ap modulo 2048 + &sub ("ebp","eax"); # this aligns sp and ap modulo 2048 - &xor ("edx","esp"); + &xor ("edx","ebp"); &and ("edx",2048); &xor ("edx",2048); - &sub ("esp","edx"); # this splits them apart modulo 4096 - - &and ("esp",-64); # align to cache line + &sub ("ebp","edx"); # this splits them apart modulo 4096 + + &and ("ebp",-64); # align to cache line + + # An OS-agnostic version of __chkstk. + # + # Some OSes (Windows) insist on stack being "wired" to + # physical memory in strictly sequential manner, i.e. if stack + # allocation spans two pages, then reference to farmost one can + # be punishable by SEGV. But page walking can do good even on + # other OSes, because it guarantees that villain thread hits + # the guard page before it can make damage to innocent one... + &mov ("eax","esp"); + &sub ("eax","ebp"); + &and ("eax",-4096); + &mov ("edx","esp"); # saved stack pointer! + &lea ("esp",&DWP(0,"ebp","eax")); + &mov ("eax",&DWP(0,"esp")); + &cmp ("esp","ebp"); + &ja (&label("page_walk")); + &jmp (&label("page_walk_done")); + +&set_label("page_walk",16); + &lea ("esp",&DWP(-4096,"esp")); + &mov ("eax",&DWP(0,"esp")); + &cmp ("esp","ebp"); + &ja (&label("page_walk")); +&set_label("page_walk_done"); ################################# load argument block... &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp - &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np + &mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0 #&mov ("edi",&DWP(5*4,"esi"));# int num @@ -100,11 +124,11 @@ $frame=32; # size of above frame rounded up to 16n &mov ($_rp,"eax"); # ... save a copy of argument block &mov ($_ap,"ebx"); &mov ($_bp,"ecx"); - &mov ($_np,"edx"); + &mov ($_np,"ebp"); &mov ($_n0,"esi"); &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling #&mov ($_num,$num); # redundant as $num is not reused - &mov ($_sp,"ebp"); # saved stack pointer! + &mov ($_sp,"edx"); # saved stack pointer! if($sse2) { $acc0="mm0"; # mmx register bank layout @@ -270,7 +294,7 @@ if (0) { &xor ("eax","eax"); # signal "not fast enough [yet]" &jmp (&label("just_leave")); # While the below code provides competitive performance for - # all key lengthes on modern Intel cores, it's still more + # all key lengths on modern Intel cores, it's still more # than 10% slower for 4096-bit key elsewhere:-( "Competitive" # means compared to the original integer-only assembler. 
# 512-bit RSA sign is better by ~40%, but that's about all @@ -573,15 +597,16 @@ $sbit=$num; &jge (&label("sub")); &sbb ("eax",0); # handle upmost overflow bit + &and ($tp,"eax"); + ¬ ("eax"); + &mov ($np,$rp); + &and ($np,"eax"); + &or ($tp,$np); # tp=carry?tp:rp &set_label("copy",16); # copy or in-place refresh - &mov ("edx",&DWP(0,$tp,$num,4)); - &mov ($np,&DWP(0,$rp,$num,4)); - &xor ("edx",$np); # conditional select - &and ("edx","eax"); - &xor ("edx",$np); - &mov (&DWP(0,$tp,$num,4),$j) # zap temporary vector - &mov (&DWP(0,$rp,$num,4),"edx"); # rp[i]=tp[i] + &mov ("eax",&DWP(0,$tp,$num,4)); + &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i] + &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector &dec ($num); &jge (&label("copy")); diff --git a/src/crypto/bn/asm/x86_64-mont.pl b/src/crypto/bn/asm/x86_64-mont.pl index 60e0111a..5775f658 100755 --- a/src/crypto/bn/asm/x86_64-mont.pl +++ b/src/crypto/bn/asm/x86_64-mont.pl @@ -84,6 +84,10 @@ $code=<<___; .type bn_mul_mont,\@function,6 .align 16 bn_mul_mont: +.cfi_startproc + mov ${num}d,${num}d + mov %rsp,%rax +.cfi_def_cfa_register %rax test \$3,${num}d jnz .Lmul_enter cmp \$8,${num}d @@ -102,20 +106,50 @@ $code.=<<___; .align 16 .Lmul_enter: push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 - mov ${num}d,${num}d - lea 2($num),%r10 + neg $num mov %rsp,%r11 - neg %r10 - lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2)) - and \$-1024,%rsp # minimize TLB usage + lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2)) + neg $num # restore $num + and \$-1024,%r10 # minimize TLB usage - mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp + # An OS-agnostic version of __chkstk. + # + # Some OSes (Windows) insist on stack being "wired" to + # physical memory in strictly sequential manner, i.e. if stack + # allocation spans two pages, then reference to farmost one can + # be punishable by SEGV. But page walking can do good even on + # other OSes, because it guarantees that villain thread hits + # the guard page before it can make damage to innocent one... + sub %r10,%r11 + and \$-4096,%r11 + lea (%r10,%r11),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + +.align 16 +.Lmul_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 .Lmul_body: mov $bp,%r12 # reassign $bp ___ @@ -265,36 +299,46 @@ $code.=<<___; mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] mov 8($ap,$i,8),%rax # tp[i+1] lea 1($i),$i # i++ - dec $j # doesn't affect CF! + dec $j # doesnn't affect CF! 
jnz .Lsub sbb \$0,%rax # handle upmost overflow bit xor $i,$i + and %rax,$ap + not %rax + mov $rp,$np + and %rax,$np mov $num,$j # j=num + or $np,$ap # ap=borrow?tp:rp .align 16 .Lcopy: # copy or in-place refresh - mov (%rsp,$i,8),$ap - mov ($rp,$i,8),$np - xor $np,$ap # conditional select: - and %rax,$ap # ((ap ^ np) & %rax) ^ np - xor $np,$ap # ap = borrow?tp:rp + mov ($ap,$i,8),%rax mov $i,(%rsp,$i,8) # zap temporary vector - mov $ap,($rp,$i,8) # rp[i]=tp[i] + mov %rax,($rp,$i,8) # rp[i]=tp[i] lea 1($i),$i sub \$1,$j jnz .Lcopy mov 8(%rsp,$num,8),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 mov \$1,%rax - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul_epilogue: ret +.cfi_endproc .size bn_mul_mont,.-bn_mul_mont ___ {{{ @@ -304,6 +348,10 @@ $code.=<<___; .type bn_mul4x_mont,\@function,6 .align 16 bn_mul4x_mont: +.cfi_startproc + mov ${num}d,${num}d + mov %rsp,%rax +.cfi_def_cfa_register %rax .Lmul4x_enter: ___ $code.=<<___ if ($addx); @@ -313,20 +361,41 @@ $code.=<<___ if ($addx); ___ $code.=<<___; push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 - mov ${num}d,${num}d - lea 4($num),%r10 + neg $num mov %rsp,%r11 - neg %r10 - lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4)) - and \$-1024,%rsp # minimize TLB usage + lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4)) + neg $num # restore + and \$-1024,%r10 # minimize TLB usage - mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp + sub %r10,%r11 + and \$-4096,%r11 + lea (%r10,%r11),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + +.Lmul4x_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 .Lmul4x_body: mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp mov %rdx,%r12 # reassign $bp @@ -633,9 +702,11 @@ ___ my @ri=("%rax","%rdx",$m0,$m1); $code.=<<___; mov 16(%rsp,$num,8),$rp # restore $rp + lea -4($num),$j mov 0(%rsp),@ri[0] # tp[0] + pxor %xmm0,%xmm0 mov 8(%rsp),@ri[1] # tp[1] - shr \$2,$num # num/=4 + shr \$2,$j # j=num/4-1 lea (%rsp),$ap # borrow ap for tp xor $i,$i # i=0 and clear CF! 
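The rewritten .Lcopy loops (here, in the 32-bit x86-mont.pl above, and again in x86_64-mont5.pl below) trade the old per-word XOR-mask select for a single borrow-masked select of the source pointer: the borrow left by the final sbb is stretched into an all-zero or all-ones mask, the mask picks tp or rp once, and the copy itself becomes an unconditional load/store that still runs in constant time. A C sketch of the idea (function and variable names are illustrative, not from the source):

    #include <stddef.h>
    #include <stdint.h>

    /* ap = borrow ? tp : rp, selected without a data-dependent branch;
     * rp[] is then (re)written from ap[] while tp[] is zapped with the
     * loop index, mirroring the asm's "mov $i,(%rsp,$i,8)". */
    static void copy_select(uint64_t *rp, uint64_t *tp, size_t num,
                            uint64_t borrow /* 0 or 1, from sbb */) {
      uint64_t mask = 0 - borrow;            /* 0 or all-ones */
      const uint64_t *ap = (const uint64_t *)(((uintptr_t)tp & mask) |
                                              ((uintptr_t)rp & ~mask));
      for (size_t i = 0; i < num; i++) {
        uint64_t v = ap[i];
        tp[i] = (uint64_t)i;                 /* zap temporary vector */
        rp[i] = v;                           /* rp[i] = tp[i], or refresh in place */
      }
    }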
@@ -643,7 +714,6 @@ $code.=<<___; mov 16($ap),@ri[2] # tp[2] mov 24($ap),@ri[3] # tp[3] sbb 8($np),@ri[1] - lea -1($num),$j # j=num/4-1 jmp .Lsub4x .align 16 .Lsub4x: @@ -671,50 +741,58 @@ $code.=<<___; mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] sbb \$0,@ri[0] # handle upmost overflow bit - mov @ri[0],%xmm0 - punpcklqdq %xmm0,%xmm0 # extend mask to 128 bits mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] xor $i,$i # i=0 - - mov $num,$j - pxor %xmm5,%xmm5 + and @ri[0],$ap + not @ri[0] + mov $rp,$np + and @ri[0],$np + lea -4($num),$j + or $np,$ap # ap=borrow?tp:rp + shr \$2,$j # j=num/4-1 + + movdqu ($ap),%xmm1 + movdqa %xmm0,(%rsp) + movdqu %xmm1,($rp) jmp .Lcopy4x .align 16 -.Lcopy4x: # copy or in-place refresh - movdqu (%rsp,$i),%xmm2 - movdqu 16(%rsp,$i),%xmm4 - movdqu ($rp,$i),%xmm1 - movdqu 16($rp,$i),%xmm3 - pxor %xmm1,%xmm2 # conditional select - pxor %xmm3,%xmm4 - pand %xmm0,%xmm2 - pand %xmm0,%xmm4 - pxor %xmm1,%xmm2 - pxor %xmm3,%xmm4 - movdqu %xmm2,($rp,$i) - movdqu %xmm4,16($rp,$i) - movdqa %xmm5,(%rsp,$i) # zap temporary vectors - movdqa %xmm5,16(%rsp,$i) - +.Lcopy4x: # copy or in-place refresh + movdqu 16($ap,$i),%xmm2 + movdqu 32($ap,$i),%xmm1 + movdqa %xmm0,16(%rsp,$i) + movdqu %xmm2,16($rp,$i) + movdqa %xmm0,32(%rsp,$i) + movdqu %xmm1,32($rp,$i) lea 32($i),$i dec $j jnz .Lcopy4x - shl \$2,$num + movdqu 16($ap,$i),%xmm2 + movdqa %xmm0,16(%rsp,$i) + movdqu %xmm2,16($rp,$i) ___ } $code.=<<___; mov 8(%rsp,$num,8),%rsi # restore %rsp +.cfi_def_cfa %rsi, 8 mov \$1,%rax - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul4x_epilogue: ret +.cfi_endproc .size bn_mul4x_mont,.-bn_mul4x_mont ___ }}} @@ -742,14 +820,23 @@ $code.=<<___; .type bn_sqr8x_mont,\@function,6 .align 32 bn_sqr8x_mont: -.Lsqr8x_enter: +.cfi_startproc mov %rsp,%rax +.cfi_def_cfa_register %rax +.Lsqr8x_enter: push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 +.Lsqr8x_prologue: mov ${num}d,%r10d shl \$3,${num}d # convert $num to bytes @@ -762,30 +849,49 @@ bn_sqr8x_mont: # do its job. 
# lea -64(%rsp,$num,2),%r11 + mov %rsp,%rbp mov ($n0),$n0 # *n0 sub $aptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lsqr8x_sp_alt - sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + sub %r11,%rbp # align with $aptr + lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lsqr8x_sp_done: - and \$-64,%rsp + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lsqr8x_page_walk + jmp .Lsqr8x_page_walk_done + +.align 16 +.Lsqr8x_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lsqr8x_page_walk +.Lsqr8x_page_walk_done: + mov $num,%r10 neg $num mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 .Lsqr8x_body: movq $nptr, %xmm2 # save pointer to modulus @@ -855,6 +961,7 @@ $code.=<<___; pxor %xmm0,%xmm0 pshufd \$0,%xmm1,%xmm1 mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 jmp .Lsqr8x_cond_copy .align 32 @@ -884,14 +991,22 @@ $code.=<<___; mov \$1,%rax mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lsqr8x_epilogue: ret +.cfi_endproc .size bn_sqr8x_mont,.-bn_sqr8x_mont ___ }}} @@ -903,23 +1018,48 @@ $code.=<<___; .type bn_mulx4x_mont,\@function,6 .align 32 bn_mulx4x_mont: -.Lmulx4x_enter: +.cfi_startproc mov %rsp,%rax +.cfi_def_cfa_register %rax +.Lmulx4x_enter: push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 +.Lmulx4x_prologue: shl \$3,${num}d # convert $num to bytes - .byte 0x67 xor %r10,%r10 sub $num,%r10 # -$num mov ($n0),$n0 # *n0 - lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8) + lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8) + and \$-128,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.align 16 +.Lmulx4x_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + lea ($bp,$num),%r10 - and \$-128,%rsp ############################################################## # Stack layout # +0 num @@ -939,6 +1079,7 @@ bn_mulx4x_mont: mov $n0, 24(%rsp) # save *n0 mov $rp, 32(%rsp) # save $rp mov %rax,40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 mov $num,48(%rsp) # inner counter jmp .Lmulx4x_body @@ -1188,6 +1329,7 @@ $code.=<<___; pxor %xmm0,%xmm0 pshufd \$0,%xmm1,%xmm1 mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 jmp .Lmulx4x_cond_copy .align 32 @@ -1217,14 +1359,22 @@ $code.=<<___; mov \$1,%rax mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmulx4x_epilogue: ret +.cfi_endproc .size bn_mulx4x_mont,.-bn_mulx4x_mont ___ }}} @@ -1277,22 +1427,8 @@ mul_handler: mov 192($context),%r10 # pull 
$num mov 8(%rax,%r10,8),%rax # pull saved stack pointer - lea 48(%rax),%rax - - mov -8(%rax),%rbx - mov -16(%rax),%rbp - mov -24(%rax),%r12 - mov -32(%rax),%r13 - mov -40(%rax),%r14 - mov -48(%rax),%r15 - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore context->R12 - mov %r13,224($context) # restore context->R13 - mov %r14,232($context) # restore context->R14 - mov %r15,240($context) # restore context->R15 - jmp .Lcommon_seh_tail + jmp .Lcommon_pop_regs .size mul_handler,.-mul_handler .type sqr_handler,\@abi-omnipotent @@ -1317,18 +1453,24 @@ sqr_handler: mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label - cmp %r10,%rbx # context->Rip<.Lsqr_body + cmp %r10,%rbx # context->Rip<.Lsqr_prologue jb .Lcommon_seh_tail + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # body label + cmp %r10,%rbx # context->Rip<.Lsqr_body + jb .Lcommon_pop_regs + mov 152($context),%rax # pull context->Rsp - mov 4(%r11),%r10d # HandlerData[1] + mov 8(%r11),%r10d # HandlerData[2] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue jae .Lcommon_seh_tail mov 40(%rax),%rax # pull saved stack pointer +.Lcommon_pop_regs: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 @@ -1415,13 +1557,15 @@ $code.=<<___; .LSEH_info_bn_sqr8x_mont: .byte 9,0,0,0 .rva sqr_handler - .rva .Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[] + .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[] +.align 8 ___ $code.=<<___ if ($addx); .LSEH_info_bn_mulx4x_mont: .byte 9,0,0,0 .rva sqr_handler - .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] + .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] +.align 8 ___ } diff --git a/src/crypto/bn/asm/x86_64-mont5.pl b/src/crypto/bn/asm/x86_64-mont5.pl index 61fde2d2..bf68aadd 100755 --- a/src/crypto/bn/asm/x86_64-mont5.pl +++ b/src/crypto/bn/asm/x86_64-mont5.pl @@ -73,6 +73,10 @@ $code=<<___; .type bn_mul_mont_gather5,\@function,6 .align 64 bn_mul_mont_gather5: +.cfi_startproc + mov ${num}d,${num}d + mov %rsp,%rax +.cfi_def_cfa_register %rax test \$7,${num}d jnz .Lmul_enter ___ @@ -84,24 +88,54 @@ $code.=<<___; .align 16 .Lmul_enter: - mov ${num}d,${num}d - mov %rsp,%rax movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument - lea .Linc(%rip),%r10 push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 - lea 2($num),%r11 - neg %r11 - lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8) - and \$-1024,%rsp # minimize TLB usage + neg $num + mov %rsp,%r11 + lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8) + neg $num # restore $num + and \$-1024,%r10 # minimize TLB usage + + # An OS-agnostic version of __chkstk. + # + # Some OSes (Windows) insist on stack being "wired" to + # physical memory in strictly sequential manner, i.e. if stack + # allocation spans two pages, then reference to farmost one can + # be punishable by SEGV. But page walking can do good even on + # other OSes, because it guarantees that villain thread hits + # the guard page before it can make damage to innocent one... 
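The comment above (repeated verbatim at every Montgomery entry point that grows the stack, including the perlasm form in x86-mont.pl) is implemented by the .L*_page_walk loops that follow: rather than moving %rsp past the whole allocation in one jump, the code lowers it one 4096-byte page at a time and loads from each step, so a guard page is always hit in order. Roughly, in C (illustrative only; C cannot actually move the stack pointer, so an ordinary pointer stands in for %rsp):

    #include <stddef.h>

    /* Probe every page between the old and the new "stack pointer",
     * top down, one 4096-byte step at a time. */
    static void page_walk(unsigned char *old_sp, unsigned char *new_sp) {
      /* First step lands on new_sp plus a whole number of pages
       * (the asm's "sub %r10,%r11; and $-4096,%r11; lea"). */
      volatile unsigned char *p =
          new_sp + (((size_t)(old_sp - new_sp)) & ~(size_t)4095);
      for (;;) {
        (void)*p;                   /* mov (%rsp),%r11: touch the page */
        if (p <= new_sp) break;     /* cmp %r10,%rsp; ja .L*_page_walk */
        p -= 4096;                  /* lea -4096(%rsp),%rsp */
      }
    }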
+ sub %r10,%r11 + and \$-4096,%r11 + lea (%r10,%r11),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + +.Lmul_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + lea .Linc(%rip),%r10 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 .Lmul_body: + lea 128($bp),%r12 # reassign $bp (+size optimization) ___ $bp="%r12"; @@ -370,32 +404,42 @@ $code.=<<___; sbb \$0,%rax # handle upmost overflow bit xor $i,$i + and %rax,$ap + not %rax + mov $rp,$np + and %rax,$np mov $num,$j # j=num + or $np,$ap # ap=borrow?tp:rp .align 16 .Lcopy: # copy or in-place refresh - mov (%rsp,$i,8),$ap - mov ($rp,$i,8),$np - xor $np,$ap # conditional select: - and %rax,$ap # ((ap ^ np) & %rax) ^ np - xor $np,$ap # ap = borrow?tp:rp + mov ($ap,$i,8),%rax mov $i,(%rsp,$i,8) # zap temporary vector - mov $ap,($rp,$i,8) # rp[i]=tp[i] + mov %rax,($rp,$i,8) # rp[i]=tp[i] lea 1($i),$i sub \$1,$j jnz .Lcopy mov 8(%rsp,$num,8),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul_epilogue: ret +.cfi_endproc .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 ___ {{{ @@ -405,6 +449,10 @@ $code.=<<___; .type bn_mul4x_mont_gather5,\@function,6 .align 32 bn_mul4x_mont_gather5: +.cfi_startproc + .byte 0x67 + mov %rsp,%rax +.cfi_def_cfa_register %rax .Lmul4x_enter: ___ $code.=<<___ if ($addx); @@ -413,14 +461,19 @@ $code.=<<___ if ($addx); je .Lmulx4x_enter ___ $code.=<<___; - .byte 0x67 - mov %rsp,%rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 +.Lmul4x_prologue: .byte 0x67 shl \$3,${num}d # convert $num to bytes @@ -437,43 +490,70 @@ $code.=<<___; # calculated from 7th argument, the index.] 
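The comment tail just above belongs to the block motivating the .Lmul4xsp/.Lmul4xsp_alt split (the same test recurs below for bn_power5, bn_from_mont8x, bn_mulx4x_mont_gather5 and bn_powerx5): the scratch frame must not land near the ret/am/n vectors' addresses modulo 4096, or their cache lines would alias. The sub / and $4095 / cmp sequence that follows is, in effect, this predicate (helper name and the exact boundary condition are illustrative):

    #include <stddef.h>
    #include <stdint.h>

    /* Two regions alias in the 4096-byte window when their start
     * addresses are closer than `len` bytes modulo 4096. */
    static int aliases_mod_4096(uintptr_t frame, uintptr_t vectors,
                                size_t len /* roughly 3*num bytes */) {
      uintptr_t dist = (frame - vectors) & 4095;  /* sub; and $4095 */
      return dist < len;                          /* cmp %r11,%r10; jb */
    }

When the regions would alias, the "# align with $rp" path shifts the prospective frame down by that distance so it becomes congruent with the vectors modulo 4096 rather than partially overlapping them.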
# lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmul4xsp_alt - sub %r11,%rsp # align with $rp - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) + sub %r11,%rbp # align with $rp + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lmul4xsp_done: - and \$-64,%rsp + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + +.Lmul4x_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + neg $num mov %rax,40(%rsp) +.cfi_cfa_expression %rsp+40,deref,+8 .Lmul4x_body: call mul4x_internal mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul4x_epilogue: ret +.cfi_endproc .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 .type mul4x_internal,\@abi-omnipotent @@ -985,7 +1065,7 @@ my $bptr="%rdx"; # const void *table, my $nptr="%rcx"; # const BN_ULONG *nptr, my $n0 ="%r8"; # const BN_ULONG *n0); my $num ="%r9"; # int num, has to be divisible by 8 - # int pwr + # int pwr my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); my @A0=("%r10","%r11"); @@ -997,6 +1077,9 @@ $code.=<<___; .type bn_power5,\@function,6 .align 32 bn_power5: +.cfi_startproc + mov %rsp,%rax +.cfi_def_cfa_register %rax ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d @@ -1005,13 +1088,19 @@ $code.=<<___ if ($addx); je .Lpowerx5_enter ___ $code.=<<___; - mov %rsp,%rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 +.Lpower5_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10d # 3*$num @@ -1026,25 +1115,42 @@ $code.=<<___; # calculated from 7th argument, the index.] 
# lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwr_sp_alt - sub %r11,%rsp # align with $aptr - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lpwr_sp_done: - and \$-64,%rsp - mov $num,%r10 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwr_page_walk + jmp .Lpwr_page_walk_done + +.Lpwr_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwr_page_walk +.Lpwr_page_walk_done: + + mov $num,%r10 neg $num ############################################################## @@ -1058,6 +1164,7 @@ $code.=<<___; # mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 .Lpower5_body: movq $rptr,%xmm1 # save $rptr, used in sqr8x movq $nptr,%xmm2 # save $nptr @@ -1084,16 +1191,25 @@ $code.=<<___; call mul4x_internal mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lpower5_epilogue: ret +.cfi_endproc .size bn_power5,.-bn_power5 .globl bn_sqr8x_internal @@ -1953,7 +2069,7 @@ __bn_post4x_internal: jnz .Lsqr4x_sub mov $num,%r10 # prepare for back-to-back call - neg $num # restore $num + neg $num # restore $num ret .size __bn_post4x_internal,.-__bn_post4x_internal ___ @@ -1973,14 +2089,23 @@ bn_from_montgomery: .type bn_from_mont8x,\@function,6 .align 32 bn_from_mont8x: +.cfi_startproc .byte 0x67 mov %rsp,%rax +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 +.Lfrom_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes @@ -1995,25 +2120,42 @@ bn_from_mont8x: # last operation, we use the opportunity to cleanse it. 
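The sentence ending above notes that bn_from_mont8x touches its frame for the last time and therefore takes the opportunity to cleanse it; the .Lfrom_mont_zero block a little further down performs that scrub with a run of %xmm0 stores. A standalone C analogue is a zeroing pass the compiler cannot elide (BoringSSL's existing helper for callers is OPENSSL_cleanse; this sketch just shows the idea):

    #include <stddef.h>

    /* Zero a scratch region through a volatile pointer so the stores
     * are not optimized away, mirroring the movdqa %xmm0 stores in
     * .Lfrom_mont_zero. */
    static void scrub(void *p, size_t len) {
      volatile unsigned char *v = (volatile unsigned char *)p;
      while (len--)
        *v++ = 0;
    }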
# lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lfrom_sp_alt - sub %r11,%rsp # align with $aptr - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lfrom_sp_done .align 32 .Lfrom_sp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lfrom_sp_done: - and \$-64,%rsp - mov $num,%r10 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lfrom_page_walk + jmp .Lfrom_page_walk_done + +.Lfrom_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lfrom_page_walk +.Lfrom_page_walk_done: + + mov $num,%r10 neg $num ############################################################## @@ -2027,6 +2169,7 @@ bn_from_mont8x: # mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 .Lfrom_body: mov $num,%r11 lea 48(%rsp),%rax @@ -2070,7 +2213,6 @@ $code.=<<___ if ($addx); pxor %xmm0,%xmm0 lea 48(%rsp),%rax - mov 40(%rsp),%rsi # restore %rsp jmp .Lfrom_mont_zero .align 32 @@ -2082,11 +2224,12 @@ $code.=<<___; pxor %xmm0,%xmm0 lea 48(%rsp),%rax - mov 40(%rsp),%rsi # restore %rsp jmp .Lfrom_mont_zero .align 32 .Lfrom_mont_zero: + mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 movdqa %xmm0,16*0(%rax) movdqa %xmm0,16*1(%rax) movdqa %xmm0,16*2(%rax) @@ -2097,14 +2240,22 @@ $code.=<<___; mov \$1,%rax mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lfrom_epilogue: ret +.cfi_endproc .size bn_from_mont8x,.-bn_from_mont8x ___ } @@ -2117,14 +2268,23 @@ $code.=<<___; .type bn_mulx4x_mont_gather5,\@function,6 .align 32 bn_mulx4x_mont_gather5: -.Lmulx4x_enter: +.cfi_startproc mov %rsp,%rax +.cfi_def_cfa_register %rax +.Lmulx4x_enter: push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 +.Lmulx4x_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes @@ -2141,23 +2301,40 @@ bn_mulx4x_mont_gather5: # calculated from 7th argument, the index.] 
# lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmulx4xsp_alt - sub %r11,%rsp # align with $aptr - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lmulx4xsp_done .Lmulx4xsp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp -.Lmulx4xsp_done: - and \$-64,%rsp # ensure alignment + sub %r11,%rbp +.Lmulx4xsp_done: + and \$-64,%rbp # ensure alignment + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.Lmulx4x_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + ############################################################## # Stack layout # +0 -num @@ -2172,21 +2349,31 @@ bn_mulx4x_mont_gather5: # mov $n0, 32(%rsp) # save *n0 mov %rax,40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 .Lmulx4x_body: call mulx4x_internal mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmulx4x_epilogue: ret +.cfi_endproc .size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 .type mulx4x_internal,\@abi-omnipotent @@ -2564,14 +2751,23 @@ $code.=<<___; .type bn_powerx5,\@function,6 .align 32 bn_powerx5: -.Lpowerx5_enter: +.cfi_startproc mov %rsp,%rax +.cfi_def_cfa_register %rax +.Lpowerx5_enter: push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 +.Lpowerx5_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes @@ -2586,25 +2782,42 @@ bn_powerx5: # calculated from 7th argument, the index.] 
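# Each entry point also gains a .L*_prologue label (as with
# .Lpowerx5_prologue above) right after its register pushes. These feed
# the reworked Win64 handler further down: HandlerData[] grows from two
# .rva entries to three,
#
#	.rva	.Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue
#
# so mul_handler can tell three interrupt points apart:
#	Rip <  prologue label: nothing saved yet, unwind as a leaf
#	Rip <  body label:     registers pushed but %rsp not yet switched
#	                       (this window now includes the page walk), so
#	                       pop them straight off context->Rsp via the
#	                       new .Lcommon_pop_regs path
#	Rip >= body label:     full frame; fetch the original %rsp out of
#	                       its saved slot first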
# lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwrx_sp_alt - sub %r11,%rsp # align with $aptr - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lpwrx_sp_done .align 32 .Lpwrx_sp_alt: lea 4096-320(,$num,2),%r10 - lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) + lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lpwrx_sp_done: - and \$-64,%rsp - mov $num,%r10 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwrx_page_walk + jmp .Lpwrx_page_walk_done + +.Lpwrx_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwrx_page_walk +.Lpwrx_page_walk_done: + + mov $num,%r10 neg $num ############################################################## @@ -2625,6 +2838,7 @@ bn_powerx5: movq $bptr,%xmm4 mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 .Lpowerx5_body: call __bn_sqrx8x_internal @@ -2647,17 +2861,26 @@ bn_powerx5: call mulx4x_internal mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lpowerx5_epilogue: ret +.cfi_endproc .size bn_powerx5,.-bn_powerx5 .globl bn_sqrx8x_internal @@ -3513,9 +3736,14 @@ mul_handler: cmp %r10,%rbx # context->Rip<end of prologue label jb .Lcommon_seh_tail + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # beginning of body label + cmp %r10,%rbx # context->Rip<body label + jb .Lcommon_pop_regs + mov 152($context),%rax # pull context->Rsp - mov 4(%r11),%r10d # HandlerData[1] + mov 8(%r11),%r10d # HandlerData[2] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail @@ -3527,11 +3755,11 @@ mul_handler: mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer - jmp .Lbody_proceed + jmp .Lcommon_pop_regs .Lbody_40: mov 40(%rax),%rax # pull saved stack pointer -.Lbody_proceed: +.Lcommon_pop_regs: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 @@ -3622,34 +3850,34 @@ $code.=<<___; .LSEH_info_bn_mul_mont_gather5: .byte 9,0,0,0 .rva mul_handler - .rva .Lmul_body,.Lmul_epilogue # HandlerData[] + .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[] .align 8 .LSEH_info_bn_mul4x_mont_gather5: .byte 9,0,0,0 .rva mul_handler - .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] + .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] .align 8 .LSEH_info_bn_power5: .byte 9,0,0,0 .rva mul_handler - .rva .Lpower5_body,.Lpower5_epilogue # HandlerData[] + .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[] .align 8 .LSEH_info_bn_from_mont8x: .byte 9,0,0,0 .rva mul_handler - .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[] + .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[] ___ $code.=<<___ if ($addx); .align 8 .LSEH_info_bn_mulx4x_mont_gather5: .byte 9,0,0,0 .rva mul_handler - .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] + .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] .align 8 .LSEH_info_bn_powerx5: .byte 9,0,0,0 .rva mul_handler 
- .rva .Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] + .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] ___ $code.=<<___; .align 8 diff --git a/src/crypto/chacha/CMakeLists.txt b/src/crypto/chacha/CMakeLists.txt index 39d1defb..63de0611 100644 --- a/src/crypto/chacha/CMakeLists.txt +++ b/src/crypto/chacha/CMakeLists.txt @@ -42,17 +42,7 @@ add_library( ${CHACHA_ARCH_SOURCES} ) -add_executable( - chacha_test - - chacha_test.cc - $<TARGET_OBJECTS:test_support> -) - -target_link_libraries(chacha_test crypto) -add_dependencies(all_tests chacha_test) - perlasm(chacha-armv4.${ASM_EXT} asm/chacha-armv4.pl) perlasm(chacha-armv8.${ASM_EXT} asm/chacha-armv8.pl) perlasm(chacha-x86.${ASM_EXT} asm/chacha-x86.pl) -perlasm(chacha-x86_64.${ASM_EXT} asm/chacha-x86_64.pl)
\ No newline at end of file +perlasm(chacha-x86_64.${ASM_EXT} asm/chacha-x86_64.pl) diff --git a/src/crypto/chacha/asm/chacha-armv4.pl b/src/crypto/chacha/asm/chacha-armv4.pl index 395b8154..13698e3a 100755 --- a/src/crypto/chacha/asm/chacha-armv4.pl +++ b/src/crypto/chacha/asm/chacha-armv4.pl @@ -8,7 +8,7 @@ # ==================================================================== # # December 2014 -# +# # ChaCha20 for ARMv4. # # Performance in cycles per byte out of large buffer. @@ -713,7 +713,7 @@ ChaCha20_neon: vadd.i32 $d2,$d1,$t0 @ counter+2 str @t[3], [sp,#4*(16+15)] mov @t[3],#10 - add @x[12],@x[12],#3 @ counter+3 + add @x[12],@x[12],#3 @ counter+3 b .Loop_neon .align 4 @@ -1127,7 +1127,7 @@ $code.=<<___; ldrb @t[1],[r12],#1 @ read input subs @t[3],@t[3],#1 eor @t[0],@t[0],@t[1] - strb @t[0],[r14],#1 @ store ouput + strb @t[0],[r14],#1 @ store output bne .Loop_tail_neon .Ldone_neon: diff --git a/src/crypto/chacha/asm/chacha-armv8.pl b/src/crypto/chacha/asm/chacha-armv8.pl index 215d9657..c2d04298 100755 --- a/src/crypto/chacha/asm/chacha-armv8.pl +++ b/src/crypto/chacha/asm/chacha-armv8.pl @@ -8,7 +8,7 @@ # ==================================================================== # # June 2015 -# +# # ChaCha20 for ARMv8. # # Performance in cycles per byte out of large buffer. @@ -193,7 +193,7 @@ ChaCha20_ctr32: mov $ctr,#10 subs $len,$len,#64 .Loop: - sub $ctr,$ctr,#1 + sub $ctr,$ctr,#1 ___ foreach (&ROUND(0, 4, 8,12)) { eval; } foreach (&ROUND(0, 5,10,15)) { eval; } diff --git a/src/crypto/chacha/asm/chacha-x86.pl b/src/crypto/chacha/asm/chacha-x86.pl index 984ce11e..f8bbb76d 100755 --- a/src/crypto/chacha/asm/chacha-x86.pl +++ b/src/crypto/chacha/asm/chacha-x86.pl @@ -21,7 +21,9 @@ # Westmere 9.50/+45% 3.35 # Sandy Bridge 10.5/+47% 3.20 # Haswell 8.15/+50% 2.83 +# Skylake 7.53/+22% 2.75 # Silvermont 17.4/+36% 8.35 +# Goldmont 13.4/+40% 4.36 # Sledgehammer 10.2/+54% # Bulldozer 13.4/+50% 4.38(*) # @@ -38,10 +40,8 @@ open STDOUT,">$output"; &asm_init($ARGV[0],"chacha-x86.pl",$ARGV[$#ARGV] eq "386"); -$xmm=$ymm=0; -for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } - -$ymm=$xmm; +$xmm=$ymm=1; +$gasver=999; # enable everything $a="eax"; ($b,$b_)=("ebx","ebp"); @@ -438,6 +438,12 @@ my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous &label("pic_point"),"eax")); &movdqu ("xmm3",&QWP(0,"ebx")); # counter and nonce +if (defined($gasver) && $gasver>=2.17) { # even though we encode + # pshufb manually, we + # handle only register + # operands, while this + # segment uses memory + # operand... &cmp ($len,64*4); &jb (&label("1x")); @@ -619,6 +625,7 @@ my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous &paddd ("xmm2",&QWP(16*6,"eax")); # +four &pand ("xmm3",&QWP(16*7,"eax")); &por ("xmm3","xmm2"); # counter value +} { my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7)); diff --git a/src/crypto/chacha/asm/chacha-x86_64.pl b/src/crypto/chacha/asm/chacha-x86_64.pl index 55b726d2..5ab6f879 100755 --- a/src/crypto/chacha/asm/chacha-x86_64.pl +++ b/src/crypto/chacha/asm/chacha-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -11,6 +18,10 @@ # # ChaCha20 for x86_64. # +# December 2016 +# +# Add AVX512F code path. +# # Performance in cycles per byte out of large buffer. # # IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 8xAVX2 @@ -21,7 +32,9 @@ # Sandy Bridge 8.31/+42% 5.45/6.76 2.72 # Ivy Bridge 6.71/+46% 5.40/6.49 2.41 # Haswell 5.92/+43% 5.20/6.45 2.42 1.23 +# Skylake 5.87/+39% 4.70/- 2.31 1.19 # Silvermont 12.0/+33% 7.75/7.40 7.03(iii) +# Goldmont 10.6/+17% 5.10/- 3.28 # Sledgehammer 7.28/+52% -/14.2(ii) - # Bulldozer 9.66/+28% 9.85/11.1 3.06(iv) # VIA Nano 10.5/+46% 6.72/8.60 6.05 @@ -82,6 +95,15 @@ $code.=<<___; .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe .Lsigma: .asciz "expand 32-byte k" +.align 64 +.Lzeroz: +.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 +.Lfourz: +.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 +.Lincz: +.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +.Lsixteen: +.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 .asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" ___ @@ -207,6 +229,12 @@ ChaCha20_ctr32: cmp \$0,$len je .Lno_data mov OPENSSL_ia32cap_P+4(%rip),%r10 +___ +$code.=<<___ if ($avx>2); + bt \$48,%r10 # check for AVX512F + jc .LChaCha20_avx512 +___ +$code.=<<___; test \$`1<<(41-32)`,%r10d jnz .LChaCha20_ssse3 @@ -217,6 +245,7 @@ ChaCha20_ctr32: push %r14 push %r15 sub \$64+24,%rsp +.Lctr32_body: #movdqa .Lsigma(%rip),%xmm0 movdqu ($key),%xmm1 @@ -355,13 +384,14 @@ $code.=<<___; jnz .Loop_tail .Ldone: - add \$64+24,%rsp - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx + lea 64+24+48(%rsp),%rsi + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lno_data: ret .size ChaCha20_ctr32,.-ChaCha20_ctr32 @@ -396,31 +426,26 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round &por ($b,$t); } -my $xframe = $win64 ? 32+32+8 : 24; +my $xframe = $win64 ? 
32+8 : 8; $code.=<<___; .type ChaCha20_ssse3,\@function,5 .align 32 ChaCha20_ssse3: .LChaCha20_ssse3: + mov %rsp,%r9 # frame pointer ___ $code.=<<___; cmp \$128,$len # we might throw away some data, ja .LChaCha20_4x # but overall it won't be slower .Ldo_sse3_after_all: - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - sub \$64+$xframe,%rsp ___ $code.=<<___ if ($win64); - movaps %xmm6,64+32(%rsp) - movaps %xmm7,64+48(%rsp) + movaps %xmm6,-0x28(%r9) + movaps %xmm7,-0x18(%r9) +.Lssse3_body: ___ $code.=<<___; movdqa .Lsigma(%rip),$a @@ -434,7 +459,7 @@ $code.=<<___; movdqa $b,0x10(%rsp) movdqa $c,0x20(%rsp) movdqa $d,0x30(%rsp) - mov \$10,%ebp + mov \$10,$counter # reuse $counter jmp .Loop_ssse3 .align 32 @@ -444,7 +469,7 @@ $code.=<<___; movdqa 0x10(%rsp),$b movdqa 0x20(%rsp),$c paddd 0x30(%rsp),$d - mov \$10,%ebp + mov \$10,$counter movdqa $d,0x30(%rsp) jmp .Loop_ssse3 @@ -462,7 +487,7 @@ ___ &pshufd ($b,$b,0b10010011); &pshufd ($d,$d,0b00111001); - &dec ("%ebp"); + &dec ($counter); &jnz (".Loop_ssse3"); $code.=<<___; @@ -501,31 +526,26 @@ $code.=<<___; movdqa $b,0x10(%rsp) movdqa $c,0x20(%rsp) movdqa $d,0x30(%rsp) - xor %rbx,%rbx + xor $counter,$counter .Loop_tail_ssse3: - movzb ($inp,%rbx),%eax - movzb (%rsp,%rbx),%ecx - lea 1(%rbx),%rbx + movzb ($inp,$counter),%eax + movzb (%rsp,$counter),%ecx + lea 1($counter),$counter xor %ecx,%eax - mov %al,-1($out,%rbx) + mov %al,-1($out,$counter) dec $len jnz .Loop_tail_ssse3 .Ldone_ssse3: ___ $code.=<<___ if ($win64); - movaps 64+32(%rsp),%xmm6 - movaps 64+48(%rsp),%xmm7 + movaps -0x28(%r9),%xmm6 + movaps -0x18(%r9),%xmm7 ___ $code.=<<___; - add \$64+$xframe,%rsp - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx + lea (%r9),%rsp +.Lssse3_epilogue: ret .size ChaCha20_ssse3,.-ChaCha20_ssse3 ___ @@ -662,13 +682,14 @@ my @x=map("\"$_\"",@xx); ); } -my $xframe = $win64 ? 0xa0 : 0; +my $xframe = $win64 ? 0xa8 : 8; $code.=<<___; .type ChaCha20_4x,\@function,5 .align 32 ChaCha20_4x: .LChaCha20_4x: + mov %rsp,%r9 # frame pointer mov %r10,%r11 ___ $code.=<<___ if ($avx>1); @@ -685,8 +706,7 @@ $code.=<<___; je .Ldo_sse3_after_all # to detect Atom .Lproceed4x: - lea -0x78(%rsp),%r11 - sub \$0x148+$xframe,%rsp + sub \$0x140+$xframe,%rsp ___ ################ stack layout # +0x00 SIMD equivalent of @x[8-12] @@ -697,16 +717,17 @@ ___ # ... 
# +0x140 $code.=<<___ if ($win64); - movaps %xmm6,-0x30(%r11) - movaps %xmm7,-0x20(%r11) - movaps %xmm8,-0x10(%r11) - movaps %xmm9,0x00(%r11) - movaps %xmm10,0x10(%r11) - movaps %xmm11,0x20(%r11) - movaps %xmm12,0x30(%r11) - movaps %xmm13,0x40(%r11) - movaps %xmm14,0x50(%r11) - movaps %xmm15,0x60(%r11) + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L4x_body: ___ $code.=<<___; movdqa .Lsigma(%rip),$xa3 # key[0] @@ -1095,20 +1116,20 @@ $code.=<<___; .Ldone4x: ___ $code.=<<___ if ($win64); - lea 0x140+0x30(%rsp),%r11 - movaps -0x30(%r11),%xmm6 - movaps -0x20(%r11),%xmm7 - movaps -0x10(%r11),%xmm8 - movaps 0x00(%r11),%xmm9 - movaps 0x10(%r11),%xmm10 - movaps 0x20(%r11),%xmm11 - movaps 0x30(%r11),%xmm12 - movaps 0x40(%r11),%xmm13 - movaps 0x50(%r11),%xmm14 - movaps 0x60(%r11),%xmm15 + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 ___ $code.=<<___; - add \$0x148+$xframe,%rsp + lea (%r9),%rsp +.L4x_epilogue: ret .size ChaCha20_4x,.-ChaCha20_4x ___ @@ -1236,33 +1257,32 @@ my @x=map("\"$_\"",@xx); ); } -my $xframe = $win64 ? 0xb0 : 8; +my $xframe = $win64 ? 0xa8 : 8; $code.=<<___; .type ChaCha20_8x,\@function,5 .align 32 ChaCha20_8x: .LChaCha20_8x: - mov %rsp,%r10 + mov %rsp,%r9 # frame register sub \$0x280+$xframe,%rsp and \$-32,%rsp ___ $code.=<<___ if ($win64); - lea 0x290+0x30(%rsp),%r11 - movaps %xmm6,-0x30(%r11) - movaps %xmm7,-0x20(%r11) - movaps %xmm8,-0x10(%r11) - movaps %xmm9,0x00(%r11) - movaps %xmm10,0x10(%r11) - movaps %xmm11,0x20(%r11) - movaps %xmm12,0x30(%r11) - movaps %xmm13,0x40(%r11) - movaps %xmm14,0x50(%r11) - movaps %xmm15,0x60(%r11) + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L8x_body: ___ $code.=<<___; vzeroupper - mov %r10,0x280(%rsp) ################ stack layout # +0x00 SIMD equivalent of @x[8-12] @@ -1271,7 +1291,7 @@ $code.=<<___; # ... # +0x200 SIMD counters (with nonce smashed by lanes) # ... - # +0x280 saved %rsp + # +0x280 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] vbroadcasti128 ($key),$xb3 # key[1] @@ -1737,29 +1757,989 @@ $code.=<<___; vzeroall ___ $code.=<<___ if ($win64); - lea 0x290+0x30(%rsp),%r11 - movaps -0x30(%r11),%xmm6 - movaps -0x20(%r11),%xmm7 - movaps -0x10(%r11),%xmm8 - movaps 0x00(%r11),%xmm9 - movaps 0x10(%r11),%xmm10 - movaps 0x20(%r11),%xmm11 - movaps 0x30(%r11),%xmm12 - movaps 0x40(%r11),%xmm13 - movaps 0x50(%r11),%xmm14 - movaps 0x60(%r11),%xmm15 + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 ___ $code.=<<___; - mov 0x280(%rsp),%rsp + lea (%r9),%rsp +.L8x_epilogue: ret .size ChaCha20_8x,.-ChaCha20_8x ___ } +######################################################################## +# AVX512 code paths +if ($avx>2) { +# This one handles shorter inputs... 
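# The short-input path leans on AVX512F's native 32-bit rotate: each
# quarter-round step collapses to a single EVEX instruction instead of the
# shift/shift/or (or pshufb) dance of the SSE/AVX2 paths,
#
#	vpaddd	$b,$a,$a	# a += b
#	vpxord	$a,$d,$d	# d ^= a
#	vprold	\$16,$d,$d	# d <<<= 16, one instruction
#
# which is why AVX512ROUND below advertises a critical path of 14 "SIMD
# ticks" per round against the 20 of the SSSE3 round above.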
+ +my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20)); +my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); + +sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round + &vpaddd ($a,$a,$b); + &vpxord ($d,$d,$a); + &vprold ($d,$d,16); + + &vpaddd ($c,$c,$d); + &vpxord ($b,$b,$c); + &vprold ($b,$b,12); + + &vpaddd ($a,$a,$b); + &vpxord ($d,$d,$a); + &vprold ($d,$d,8); + + &vpaddd ($c,$c,$d); + &vpxord ($b,$b,$c); + &vprold ($b,$b,7); +} + +my $xframe = $win64 ? 32+8 : 8; + +$code.=<<___; +.type ChaCha20_avx512,\@function,5 +.align 32 +ChaCha20_avx512: +.LChaCha20_avx512: + mov %rsp,%r9 # frame pointer + cmp \$512,$len + ja .LChaCha20_16x + + sub \$64+$xframe,%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,-0x28(%r9) + movaps %xmm7,-0x18(%r9) +.Lavx512_body: +___ +$code.=<<___; + vbroadcasti32x4 .Lsigma(%rip),$a + vbroadcasti32x4 ($key),$b + vbroadcasti32x4 16($key),$c + vbroadcasti32x4 ($counter),$d + + vmovdqa32 $a,$a_ + vmovdqa32 $b,$b_ + vmovdqa32 $c,$c_ + vpaddd .Lzeroz(%rip),$d,$d + vmovdqa32 .Lfourz(%rip),$fourz + mov \$10,$counter # reuse $counter + vmovdqa32 $d,$d_ + jmp .Loop_avx512 + +.align 16 +.Loop_outer_avx512: + vmovdqa32 $a_,$a + vmovdqa32 $b_,$b + vmovdqa32 $c_,$c + vpaddd $fourz,$d_,$d + mov \$10,$counter + vmovdqa32 $d,$d_ + jmp .Loop_avx512 + +.align 32 +.Loop_avx512: +___ + &AVX512ROUND(); + &vpshufd ($c,$c,0b01001110); + &vpshufd ($b,$b,0b00111001); + &vpshufd ($d,$d,0b10010011); + + &AVX512ROUND(); + &vpshufd ($c,$c,0b01001110); + &vpshufd ($b,$b,0b10010011); + &vpshufd ($d,$d,0b00111001); + + &dec ($counter); + &jnz (".Loop_avx512"); + +$code.=<<___; + vpaddd $a_,$a,$a + vpaddd $b_,$b,$b + vpaddd $c_,$c,$c + vpaddd $d_,$d,$d + + sub \$64,$len + jb .Ltail64_avx512 + + vpxor 0x00($inp),%x#$a,$t0 # xor with input + vpxor 0x10($inp),%x#$b,$t1 + vpxor 0x20($inp),%x#$c,$t2 + vpxor 0x30($inp),%x#$d,$t3 + lea 0x40($inp),$inp # inp+=64 + + vmovdqu $t0,0x00($out) # write output + vmovdqu $t1,0x10($out) + vmovdqu $t2,0x20($out) + vmovdqu $t3,0x30($out) + lea 0x40($out),$out # out+=64 + + jz .Ldone_avx512 + + vextracti32x4 \$1,$a,$t0 + vextracti32x4 \$1,$b,$t1 + vextracti32x4 \$1,$c,$t2 + vextracti32x4 \$1,$d,$t3 + + sub \$64,$len + jb .Ltail_avx512 + + vpxor 0x00($inp),$t0,$t0 # xor with input + vpxor 0x10($inp),$t1,$t1 + vpxor 0x20($inp),$t2,$t2 + vpxor 0x30($inp),$t3,$t3 + lea 0x40($inp),$inp # inp+=64 + + vmovdqu $t0,0x00($out) # write output + vmovdqu $t1,0x10($out) + vmovdqu $t2,0x20($out) + vmovdqu $t3,0x30($out) + lea 0x40($out),$out # out+=64 + + jz .Ldone_avx512 + + vextracti32x4 \$2,$a,$t0 + vextracti32x4 \$2,$b,$t1 + vextracti32x4 \$2,$c,$t2 + vextracti32x4 \$2,$d,$t3 + + sub \$64,$len + jb .Ltail_avx512 + + vpxor 0x00($inp),$t0,$t0 # xor with input + vpxor 0x10($inp),$t1,$t1 + vpxor 0x20($inp),$t2,$t2 + vpxor 0x30($inp),$t3,$t3 + lea 0x40($inp),$inp # inp+=64 + + vmovdqu $t0,0x00($out) # write output + vmovdqu $t1,0x10($out) + vmovdqu $t2,0x20($out) + vmovdqu $t3,0x30($out) + lea 0x40($out),$out # out+=64 + + jz .Ldone_avx512 + + vextracti32x4 \$3,$a,$t0 + vextracti32x4 \$3,$b,$t1 + vextracti32x4 \$3,$c,$t2 + vextracti32x4 \$3,$d,$t3 + + sub \$64,$len + jb .Ltail_avx512 + + vpxor 0x00($inp),$t0,$t0 # xor with input + vpxor 0x10($inp),$t1,$t1 + vpxor 0x20($inp),$t2,$t2 + vpxor 0x30($inp),$t3,$t3 + lea 0x40($inp),$inp # inp+=64 + + vmovdqu $t0,0x00($out) # write output + vmovdqu $t1,0x10($out) + vmovdqu $t2,0x20($out) + vmovdqu $t3,0x30($out) + lea 0x40($out),$out # out+=64 + + jnz .Loop_outer_avx512 + + jmp .Ldone_avx512 + +.align 16 
+.Ltail64_avx512: + vmovdqa %x#$a,0x00(%rsp) + vmovdqa %x#$b,0x10(%rsp) + vmovdqa %x#$c,0x20(%rsp) + vmovdqa %x#$d,0x30(%rsp) + add \$64,$len + jmp .Loop_tail_avx512 + +.align 16 +.Ltail_avx512: + vmovdqa $t0,0x00(%rsp) + vmovdqa $t1,0x10(%rsp) + vmovdqa $t2,0x20(%rsp) + vmovdqa $t3,0x30(%rsp) + add \$64,$len + +.Loop_tail_avx512: + movzb ($inp,$counter),%eax + movzb (%rsp,$counter),%ecx + lea 1($counter),$counter + xor %ecx,%eax + mov %al,-1($out,$counter) + dec $len + jnz .Loop_tail_avx512 + + vmovdqa32 $a_,0x00(%rsp) + +.Ldone_avx512: + vzeroall +___ +$code.=<<___ if ($win64); + movaps -0x28(%r9),%xmm6 + movaps -0x18(%r9),%xmm7 +___ +$code.=<<___; + lea (%r9),%rsp +.Lavx512_epilogue: + ret +.size ChaCha20_avx512,.-ChaCha20_avx512 +___ +} +if ($avx>2) { +# This one handles longer inputs... + +my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15)); +my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); +my @key=map("%zmm$_",(16..31)); +my ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; + +sub AVX512_lane_ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my @x=map("\"$_\"",@xx); + + ( + "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 + "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 + "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 + "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 + "&vpxord (@x[$d0],@x[$d0],@x[$a0])", + "&vpxord (@x[$d1],@x[$d1],@x[$a1])", + "&vpxord (@x[$d2],@x[$d2],@x[$a2])", + "&vpxord (@x[$d3],@x[$d3],@x[$a3])", + "&vprold (@x[$d0],@x[$d0],16)", + "&vprold (@x[$d1],@x[$d1],16)", + "&vprold (@x[$d2],@x[$d2],16)", + "&vprold (@x[$d3],@x[$d3],16)", + + "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", + "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", + "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", + "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", + "&vpxord (@x[$b0],@x[$b0],@x[$c0])", + "&vpxord (@x[$b1],@x[$b1],@x[$c1])", + "&vpxord (@x[$b2],@x[$b2],@x[$c2])", + "&vpxord (@x[$b3],@x[$b3],@x[$c3])", + "&vprold (@x[$b0],@x[$b0],12)", + "&vprold (@x[$b1],@x[$b1],12)", + "&vprold (@x[$b2],@x[$b2],12)", + "&vprold (@x[$b3],@x[$b3],12)", + + "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", + "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", + "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", + "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", + "&vpxord (@x[$d0],@x[$d0],@x[$a0])", + "&vpxord (@x[$d1],@x[$d1],@x[$a1])", + "&vpxord (@x[$d2],@x[$d2],@x[$a2])", + "&vpxord (@x[$d3],@x[$d3],@x[$a3])", + "&vprold (@x[$d0],@x[$d0],8)", + "&vprold (@x[$d1],@x[$d1],8)", + "&vprold (@x[$d2],@x[$d2],8)", + "&vprold (@x[$d3],@x[$d3],8)", + + "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", + "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", + "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", + "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", + "&vpxord (@x[$b0],@x[$b0],@x[$c0])", + "&vpxord (@x[$b1],@x[$b1],@x[$c1])", + "&vpxord (@x[$b2],@x[$b2],@x[$c2])", + "&vpxord (@x[$b3],@x[$b3],@x[$c3])", + "&vprold (@x[$b0],@x[$b0],7)", + "&vprold (@x[$b1],@x[$b1],7)", + "&vprold (@x[$b2],@x[$b2],7)", + "&vprold (@x[$b3],@x[$b3],7)" + ); +} + +my $xframe = $win64 ? 
0xa8 : 8; + +$code.=<<___; +.type ChaCha20_16x,\@function,5 +.align 32 +ChaCha20_16x: +.LChaCha20_16x: + mov %rsp,%r9 # frame register + sub \$64+$xframe,%rsp + and \$-64,%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L16x_body: +___ +$code.=<<___; + vzeroupper + + lea .Lsigma(%rip),%r10 + vbroadcasti32x4 (%r10),$xa3 # key[0] + vbroadcasti32x4 ($key),$xb3 # key[1] + vbroadcasti32x4 16($key),$xc3 # key[2] + vbroadcasti32x4 ($counter),$xd3 # key[3] + + vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... + vpshufd \$0x55,$xa3,$xa1 + vpshufd \$0xaa,$xa3,$xa2 + vpshufd \$0xff,$xa3,$xa3 + vmovdqa64 $xa0,@key[0] + vmovdqa64 $xa1,@key[1] + vmovdqa64 $xa2,@key[2] + vmovdqa64 $xa3,@key[3] + + vpshufd \$0x00,$xb3,$xb0 + vpshufd \$0x55,$xb3,$xb1 + vpshufd \$0xaa,$xb3,$xb2 + vpshufd \$0xff,$xb3,$xb3 + vmovdqa64 $xb0,@key[4] + vmovdqa64 $xb1,@key[5] + vmovdqa64 $xb2,@key[6] + vmovdqa64 $xb3,@key[7] + + vpshufd \$0x00,$xc3,$xc0 + vpshufd \$0x55,$xc3,$xc1 + vpshufd \$0xaa,$xc3,$xc2 + vpshufd \$0xff,$xc3,$xc3 + vmovdqa64 $xc0,@key[8] + vmovdqa64 $xc1,@key[9] + vmovdqa64 $xc2,@key[10] + vmovdqa64 $xc3,@key[11] + + vpshufd \$0x00,$xd3,$xd0 + vpshufd \$0x55,$xd3,$xd1 + vpshufd \$0xaa,$xd3,$xd2 + vpshufd \$0xff,$xd3,$xd3 + vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet + vmovdqa64 $xd0,@key[12] + vmovdqa64 $xd1,@key[13] + vmovdqa64 $xd2,@key[14] + vmovdqa64 $xd3,@key[15] + + mov \$10,%eax + jmp .Loop16x + +.align 32 +.Loop_outer16x: + vpbroadcastd 0(%r10),$xa0 # reload key + vpbroadcastd 4(%r10),$xa1 + vpbroadcastd 8(%r10),$xa2 + vpbroadcastd 12(%r10),$xa3 + vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters + vmovdqa64 @key[4],$xb0 + vmovdqa64 @key[5],$xb1 + vmovdqa64 @key[6],$xb2 + vmovdqa64 @key[7],$xb3 + vmovdqa64 @key[8],$xc0 + vmovdqa64 @key[9],$xc1 + vmovdqa64 @key[10],$xc2 + vmovdqa64 @key[11],$xc3 + vmovdqa64 @key[12],$xd0 + vmovdqa64 @key[13],$xd1 + vmovdqa64 @key[14],$xd2 + vmovdqa64 @key[15],$xd3 + + vmovdqa64 $xa0,@key[0] + vmovdqa64 $xa1,@key[1] + vmovdqa64 $xa2,@key[2] + vmovdqa64 $xa3,@key[3] + + mov \$10,%eax + jmp .Loop16x + +.align 32 +.Loop16x: +___ + foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } + foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + dec %eax + jnz .Loop16x + + vpaddd @key[0],$xa0,$xa0 # accumulate key + vpaddd @key[1],$xa1,$xa1 + vpaddd @key[2],$xa2,$xa2 + vpaddd @key[3],$xa3,$xa3 + + vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data + vpunpckldq $xa3,$xa2,$xt3 + vpunpckhdq $xa1,$xa0,$xa0 + vpunpckhdq $xa3,$xa2,$xa2 + vpunpcklqdq $xt3,$xt2,$xa1 # "a0" + vpunpckhqdq $xt3,$xt2,$xt2 # "a1" + vpunpcklqdq $xa2,$xa0,$xa3 # "a2" + vpunpckhqdq $xa2,$xa0,$xa0 # "a3" +___ + ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); +$code.=<<___; + vpaddd @key[4],$xb0,$xb0 + vpaddd @key[5],$xb1,$xb1 + vpaddd @key[6],$xb2,$xb2 + vpaddd @key[7],$xb3,$xb3 + + vpunpckldq $xb1,$xb0,$xt2 + vpunpckldq $xb3,$xb2,$xt3 + vpunpckhdq $xb1,$xb0,$xb0 + vpunpckhdq $xb3,$xb2,$xb2 + vpunpcklqdq $xt3,$xt2,$xb1 # "b0" + vpunpckhqdq $xt3,$xt2,$xt2 # "b1" + vpunpcklqdq $xb2,$xb0,$xb3 # "b2" + vpunpckhqdq $xb2,$xb0,$xb0 # "b3" +___ + ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); +$code.=<<___; + vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further + vshufi32x4 \$0xee,$xb0,$xa0,$xb0 + vshufi32x4 
\$0x44,$xb1,$xa1,$xa0 + vshufi32x4 \$0xee,$xb1,$xa1,$xb1 + vshufi32x4 \$0x44,$xb2,$xa2,$xa1 + vshufi32x4 \$0xee,$xb2,$xa2,$xb2 + vshufi32x4 \$0x44,$xb3,$xa3,$xa2 + vshufi32x4 \$0xee,$xb3,$xa3,$xb3 +___ + ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); +$code.=<<___; + vpaddd @key[8],$xc0,$xc0 + vpaddd @key[9],$xc1,$xc1 + vpaddd @key[10],$xc2,$xc2 + vpaddd @key[11],$xc3,$xc3 + + vpunpckldq $xc1,$xc0,$xt2 + vpunpckldq $xc3,$xc2,$xt3 + vpunpckhdq $xc1,$xc0,$xc0 + vpunpckhdq $xc3,$xc2,$xc2 + vpunpcklqdq $xt3,$xt2,$xc1 # "c0" + vpunpckhqdq $xt3,$xt2,$xt2 # "c1" + vpunpcklqdq $xc2,$xc0,$xc3 # "c2" + vpunpckhqdq $xc2,$xc0,$xc0 # "c3" +___ + ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); +$code.=<<___; + vpaddd @key[12],$xd0,$xd0 + vpaddd @key[13],$xd1,$xd1 + vpaddd @key[14],$xd2,$xd2 + vpaddd @key[15],$xd3,$xd3 + + vpunpckldq $xd1,$xd0,$xt2 + vpunpckldq $xd3,$xd2,$xt3 + vpunpckhdq $xd1,$xd0,$xd0 + vpunpckhdq $xd3,$xd2,$xd2 + vpunpcklqdq $xt3,$xt2,$xd1 # "d0" + vpunpckhqdq $xt3,$xt2,$xt2 # "d1" + vpunpcklqdq $xd2,$xd0,$xd3 # "d2" + vpunpckhqdq $xd2,$xd0,$xd0 # "d3" +___ + ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); +$code.=<<___; + vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further + vshufi32x4 \$0xee,$xd0,$xc0,$xd0 + vshufi32x4 \$0x44,$xd1,$xc1,$xc0 + vshufi32x4 \$0xee,$xd1,$xc1,$xd1 + vshufi32x4 \$0x44,$xd2,$xc2,$xc1 + vshufi32x4 \$0xee,$xd2,$xc2,$xd2 + vshufi32x4 \$0x44,$xd3,$xc3,$xc2 + vshufi32x4 \$0xee,$xd3,$xc3,$xd3 +___ + ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); +$code.=<<___; + vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further + vshufi32x4 \$0xdd,$xc0,$xa0,$xa0 + vshufi32x4 \$0x88,$xd0,$xb0,$xc0 + vshufi32x4 \$0xdd,$xd0,$xb0,$xd0 + vshufi32x4 \$0x88,$xc1,$xa1,$xt1 + vshufi32x4 \$0xdd,$xc1,$xa1,$xa1 + vshufi32x4 \$0x88,$xd1,$xb1,$xc1 + vshufi32x4 \$0xdd,$xd1,$xb1,$xd1 + vshufi32x4 \$0x88,$xc2,$xa2,$xt2 + vshufi32x4 \$0xdd,$xc2,$xa2,$xa2 + vshufi32x4 \$0x88,$xd2,$xb2,$xc2 + vshufi32x4 \$0xdd,$xd2,$xb2,$xd2 + vshufi32x4 \$0x88,$xc3,$xa3,$xt3 + vshufi32x4 \$0xdd,$xc3,$xa3,$xa3 + vshufi32x4 \$0x88,$xd3,$xb3,$xc3 + vshufi32x4 \$0xdd,$xd3,$xb3,$xd3 +___ + ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)= + ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3); + + ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1, + $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) = + ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); +$code.=<<___; + cmp \$64*16,$len + jb .Ltail16x + + vpxord 0x00($inp),$xa0,$xa0 # xor with input + vpxord 0x40($inp),$xb0,$xb0 + vpxord 0x80($inp),$xc0,$xc0 + vpxord 0xc0($inp),$xd0,$xd0 + vmovdqu32 $xa0,0x00($out) + vmovdqu32 $xb0,0x40($out) + vmovdqu32 $xc0,0x80($out) + vmovdqu32 $xd0,0xc0($out) + + vpxord 0x100($inp),$xa1,$xa1 + vpxord 0x140($inp),$xb1,$xb1 + vpxord 0x180($inp),$xc1,$xc1 + vpxord 0x1c0($inp),$xd1,$xd1 + vmovdqu32 $xa1,0x100($out) + vmovdqu32 $xb1,0x140($out) + vmovdqu32 $xc1,0x180($out) + vmovdqu32 $xd1,0x1c0($out) + + vpxord 0x200($inp),$xa2,$xa2 + vpxord 0x240($inp),$xb2,$xb2 + vpxord 0x280($inp),$xc2,$xc2 + vpxord 0x2c0($inp),$xd2,$xd2 + vmovdqu32 $xa2,0x200($out) + vmovdqu32 $xb2,0x240($out) + vmovdqu32 $xc2,0x280($out) + vmovdqu32 $xd2,0x2c0($out) + + vpxord 0x300($inp),$xa3,$xa3 + vpxord 0x340($inp),$xb3,$xb3 + vpxord 0x380($inp),$xc3,$xc3 + vpxord 0x3c0($inp),$xd3,$xd3 + lea 0x400($inp),$inp + vmovdqu32 $xa3,0x300($out) + vmovdqu32 $xb3,0x340($out) + vmovdqu32 $xc3,0x380($out) + vmovdqu32 $xd3,0x3c0($out) + lea 0x400($out),$out + + sub \$64*16,$len + jnz .Loop_outer16x + + jmp .Ldone16x + +.align 
32 +.Ltail16x: + xor %r10,%r10 + sub $inp,$out + cmp \$64*1,$len + jb .Less_than_64_16x + vpxord ($inp),$xa0,$xa0 # xor with input + vmovdqu32 $xa0,($out,$inp) + je .Ldone16x + vmovdqa32 $xb0,$xa0 + lea 64($inp),$inp + + cmp \$64*2,$len + jb .Less_than_64_16x + vpxord ($inp),$xb0,$xb0 + vmovdqu32 $xb0,($out,$inp) + je .Ldone16x + vmovdqa32 $xc0,$xa0 + lea 64($inp),$inp + + cmp \$64*3,$len + jb .Less_than_64_16x + vpxord ($inp),$xc0,$xc0 + vmovdqu32 $xc0,($out,$inp) + je .Ldone16x + vmovdqa32 $xd0,$xa0 + lea 64($inp),$inp + + cmp \$64*4,$len + jb .Less_than_64_16x + vpxord ($inp),$xd0,$xd0 + vmovdqu32 $xd0,($out,$inp) + je .Ldone16x + vmovdqa32 $xa1,$xa0 + lea 64($inp),$inp + + cmp \$64*5,$len + jb .Less_than_64_16x + vpxord ($inp),$xa1,$xa1 + vmovdqu32 $xa1,($out,$inp) + je .Ldone16x + vmovdqa32 $xb1,$xa0 + lea 64($inp),$inp + + cmp \$64*6,$len + jb .Less_than_64_16x + vpxord ($inp),$xb1,$xb1 + vmovdqu32 $xb1,($out,$inp) + je .Ldone16x + vmovdqa32 $xc1,$xa0 + lea 64($inp),$inp + + cmp \$64*7,$len + jb .Less_than_64_16x + vpxord ($inp),$xc1,$xc1 + vmovdqu32 $xc1,($out,$inp) + je .Ldone16x + vmovdqa32 $xd1,$xa0 + lea 64($inp),$inp + + cmp \$64*8,$len + jb .Less_than_64_16x + vpxord ($inp),$xd1,$xd1 + vmovdqu32 $xd1,($out,$inp) + je .Ldone16x + vmovdqa32 $xa2,$xa0 + lea 64($inp),$inp + + cmp \$64*9,$len + jb .Less_than_64_16x + vpxord ($inp),$xa2,$xa2 + vmovdqu32 $xa2,($out,$inp) + je .Ldone16x + vmovdqa32 $xb2,$xa0 + lea 64($inp),$inp + + cmp \$64*10,$len + jb .Less_than_64_16x + vpxord ($inp),$xb2,$xb2 + vmovdqu32 $xb2,($out,$inp) + je .Ldone16x + vmovdqa32 $xc2,$xa0 + lea 64($inp),$inp + + cmp \$64*11,$len + jb .Less_than_64_16x + vpxord ($inp),$xc2,$xc2 + vmovdqu32 $xc2,($out,$inp) + je .Ldone16x + vmovdqa32 $xd2,$xa0 + lea 64($inp),$inp + + cmp \$64*12,$len + jb .Less_than_64_16x + vpxord ($inp),$xd2,$xd2 + vmovdqu32 $xd2,($out,$inp) + je .Ldone16x + vmovdqa32 $xa3,$xa0 + lea 64($inp),$inp + + cmp \$64*13,$len + jb .Less_than_64_16x + vpxord ($inp),$xa3,$xa3 + vmovdqu32 $xa3,($out,$inp) + je .Ldone16x + vmovdqa32 $xb3,$xa0 + lea 64($inp),$inp + + cmp \$64*14,$len + jb .Less_than_64_16x + vpxord ($inp),$xb3,$xb3 + vmovdqu32 $xb3,($out,$inp) + je .Ldone16x + vmovdqa32 $xc3,$xa0 + lea 64($inp),$inp + + cmp \$64*15,$len + jb .Less_than_64_16x + vpxord ($inp),$xc3,$xc3 + vmovdqu32 $xc3,($out,$inp) + je .Ldone16x + vmovdqa32 $xd3,$xa0 + lea 64($inp),$inp + +.Less_than_64_16x: + vmovdqa32 $xa0,0x00(%rsp) + lea ($out,$inp),$out + and \$63,$len + +.Loop_tail16x: + movzb ($inp,%r10),%eax + movzb (%rsp,%r10),%ecx + lea 1(%r10),%r10 + xor %ecx,%eax + mov %al,-1($out,%r10) + dec $len + jnz .Loop_tail16x + + vpxord $xa0,$xa0,$xa0 + vmovdqa32 $xa0,0(%rsp) + +.Ldone16x: + vzeroall +___ +$code.=<<___ if ($win64); + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 +___ +$code.=<<___; + lea (%r9),%rsp +.L16x_epilogue: + ret +.size ChaCha20_16x,.-ChaCha20_16x +___ +} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 
120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + lea .Lctr32_body(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + lea .Lno_data(%rip),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lcommon_seh_tail + + lea 64+24+48(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R14 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.type ssse3_handler,\@abi-omnipotent +.align 16 +ssse3_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail + + mov 192($context),%rax # pull context->R9 + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea -0x28(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$4,%ecx + .long 0xa548f3fc # cld; rep movsq + + jmp .Lcommon_seh_tail +.size ssse3_handler,.-ssse3_handler + +.type full_handler,\@abi-omnipotent +.align 16 +full_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail + + mov 192($context),%rax # pull context->R9 + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea -0xa8(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + 
.long 0xa548f3fc # cld; rep movsq + + jmp .Lcommon_seh_tail +.size full_handler,.-full_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_ChaCha20_ctr32 + .rva .LSEH_end_ChaCha20_ctr32 + .rva .LSEH_info_ChaCha20_ctr32 + + .rva .LSEH_begin_ChaCha20_ssse3 + .rva .LSEH_end_ChaCha20_ssse3 + .rva .LSEH_info_ChaCha20_ssse3 + + .rva .LSEH_begin_ChaCha20_4x + .rva .LSEH_end_ChaCha20_4x + .rva .LSEH_info_ChaCha20_4x +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_ChaCha20_8x + .rva .LSEH_end_ChaCha20_8x + .rva .LSEH_info_ChaCha20_8x +___ +$code.=<<___ if ($avx>2); + .rva .LSEH_begin_ChaCha20_avx512 + .rva .LSEH_end_ChaCha20_avx512 + .rva .LSEH_info_ChaCha20_avx512 + + .rva .LSEH_begin_ChaCha20_16x + .rva .LSEH_end_ChaCha20_16x + .rva .LSEH_info_ChaCha20_16x +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_ChaCha20_ctr32: + .byte 9,0,0,0 + .rva se_handler + +.LSEH_info_ChaCha20_ssse3: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lssse3_body,.Lssse3_epilogue + +.LSEH_info_ChaCha20_4x: + .byte 9,0,0,0 + .rva full_handler + .rva .L4x_body,.L4x_epilogue +___ +$code.=<<___ if ($avx>1); +.LSEH_info_ChaCha20_8x: + .byte 9,0,0,0 + .rva full_handler + .rva .L8x_body,.L8x_epilogue # HandlerData[] +___ +$code.=<<___ if ($avx>2); +.LSEH_info_ChaCha20_avx512: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[] + +.LSEH_info_ChaCha20_16x: + .byte 9,0,0,0 + .rva full_handler + .rva .L16x_body,.L16x_epilogue # HandlerData[] +___ +} + foreach (split("\n",$code)) { - s/\`([^\`]*)\`/eval $1/geo; + s/\`([^\`]*)\`/eval $1/ge; - s/%x#%y/%x/go; + s/%x#%[yz]/%x/g; # "down-shift" print $_,"\n"; } diff --git a/src/crypto/chacha/chacha_test.cc b/src/crypto/chacha/chacha_test.cc index 6bfb03eb..a40653fa 100644 --- a/src/crypto/chacha/chacha_test.cc +++ b/src/crypto/chacha/chacha_test.cc @@ -18,10 +18,13 @@ #include <memory> +#include <gtest/gtest.h> + #include <openssl/crypto.h> #include <openssl/chacha.h> #include "../internal.h" +#include "../test/test_util.h" static const uint8_t kKey[32] = { @@ -216,35 +219,18 @@ static const uint8_t kOutput[] = { static_assert(sizeof(kInput) == sizeof(kOutput), "Input and output lengths don't match."); -static bool TestChaCha20(size_t len) { - std::unique_ptr<uint8_t[]> buf(new uint8_t[len]); - CRYPTO_chacha_20(buf.get(), kInput, len, kKey, kNonce, kCounter); - if (OPENSSL_memcmp(buf.get(), kOutput, len) != 0) { - fprintf(stderr, "Mismatch at length %zu.\n", len); - return false; - } - - // Test in-place. - OPENSSL_memcpy(buf.get(), kInput, len); - CRYPTO_chacha_20(buf.get(), buf.get(), len, kKey, kNonce, kCounter); - if (OPENSSL_memcmp(buf.get(), kOutput, len) != 0) { - fprintf(stderr, "Mismatch at length %zu, in-place.\n", len); - return false; - } - - return true; -} - -int main(int argc, char **argv) { - CRYPTO_library_init(); - +TEST(ChaChaTest, TestVector) { // Run the test with the test vector at all lengths. for (size_t len = 0; len <= sizeof(kInput); len++) { - if (!TestChaCha20(len)) { - return 1; - } - } + SCOPED_TRACE(len); - printf("PASS\n"); - return 0; + std::unique_ptr<uint8_t[]> buf(new uint8_t[len]); + CRYPTO_chacha_20(buf.get(), kInput, len, kKey, kNonce, kCounter); + EXPECT_EQ(Bytes(kOutput, len), Bytes(buf.get(), len)); + + // Test the in-place version. 
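  // CRYPTO_chacha_20's contract allows |out| and |in| to be exactly
  // equal (though not to overlap partially), so encrypting the buffer
  // over itself below must reproduce the out-of-place result already
  // checked against kOutput. The SCOPED_TRACE(len) above replaces the
  // old fprintf-based "Mismatch at length %zu" reporting: any EXPECT
  // failure inside the loop is annotated with the offending length
  // automatically.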
+ OPENSSL_memcpy(buf.get(), kInput, len); + CRYPTO_chacha_20(buf.get(), buf.get(), len, kKey, kNonce, kCounter); + EXPECT_EQ(Bytes(kOutput, len), Bytes(buf.get(), len)); + } } diff --git a/src/crypto/cipher/cipher.c b/src/crypto/cipher/cipher.c index ae045aef..e46e43ef 100644 --- a/src/crypto/cipher/cipher.c +++ b/src/crypto/cipher/cipher.c @@ -132,6 +132,7 @@ int EVP_CIPHER_CTX_copy(EVP_CIPHER_CTX *out, const EVP_CIPHER_CTX *in) { if (in->cipher_data && in->cipher->ctx_size) { out->cipher_data = OPENSSL_malloc(in->cipher->ctx_size); if (!out->cipher_data) { + out->cipher = NULL; OPENSSL_PUT_ERROR(CIPHER, ERR_R_MALLOC_FAILURE); return 0; } @@ -139,7 +140,10 @@ int EVP_CIPHER_CTX_copy(EVP_CIPHER_CTX *out, const EVP_CIPHER_CTX *in) { } if (in->cipher->flags & EVP_CIPH_CUSTOM_COPY) { - return in->cipher->ctrl((EVP_CIPHER_CTX *)in, EVP_CTRL_COPY, 0, out); + if (!in->cipher->ctrl((EVP_CIPHER_CTX *)in, EVP_CTRL_COPY, 0, out)) { + out->cipher = NULL; + return 0; + } } return 1; diff --git a/src/crypto/ec/CMakeLists.txt b/src/crypto/ec/CMakeLists.txt index a54075c3..75dccec8 100644 --- a/src/crypto/ec/CMakeLists.txt +++ b/src/crypto/ec/CMakeLists.txt @@ -39,14 +39,6 @@ add_executable( ) add_executable( - ec_test - - ec_test.cc - - $<TARGET_OBJECTS:test_support> -) - -add_executable( p256-x86_64_test p256-x86_64_test.cc @@ -55,6 +47,5 @@ add_executable( ) target_link_libraries(example_mul crypto) -target_link_libraries(ec_test crypto) target_link_libraries(p256-x86_64_test crypto) -add_dependencies(all_tests example_mul ec_test p256-x86_64_test) +add_dependencies(all_tests example_mul p256-x86_64_test) diff --git a/src/crypto/ec/asm/p256-x86_64-asm.pl b/src/crypto/ec/asm/p256-x86_64-asm.pl index 3cd7b01f..517c506d 100755 --- a/src/crypto/ec/asm/p256-x86_64-asm.pl +++ b/src/crypto/ec/asm/p256-x86_64-asm.pl @@ -289,7 +289,7 @@ __ecp_nistz256_mul_montq: adc \$0, $acc0 ######################################################################## - # Second reduction step + # Second reduction step mov $acc1, $t1 shl \$32, $acc1 mulq $poly3 @@ -336,7 +336,7 @@ __ecp_nistz256_mul_montq: adc \$0, $acc1 ######################################################################## - # Third reduction step + # Third reduction step mov $acc2, $t1 shl \$32, $acc2 mulq $poly3 @@ -383,7 +383,7 @@ __ecp_nistz256_mul_montq: adc \$0, $acc2 ######################################################################## - # Final reduction step + # Final reduction step mov $acc3, $t1 shl \$32, $acc3 mulq $poly3 @@ -396,7 +396,7 @@ __ecp_nistz256_mul_montq: mov $acc5, $t1 adc \$0, $acc2 - ######################################################################## + ######################################################################## # Branch-less conditional subtraction of P sub \$-1, $acc4 # .Lpoly[0] mov $acc0, $t2 @@ -1649,7 +1649,7 @@ $code.=<<___; movq %xmm1, $r_ptr call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); ___ -{ +{ ######## ecp_nistz256_div_by_2(res_y, res_y); ########################## # operate in 4-5-6-7 "name space" that matches squaring output # @@ -1738,7 +1738,7 @@ $code.=<<___; lea $M(%rsp), $b_ptr mov $acc4, $acc6 # harmonize sub output and mul input xor %ecx, %ecx - mov $acc4, $S+8*0(%rsp) # have to save:-( + mov $acc4, $S+8*0(%rsp) # have to save:-( mov $acc5, $acc2 mov $acc5, $S+8*1(%rsp) cmovz $acc0, $acc3 diff --git a/src/crypto/ec/ec_test.cc b/src/crypto/ec/ec_test.cc index 31619b1e..02b9ef20 100644 --- a/src/crypto/ec/ec_test.cc +++ b/src/crypto/ec/ec_test.cc @@ -17,6 +17,8 @@ #include <vector> 
+#include <gtest/gtest.h> + #include <openssl/bn.h> #include <openssl/bytestring.h> #include <openssl/crypto.h> @@ -24,6 +26,9 @@ #include <openssl/err.h> #include <openssl/mem.h> #include <openssl/nid.h> +#include <openssl/obj.h> + +#include "../test/test_util.h" // kECKeyWithoutPublic is an ECPrivateKey with the optional publicKey field @@ -123,201 +128,75 @@ static bool EncodeECPrivateKey(std::vector<uint8_t> *out, const EC_KEY *key) { return true; } -static bool Testd2i_ECPrivateKey() { - bssl::UniquePtr<EC_KEY> key = DecodeECPrivateKey(kECKeyWithoutPublic, - sizeof(kECKeyWithoutPublic)); - if (!key) { - fprintf(stderr, "Failed to parse private key.\n"); - ERR_print_errors_fp(stderr); - return false; - } +TEST(ECTest, Encoding) { + bssl::UniquePtr<EC_KEY> key = + DecodeECPrivateKey(kECKeyWithoutPublic, sizeof(kECKeyWithoutPublic)); + ASSERT_TRUE(key); + // Test that the encoding round-trips. std::vector<uint8_t> out; - if (!EncodeECPrivateKey(&out, key.get())) { - fprintf(stderr, "Failed to serialize private key.\n"); - ERR_print_errors_fp(stderr); - return false; - } - - if (std::vector<uint8_t>(kECKeyWithoutPublic, - kECKeyWithoutPublic + sizeof(kECKeyWithoutPublic)) != - out) { - fprintf(stderr, "Serialisation of key doesn't match original.\n"); - return false; - } + ASSERT_TRUE(EncodeECPrivateKey(&out, key.get())); + EXPECT_EQ(Bytes(kECKeyWithoutPublic), Bytes(out.data(), out.size())); const EC_POINT *pub_key = EC_KEY_get0_public_key(key.get()); - if (pub_key == NULL) { - fprintf(stderr, "Public key missing.\n"); - return false; - } + ASSERT_TRUE(pub_key) << "Public key missing"; bssl::UniquePtr<BIGNUM> x(BN_new()); bssl::UniquePtr<BIGNUM> y(BN_new()); - if (!x || !y) { - return false; - } - if (!EC_POINT_get_affine_coordinates_GFp(EC_KEY_get0_group(key.get()), - pub_key, x.get(), y.get(), NULL)) { - fprintf(stderr, "Failed to get public key in affine coordinates.\n"); - return false; - } + ASSERT_TRUE(x); + ASSERT_TRUE(y); + ASSERT_TRUE(EC_POINT_get_affine_coordinates_GFp( + EC_KEY_get0_group(key.get()), pub_key, x.get(), y.get(), NULL)); bssl::UniquePtr<char> x_hex(BN_bn2hex(x.get())); bssl::UniquePtr<char> y_hex(BN_bn2hex(y.get())); - if (!x_hex || !y_hex) { - return false; - } - if (0 != strcmp( - x_hex.get(), - "c81561ecf2e54edefe6617db1c7a34a70744ddb261f269b83dacfcd2ade5a681") || - 0 != strcmp( - y_hex.get(), - "e0e2afa3f9b6abe4c698ef6495f1be49a3196c5056acb3763fe4507eec596e88")) { - fprintf(stderr, "Incorrect public key: %s %s\n", x_hex.get(), y_hex.get()); - return false; - } - - return true; + ASSERT_TRUE(x_hex); + ASSERT_TRUE(y_hex); + + EXPECT_STREQ( + "c81561ecf2e54edefe6617db1c7a34a70744ddb261f269b83dacfcd2ade5a681", + x_hex.get()); + EXPECT_STREQ( + "e0e2afa3f9b6abe4c698ef6495f1be49a3196c5056acb3763fe4507eec596e88", + y_hex.get()); } -static bool TestZeroPadding() { +TEST(ECTest, ZeroPadding) { // Check that the correct encoding round-trips. 
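  // The decode/encode pass must reproduce the original DER byte for
  // byte: the ECPrivateKey's privateKey OCTET STRING is expected to be
  // fixed-width here, so the leading zero octets in kECKeyWithZeros have
  // to survive re-serialisation. Bytes (from ../test/test_util.h) wraps
  // a byte span with value equality and hex-dump printing, so
  //
  //   EXPECT_EQ(Bytes(kECKeyWithZeros), Bytes(out.data(), out.size()));
  //
  // shows both encodings on failure instead of the old one-line fprintf
  // breadcrumb.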
- bssl::UniquePtr<EC_KEY> key = DecodeECPrivateKey(kECKeyWithZeros, - sizeof(kECKeyWithZeros)); + bssl::UniquePtr<EC_KEY> key = + DecodeECPrivateKey(kECKeyWithZeros, sizeof(kECKeyWithZeros)); + ASSERT_TRUE(key); std::vector<uint8_t> out; - if (!key || !EncodeECPrivateKey(&out, key.get())) { - ERR_print_errors_fp(stderr); - return false; - } - - if (std::vector<uint8_t>(kECKeyWithZeros, - kECKeyWithZeros + sizeof(kECKeyWithZeros)) != out) { - fprintf(stderr, "Serialisation of key was incorrect.\n"); - return false; - } + EXPECT_TRUE(EncodeECPrivateKey(&out, key.get())); + EXPECT_EQ(Bytes(kECKeyWithZeros), Bytes(out.data(), out.size())); // Keys without leading zeros also parse, but they encode correctly. key = DecodeECPrivateKey(kECKeyMissingZeros, sizeof(kECKeyMissingZeros)); - if (!key || !EncodeECPrivateKey(&out, key.get())) { - ERR_print_errors_fp(stderr); - return false; - } - - if (std::vector<uint8_t>(kECKeyWithZeros, - kECKeyWithZeros + sizeof(kECKeyWithZeros)) != out) { - fprintf(stderr, "Serialisation of key was incorrect.\n"); - return false; - } - - return true; + ASSERT_TRUE(key); + EXPECT_TRUE(EncodeECPrivateKey(&out, key.get())); + EXPECT_EQ(Bytes(kECKeyWithZeros), Bytes(out.data(), out.size())); } -static bool TestSpecifiedCurve() { +TEST(ECTest, SpecifiedCurve) { // Test keys with specified curves may be decoded. bssl::UniquePtr<EC_KEY> key = DecodeECPrivateKey(kECKeySpecifiedCurve, sizeof(kECKeySpecifiedCurve)); - if (!key) { - ERR_print_errors_fp(stderr); - return false; - } + ASSERT_TRUE(key); // The group should have been interpreted as P-256. - if (EC_GROUP_get_curve_name(EC_KEY_get0_group(key.get())) != - NID_X9_62_prime256v1) { - fprintf(stderr, "Curve name incorrect.\n"); - return false; - } + EXPECT_EQ(NID_X9_62_prime256v1, + EC_GROUP_get_curve_name(EC_KEY_get0_group(key.get()))); // Encoding the key should still use named form. std::vector<uint8_t> out; - if (!EncodeECPrivateKey(&out, key.get())) { - ERR_print_errors_fp(stderr); - return false; - } - if (std::vector<uint8_t>(kECKeyWithoutPublic, - kECKeyWithoutPublic + sizeof(kECKeyWithoutPublic)) != - out) { - fprintf(stderr, "Serialisation of key was incorrect.\n"); - return false; - } - - return true; -} - -static bool TestSetAffine(const int nid) { - bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(nid)); - if (!key) { - return false; - } - - const EC_GROUP *const group = EC_KEY_get0_group(key.get()); - - if (!EC_KEY_generate_key(key.get())) { - fprintf(stderr, "EC_KEY_generate_key failed with nid %d\n", nid); - ERR_print_errors_fp(stderr); - return false; - } - - if (!EC_POINT_is_on_curve(group, EC_KEY_get0_public_key(key.get()), - nullptr)) { - fprintf(stderr, "generated point is not on curve with nid %d", nid); - ERR_print_errors_fp(stderr); - return false; - } - - bssl::UniquePtr<BIGNUM> x(BN_new()); - bssl::UniquePtr<BIGNUM> y(BN_new()); - if (!EC_POINT_get_affine_coordinates_GFp(group, - EC_KEY_get0_public_key(key.get()), - x.get(), y.get(), nullptr)) { - fprintf(stderr, "EC_POINT_get_affine_coordinates_GFp failed with nid %d\n", - nid); - ERR_print_errors_fp(stderr); - return false; - } - - auto point = bssl::UniquePtr<EC_POINT>(EC_POINT_new(group)); - if (!point) { - return false; - } - - if (!EC_POINT_set_affine_coordinates_GFp(group, point.get(), x.get(), y.get(), - nullptr)) { - fprintf(stderr, "EC_POINT_set_affine_coordinates_GFp failed with nid %d\n", - nid); - ERR_print_errors_fp(stderr); - return false; - } - - // Subtract one from |y| to make the point no longer on the curve. 
- if (!BN_sub(y.get(), y.get(), BN_value_one())) { - return false; - } - - bssl::UniquePtr<EC_POINT> invalid_point(EC_POINT_new(group)); - if (!invalid_point) { - return false; - } - - if (EC_POINT_set_affine_coordinates_GFp(group, invalid_point.get(), x.get(), - y.get(), nullptr)) { - fprintf(stderr, - "EC_POINT_set_affine_coordinates_GFp succeeded with invalid " - "coordinates with nid %d\n", - nid); - ERR_print_errors_fp(stderr); - return false; - } - - return true; + EXPECT_TRUE(EncodeECPrivateKey(&out, key.get())); + EXPECT_EQ(Bytes(kECKeyWithoutPublic), Bytes(out.data(), out.size())); } -static bool TestArbitraryCurve() { +TEST(ECTest, ArbitraryCurve) { // Make a P-256 key and extract the affine coordinates. bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(NID_X9_62_prime256v1)); - if (!key || !EC_KEY_generate_key(key.get())) { - return false; - } + ASSERT_TRUE(key); + ASSERT_TRUE(EC_KEY_generate_key(key.get())); // Make an arbitrary curve which is identical to P-256. static const uint8_t kP[] = { @@ -351,186 +230,161 @@ static bool TestArbitraryCurve() { 0x9e, 0x84, 0xf3, 0xb9, 0xca, 0xc2, 0xfc, 0x63, 0x25, 0x51, }; bssl::UniquePtr<BN_CTX> ctx(BN_CTX_new()); + ASSERT_TRUE(ctx); bssl::UniquePtr<BIGNUM> p(BN_bin2bn(kP, sizeof(kP), nullptr)); + ASSERT_TRUE(p); bssl::UniquePtr<BIGNUM> a(BN_bin2bn(kA, sizeof(kA), nullptr)); + ASSERT_TRUE(a); bssl::UniquePtr<BIGNUM> b(BN_bin2bn(kB, sizeof(kB), nullptr)); + ASSERT_TRUE(b); bssl::UniquePtr<BIGNUM> gx(BN_bin2bn(kX, sizeof(kX), nullptr)); + ASSERT_TRUE(gx); bssl::UniquePtr<BIGNUM> gy(BN_bin2bn(kY, sizeof(kY), nullptr)); + ASSERT_TRUE(gy); bssl::UniquePtr<BIGNUM> order(BN_bin2bn(kOrder, sizeof(kOrder), nullptr)); - bssl::UniquePtr<BIGNUM> cofactor(BN_new()); - if (!ctx || !p || !a || !b || !gx || !gy || !order || !cofactor || - !BN_set_word(cofactor.get(), 1)) { - return false; - } + ASSERT_TRUE(order); bssl::UniquePtr<EC_GROUP> group( EC_GROUP_new_curve_GFp(p.get(), a.get(), b.get(), ctx.get())); - if (!group) { - return false; - } + ASSERT_TRUE(group); bssl::UniquePtr<EC_POINT> generator(EC_POINT_new(group.get())); - if (!generator || - !EC_POINT_set_affine_coordinates_GFp(group.get(), generator.get(), - gx.get(), gy.get(), ctx.get()) || - !EC_GROUP_set_generator(group.get(), generator.get(), order.get(), - cofactor.get())) { - return false; - } + ASSERT_TRUE(generator); + ASSERT_TRUE(EC_POINT_set_affine_coordinates_GFp( + group.get(), generator.get(), gx.get(), gy.get(), ctx.get())); + ASSERT_TRUE(EC_GROUP_set_generator(group.get(), generator.get(), order.get(), + BN_value_one())); // |group| should not have a curve name. - if (EC_GROUP_get_curve_name(group.get()) != NID_undef) { - return false; - } + EXPECT_EQ(NID_undef, EC_GROUP_get_curve_name(group.get())); // Copy |key| to |key2| using |group|. 
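  // (EC_KEY_dup() would carry the original named group along, and the
  //  point of this test is to exercise |key2| against the hand-built,
  //  nameless |group|, so the private scalar is moved over with
  //  EC_KEY_set_private_key and the public point is rebuilt from its
  //  affine coordinates instead.)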
bssl::UniquePtr<EC_KEY> key2(EC_KEY_new()); + ASSERT_TRUE(key2); bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group.get())); + ASSERT_TRUE(point); bssl::UniquePtr<BIGNUM> x(BN_new()), y(BN_new()); - if (!key2 || !point || !x || !y || - !EC_KEY_set_group(key2.get(), group.get()) || - !EC_KEY_set_private_key(key2.get(), EC_KEY_get0_private_key(key.get())) || - !EC_POINT_get_affine_coordinates_GFp(EC_KEY_get0_group(key.get()), - EC_KEY_get0_public_key(key.get()), - x.get(), y.get(), nullptr) || - !EC_POINT_set_affine_coordinates_GFp(group.get(), point.get(), x.get(), - y.get(), nullptr) || - !EC_KEY_set_public_key(key2.get(), point.get())) { - fprintf(stderr, "Could not copy key.\n"); - return false; - } + ASSERT_TRUE(x); + ASSERT_TRUE(EC_KEY_set_group(key2.get(), group.get())); + ASSERT_TRUE( + EC_KEY_set_private_key(key2.get(), EC_KEY_get0_private_key(key.get()))); + ASSERT_TRUE(EC_POINT_get_affine_coordinates_GFp( + EC_KEY_get0_group(key.get()), EC_KEY_get0_public_key(key.get()), x.get(), + y.get(), nullptr)); + ASSERT_TRUE(EC_POINT_set_affine_coordinates_GFp(group.get(), point.get(), + x.get(), y.get(), nullptr)); + ASSERT_TRUE(EC_KEY_set_public_key(key2.get(), point.get())); // The key must be valid according to the new group too. - if (!EC_KEY_check_key(key2.get())) { - fprintf(stderr, "Copied key is not valid.\n"); - return false; - } - - return true; + EXPECT_TRUE(EC_KEY_check_key(key2.get())); } -static bool TestAddingEqualPoints(int nid) { - bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(nid)); - if (!key) { - return false; - } +class ECCurveTest : public testing::TestWithParam<EC_builtin_curve> {}; + +TEST_P(ECCurveTest, SetAffine) { + // Generate an EC_KEY. + bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(GetParam().nid)); + ASSERT_TRUE(key); + ASSERT_TRUE(EC_KEY_generate_key(key.get())); const EC_GROUP *const group = EC_KEY_get0_group(key.get()); + EXPECT_TRUE( + EC_POINT_is_on_curve(group, EC_KEY_get0_public_key(key.get()), nullptr)); - if (!EC_KEY_generate_key(key.get())) { - fprintf(stderr, "EC_KEY_generate_key failed with nid %d\n", nid); - ERR_print_errors_fp(stderr); - return false; - } + // Get the public key's coordinates. + bssl::UniquePtr<BIGNUM> x(BN_new()); + ASSERT_TRUE(x); + bssl::UniquePtr<BIGNUM> y(BN_new()); + ASSERT_TRUE(y); + EXPECT_TRUE(EC_POINT_get_affine_coordinates_GFp( + group, EC_KEY_get0_public_key(key.get()), x.get(), y.get(), nullptr)); + + // Points on the curve should be accepted. + auto point = bssl::UniquePtr<EC_POINT>(EC_POINT_new(group)); + ASSERT_TRUE(point); + EXPECT_TRUE(EC_POINT_set_affine_coordinates_GFp(group, point.get(), x.get(), + y.get(), nullptr)); + + // Subtract one from |y| to make the point no longer on the curve. + EXPECT_TRUE(BN_sub(y.get(), y.get(), BN_value_one())); + + // Points not on the curve should be rejected. 
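An aside on the harness change above: ECCurveTest uses GTest's value-parameterized tests, so each TEST_P body runs once per built-in curve supplied to INSTANTIATE_TEST_CASE_P (the instantiation appears later in this file). A minimal self-contained sketch of the pattern, with hypothetical int parameters rather than EC_builtin_curve (the off-curve rejection check resumes below):

#include <gtest/gtest.h>

// Hypothetical parameters; this change uses EC_get_builtin_curves() instead.
static const int kInputs[] = {1, 2, 3};

class ExampleTest : public testing::TestWithParam<int> {};

// Runs once per entry in |kInputs|; GetParam() returns the current one.
TEST_P(ExampleTest, IsPositive) {
  EXPECT_GT(GetParam(), 0);
}

INSTANTIATE_TEST_CASE_P(, ExampleTest, testing::ValuesIn(kInputs));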
+ bssl::UniquePtr<EC_POINT> invalid_point(EC_POINT_new(group)); + ASSERT_TRUE(invalid_point); + EXPECT_FALSE(EC_POINT_set_affine_coordinates_GFp(group, invalid_point.get(), + x.get(), y.get(), nullptr)); +} + +TEST_P(ECCurveTest, AddingEqualPoints) { + bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(GetParam().nid)); + ASSERT_TRUE(key); + ASSERT_TRUE(EC_KEY_generate_key(key.get())); + + const EC_GROUP *const group = EC_KEY_get0_group(key.get()); bssl::UniquePtr<EC_POINT> p1(EC_POINT_new(group)); - bssl::UniquePtr<EC_POINT> p2(EC_POINT_new(group)); - bssl::UniquePtr<EC_POINT> double_p1(EC_POINT_new(group)); - bssl::UniquePtr<EC_POINT> p1_plus_p2(EC_POINT_new(group)); - if (!p1 || !p2 || !double_p1 || !p1_plus_p2) { - return false; - } + ASSERT_TRUE(p1); + ASSERT_TRUE(EC_POINT_copy(p1.get(), EC_KEY_get0_public_key(key.get()))); - if (!EC_POINT_copy(p1.get(), EC_KEY_get0_public_key(key.get())) || - !EC_POINT_copy(p2.get(), EC_KEY_get0_public_key(key.get()))) { - fprintf(stderr, "EC_POINT_COPY failed with nid %d\n", nid); - ERR_print_errors_fp(stderr); - return false; - } + bssl::UniquePtr<EC_POINT> p2(EC_POINT_new(group)); + ASSERT_TRUE(p2); + ASSERT_TRUE(EC_POINT_copy(p2.get(), EC_KEY_get0_public_key(key.get()))); + bssl::UniquePtr<EC_POINT> double_p1(EC_POINT_new(group)); + ASSERT_TRUE(double_p1); bssl::UniquePtr<BN_CTX> ctx(BN_CTX_new()); - if (!ctx) { - return false; - } - - if (!EC_POINT_dbl(group, double_p1.get(), p1.get(), ctx.get()) || - !EC_POINT_add(group, p1_plus_p2.get(), p1.get(), p2.get(), ctx.get())) { - fprintf(stderr, "Point operation failed with nid %d\n", nid); - ERR_print_errors_fp(stderr); - return false; - } + ASSERT_TRUE(ctx); + ASSERT_TRUE(EC_POINT_dbl(group, double_p1.get(), p1.get(), ctx.get())); - if (EC_POINT_cmp(group, double_p1.get(), p1_plus_p2.get(), ctx.get()) != 0) { - fprintf(stderr, "A+A != 2A for nid %d", nid); - return false; - } + bssl::UniquePtr<EC_POINT> p1_plus_p2(EC_POINT_new(group)); + ASSERT_TRUE(p1_plus_p2); + ASSERT_TRUE( + EC_POINT_add(group, p1_plus_p2.get(), p1.get(), p2.get(), ctx.get())); - return true; + EXPECT_EQ(0, + EC_POINT_cmp(group, double_p1.get(), p1_plus_p2.get(), ctx.get())) + << "A+A != 2A"; } -static bool TestMulZero(int nid) { - bssl::UniquePtr<EC_GROUP> group(EC_GROUP_new_by_curve_name(nid)); - if (!group) { - return false; - } +TEST_P(ECCurveTest, MulZero) { + bssl::UniquePtr<EC_GROUP> group(EC_GROUP_new_by_curve_name(GetParam().nid)); + ASSERT_TRUE(group); bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group.get())); + ASSERT_TRUE(point); bssl::UniquePtr<BIGNUM> zero(BN_new()); - if (!point || !zero) { - return false; - } - + ASSERT_TRUE(zero); BN_zero(zero.get()); - if (!EC_POINT_mul(group.get(), point.get(), zero.get(), nullptr, nullptr, - nullptr)) { - return false; - } + ASSERT_TRUE(EC_POINT_mul(group.get(), point.get(), zero.get(), nullptr, + nullptr, nullptr)); - if (!EC_POINT_is_at_infinity(group.get(), point.get())) { - fprintf(stderr, "g * 0 did not return point at infinity.\n"); - return false; - } + EXPECT_TRUE(EC_POINT_is_at_infinity(group.get(), point.get())) + << "g * 0 did not return point at infinity."; // Test that zero times an arbitrary point is also infinity. The generator is // used as the arbitrary point. 
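Restating the first property above in standalone form (the arbitrary-point half of the test continues below). This is a sketch assuming BoringSSL's public EC API, with P-256 chosen arbitrarily:

#include <assert.h>
#include <openssl/bn.h>
#include <openssl/ec.h>
#include <openssl/nid.h>

static void CheckMulZero() {
  bssl::UniquePtr<EC_GROUP> group(
      EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1));
  bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group.get()));
  bssl::UniquePtr<BIGNUM> zero(BN_new());
  assert(group && point && zero);
  BN_zero(zero.get());
  // point <- 0 * generator, which must be the point at infinity.
  assert(EC_POINT_mul(group.get(), point.get(), zero.get(), nullptr, nullptr,
                      nullptr));
  assert(EC_POINT_is_at_infinity(group.get(), point.get()));
}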
bssl::UniquePtr<EC_POINT> generator(EC_POINT_new(group.get())); - bssl::UniquePtr<BIGNUM> one(BN_new()); - if (!generator || - !one || - !BN_one(one.get()) || - !EC_POINT_mul(group.get(), generator.get(), one.get(), nullptr, nullptr, - nullptr) || - !EC_POINT_mul(group.get(), point.get(), nullptr, generator.get(), - zero.get(), nullptr)) { - return false; - } - - if (!EC_POINT_is_at_infinity(group.get(), point.get())) { - fprintf(stderr, "p * 0 did not return point at infinity.\n"); - return false; - } - - return true; + ASSERT_TRUE(generator); + ASSERT_TRUE(EC_POINT_mul(group.get(), generator.get(), BN_value_one(), + nullptr, nullptr, nullptr)); + ASSERT_TRUE(EC_POINT_mul(group.get(), point.get(), nullptr, generator.get(), + zero.get(), nullptr)); + + EXPECT_TRUE(EC_POINT_is_at_infinity(group.get(), point.get())) + << "p * 0 did not return point at infinity."; } -static bool ForEachCurve(bool (*test_func)(int nid)) { +static std::vector<EC_builtin_curve> AllCurves() { const size_t num_curves = EC_get_builtin_curves(nullptr, 0); std::vector<EC_builtin_curve> curves(num_curves); EC_get_builtin_curves(curves.data(), num_curves); - - for (const auto& curve : curves) { - if (!test_func(curve.nid)) { - fprintf(stderr, "Test failed for %s\n", curve.comment); - return false; - } - } - - return true; + return curves; } -int main() { - CRYPTO_library_init(); - - if (!Testd2i_ECPrivateKey() || - !TestZeroPadding() || - !TestSpecifiedCurve() || - !ForEachCurve(TestSetAffine) || - !ForEachCurve(TestAddingEqualPoints) || - !ForEachCurve(TestMulZero) || - !TestArbitraryCurve()) { - fprintf(stderr, "failed\n"); - return 1; - } - - printf("PASS\n"); - return 0; +static std::string CurveToString( + const testing::TestParamInfo<EC_builtin_curve> ¶ms) { + // The comment field contains characters GTest rejects, so use the OBJ name. 
+ return OBJ_nid2sn(params.param.nid); } + +INSTANTIATE_TEST_CASE_P(, ECCurveTest, testing::ValuesIn(AllCurves()), + CurveToString); diff --git a/src/crypto/ecdsa/ecdsa.c b/src/crypto/ecdsa/ecdsa.c index 34320819..e1a0525f 100644 --- a/src/crypto/ecdsa/ecdsa.c +++ b/src/crypto/ecdsa/ecdsa.c @@ -66,9 +66,10 @@ int ECDSA_sign(int type, const uint8_t *digest, size_t digest_len, uint8_t *sig, - unsigned int *sig_len, EC_KEY *eckey) { + unsigned int *sig_len, const EC_KEY *eckey) { if (eckey->ecdsa_meth && eckey->ecdsa_meth->sign) { - return eckey->ecdsa_meth->sign(digest, digest_len, sig, sig_len, eckey); + return eckey->ecdsa_meth->sign(digest, digest_len, sig, sig_len, + (EC_KEY*) eckey /* cast away const */); } return ECDSA_sign_ex(type, digest, digest_len, sig, sig_len, NULL, NULL, @@ -76,7 +77,7 @@ int ECDSA_sign(int type, const uint8_t *digest, size_t digest_len, uint8_t *sig, } int ECDSA_verify(int type, const uint8_t *digest, size_t digest_len, - const uint8_t *sig, size_t sig_len, EC_KEY *eckey) { + const uint8_t *sig, size_t sig_len, const EC_KEY *eckey) { ECDSA_SIG *s; int ret = 0; uint8_t *der = NULL; @@ -133,12 +134,12 @@ static int digest_to_bn(BIGNUM *out, const uint8_t *digest, size_t digest_len, } ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, size_t digest_len, - EC_KEY *key) { + const EC_KEY *key) { return ECDSA_do_sign_ex(digest, digest_len, NULL, NULL, key); } int ECDSA_do_verify(const uint8_t *digest, size_t digest_len, - const ECDSA_SIG *sig, EC_KEY *eckey) { + const ECDSA_SIG *sig, const EC_KEY *eckey) { int ret = 0; BN_CTX *ctx; BIGNUM *u1, *u2, *m, *X; @@ -224,7 +225,7 @@ err: return ret; } -static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp, +static int ecdsa_sign_setup(const EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp, const uint8_t *digest, size_t digest_len) { BN_CTX *ctx = NULL; @@ -338,13 +339,14 @@ err: return ret; } -int ECDSA_sign_setup(EC_KEY *eckey, BN_CTX *ctx, BIGNUM **kinv, BIGNUM **rp) { +int ECDSA_sign_setup(const EC_KEY *eckey, BN_CTX *ctx, BIGNUM **kinv, + BIGNUM **rp) { return ecdsa_sign_setup(eckey, ctx, kinv, rp, NULL, 0); } ECDSA_SIG *ECDSA_do_sign_ex(const uint8_t *digest, size_t digest_len, const BIGNUM *in_kinv, const BIGNUM *in_r, - EC_KEY *eckey) { + const EC_KEY *eckey) { int ok = 0; BIGNUM *kinv = NULL, *s, *m = NULL, *tmp = NULL; const BIGNUM *ckinv; @@ -441,7 +443,7 @@ err: int ECDSA_sign_ex(int type, const uint8_t *digest, size_t digest_len, uint8_t *sig, unsigned int *sig_len, const BIGNUM *kinv, - const BIGNUM *r, EC_KEY *eckey) { + const BIGNUM *r, const EC_KEY *eckey) { int ret = 0; ECDSA_SIG *s = NULL; diff --git a/src/crypto/evp/evp_ctx.c b/src/crypto/evp/evp_ctx.c index 905aae91..a17a8ccc 100644 --- a/src/crypto/evp/evp_ctx.c +++ b/src/crypto/evp/evp_ctx.c @@ -148,48 +148,40 @@ void EVP_PKEY_CTX_free(EVP_PKEY_CTX *ctx) { OPENSSL_free(ctx); } -EVP_PKEY_CTX *EVP_PKEY_CTX_dup(EVP_PKEY_CTX *pctx) { - EVP_PKEY_CTX *rctx; - - if (!pctx->pmeth || !pctx->pmeth->copy) { +EVP_PKEY_CTX *EVP_PKEY_CTX_dup(EVP_PKEY_CTX *ctx) { + if (!ctx->pmeth || !ctx->pmeth->copy) { return NULL; } - rctx = OPENSSL_malloc(sizeof(EVP_PKEY_CTX)); - if (!rctx) { + EVP_PKEY_CTX *ret = OPENSSL_malloc(sizeof(EVP_PKEY_CTX)); + if (!ret) { return NULL; } - OPENSSL_memset(rctx, 0, sizeof(EVP_PKEY_CTX)); + OPENSSL_memset(ret, 0, sizeof(EVP_PKEY_CTX)); - rctx->pmeth = pctx->pmeth; - rctx->engine = pctx->engine; - rctx->operation = pctx->operation; + ret->pmeth = ctx->pmeth; + ret->engine = ctx->engine; + ret->operation = 
ctx->operation; - if (pctx->pkey) { - EVP_PKEY_up_ref(pctx->pkey); - rctx->pkey = pctx->pkey; - if (rctx->pkey == NULL) { - goto err; - } + if (ctx->pkey != NULL) { + EVP_PKEY_up_ref(ctx->pkey); + ret->pkey = ctx->pkey; } - if (pctx->peerkey) { - EVP_PKEY_up_ref(pctx->peerkey); - rctx->peerkey = pctx->peerkey; - if (rctx->peerkey == NULL) { - goto err; - } + if (ctx->peerkey != NULL) { + EVP_PKEY_up_ref(ctx->peerkey); + ret->peerkey = ctx->peerkey; } - if (pctx->pmeth->copy(rctx, pctx) > 0) { - return rctx; + if (ctx->pmeth->copy(ret, ctx) <= 0) { + ret->pmeth = NULL; + EVP_PKEY_CTX_free(ret); + OPENSSL_PUT_ERROR(EVP, ERR_LIB_EVP); + return NULL; } -err: - EVP_PKEY_CTX_free(rctx); - OPENSSL_PUT_ERROR(EVP, ERR_LIB_EVP); - return NULL; + return ret; } EVP_PKEY *EVP_PKEY_CTX_get0_pkey(EVP_PKEY_CTX *ctx) { return ctx->pkey; } diff --git a/src/crypto/md5/asm/md5-586.pl b/src/crypto/md5/asm/md5-586.pl index a237b0cd..a032d9ba 100644 --- a/src/crypto/md5/asm/md5-586.pl +++ b/src/crypto/md5/asm/md5-586.pl @@ -50,7 +50,7 @@ sub R0 local($pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_; &mov($tmp1,$C) if $pos < 0; - &mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one + &mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one # body proper diff --git a/src/crypto/modes/asm/aesni-gcm-x86_64.pl b/src/crypto/modes/asm/aesni-gcm-x86_64.pl index e329741c..139014fa 100644 --- a/src/crypto/modes/asm/aesni-gcm-x86_64.pl +++ b/src/crypto/modes/asm/aesni-gcm-x86_64.pl @@ -22,10 +22,11 @@ # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max # Locktyukhin of Intel Corp. who verified that it reduces shuffles # pressure with notable relative improvement, achieving 1.0 cycle per -# byte processed with 128-bit key on Haswell processor, and 0.74 - -# on Broadwell. [Mentioned results are raw profiled measurements for -# favourable packet size, one divisible by 96. Applications using the -# EVP interface will observe a few percent worse performance.] +# byte processed with 128-bit key on Haswell processor, 0.74 - on +# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled +# measurements for favourable packet size, one divisible by 96. +# Applications using the EVP interface will observe a few percent +# worse performance.] # # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf diff --git a/src/crypto/modes/asm/ghash-armv4.pl b/src/crypto/modes/asm/ghash-armv4.pl index 299eedcb..1a03251e 100644 --- a/src/crypto/modes/asm/ghash-armv4.pl +++ b/src/crypto/modes/asm/ghash-armv4.pl @@ -47,7 +47,7 @@ # # Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software # Polynomial Multiplication on ARM Processors using the NEON Engine. -# +# # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf # ==================================================================== @@ -486,7 +486,7 @@ $code.=<<___; #ifdef __ARMEL__ vrev64.8 $Xl,$Xl #endif - sub $Xi,#16 + sub $Xi,#16 vst1.64 $Xl#hi,[$Xi]! @ write out Xi vst1.64 $Xl#lo,[$Xi] diff --git a/src/crypto/modes/asm/ghash-x86.pl b/src/crypto/modes/asm/ghash-x86.pl index 182c29a3..d3a79e14 100644 --- a/src/crypto/modes/asm/ghash-x86.pl +++ b/src/crypto/modes/asm/ghash-x86.pl @@ -88,7 +88,7 @@ # where Tproc is time required for Karatsuba pre- and post-processing, # is more realistic estimate. In this case it gives ... 1.91 cycles. 
# Or in other words, depending on how well we can interleave reduction -# and one of the two multiplications the performance should be betwen +# and one of the two multiplications the performance should be between # 1.91 and 2.16. As already mentioned, this implementation processes # one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart # - in 2.02. x86_64 performance is better, because larger register @@ -487,7 +487,7 @@ sub mmx_loop() { &pxor ($red[1],$red[1]); &pxor ($red[2],$red[2]); - # Just like in "May" verson modulo-schedule for critical path in + # Just like in "May" version modulo-schedule for critical path in # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor' # is scheduled so late that rem_8bit[] has to be shifted *right* # by 16, which is why last argument to pinsrw is 2, which @@ -576,7 +576,7 @@ sub mmx_loop() { &bswap ($dat); &pshufw ($Zhi,$Zhi,0b00011011); # 76543210 &bswap ("ebx"); - + &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done? &jne (&label("outer")); } @@ -680,7 +680,7 @@ my ($Xhi,$Xi) = @_; &psllq ($Xi,57); # &movdqa ($T1,$Xi); # &pslldq ($Xi,8); - &psrldq ($T1,8); # + &psrldq ($T1,8); # &pxor ($Xi,$T2); &pxor ($Xhi,$T1); # @@ -850,7 +850,7 @@ my ($Xhi,$Xi) = @_; &psllq ($Xi,57); # &movdqa ($T1,$Xi); # &pslldq ($Xi,8); - &psrldq ($T1,8); # + &psrldq ($T1,8); # &pxor ($Xi,$T2); &pxor ($Xhi,$T1); # &pshufd ($T1,$Xhn,0b01001110); @@ -913,7 +913,7 @@ my ($Xhi,$Xi) = @_; &movdqu (&QWP(0,$Xip),$Xi); &function_end("gcm_ghash_clmul"); -} else { # Algorith 5. Kept for reference purposes. +} else { # Algorithm 5. Kept for reference purposes. sub reduction_alg5 { # 19/16 times faster than Intel version my ($Xhi,$Xi)=@_; diff --git a/src/crypto/modes/asm/ghash-x86_64.pl b/src/crypto/modes/asm/ghash-x86_64.pl index d7471e27..0e6e3489 100644 --- a/src/crypto/modes/asm/ghash-x86_64.pl +++ b/src/crypto/modes/asm/ghash-x86_64.pl @@ -64,8 +64,10 @@ # Ivy Bridge 1.80(+7%) # Haswell 0.55(+93%) (if system doesn't support AVX) # Broadwell 0.45(+110%)(if system doesn't support AVX) +# Skylake 0.44(+110%)(if system doesn't support AVX) # Bulldozer 1.49(+27%) # Silvermont 2.88(+13%) +# Goldmont 1.08(+24%) # March 2013 # @@ -74,8 +76,8 @@ # CPUs such as Sandy and Ivy Bridge can execute it, the code performs # sub-optimally in comparison to above mentioned version. But thanks # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that -# it performs in 0.41 cycles per byte on Haswell processor, and in -# 0.29 on Broadwell. +# it performs in 0.41 cycles per byte on Haswell processor, in +# 0.29 on Broadwell, and in 0.36 on Skylake. # # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest @@ -217,8 +219,12 @@ $code=<<___; .align 16 gcm_gmult_4bit: push %rbx - push %rbp # %rbp and %r12 are pushed exclusively in + push %rbp # %rbp and others are pushed exclusively in push %r12 # order to reuse Win64 exception handler... 
+ push %r13 + push %r14 + push %r15 + sub \$280,%rsp .Lgmult_prologue: movzb 15($Xi),$Zlo @@ -229,8 +235,9 @@ $code.=<<___; mov $Zlo,8($Xi) mov $Zhi,($Xi) - mov 16(%rsp),%rbx - lea 24(%rsp),%rsp + lea 280+48(%rsp),%rsi + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lgmult_epilogue: ret .size gcm_gmult_4bit,.-gcm_gmult_4bit @@ -380,14 +387,14 @@ $code.=<<___; mov $Zlo,8($Xi) mov $Zhi,($Xi) - lea 280(%rsp),%rsi - mov 0(%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + lea 280+48(%rsp),%rsi + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea 0(%rsi),%rsp .Lghash_epilogue: ret .size gcm_ghash_4bit,.-gcm_ghash_4bit @@ -449,7 +456,7 @@ $code.=<<___; psllq \$57,$Xi # movdqa $Xi,$T1 # pslldq \$8,$Xi - psrldq \$8,$T1 # + psrldq \$8,$T1 # pxor $T2,$Xi pxor $T1,$Xhi # @@ -563,7 +570,7 @@ ___ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0)); # experimental alternative. special thing about is that there - # no dependency between the two multiplications... + # no dependency between the two multiplications... mov \$`0xE1<<1`,%eax mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff mov \$0x07,%r11d @@ -738,7 +745,7 @@ $code.=<<___; movdqa $T2,$T1 # pslldq \$8,$T2 pclmulqdq \$0x00,$Hkey2,$Xln - psrldq \$8,$T1 # + psrldq \$8,$T1 # pxor $T2,$Xi pxor $T1,$Xhi # movdqu 0($inp),$T1 @@ -874,7 +881,7 @@ $code.=<<___; psllq \$57,$Xi # movdqa $Xi,$T1 # pslldq \$8,$Xi - psrldq \$8,$T1 # + psrldq \$8,$T1 # pxor $T2,$Xi pshufd \$0b01001110,$Xhn,$Xmn pxor $T1,$Xhi # @@ -1628,14 +1635,20 @@ se_handler: cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue - lea 24(%rax),%rax # adjust "rsp" + lea 48+280(%rax),%rax # adjust "rsp" mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 .Lin_prologue: mov 8(%rax),%rdi diff --git a/src/crypto/perlasm/ppc-xlate.pl b/src/crypto/perlasm/ppc-xlate.pl index 55b02bca..de796d73 100644 --- a/src/crypto/perlasm/ppc-xlate.pl +++ b/src/crypto/perlasm/ppc-xlate.pl @@ -36,7 +36,7 @@ my $globl = sub { my $ret; $name =~ s|^\.||; - + SWITCH: for ($flavour) { /aix/ && do { if (!$$type) { $$type = "\@function"; diff --git a/src/crypto/perlasm/readme b/src/crypto/perlasm/readme index 648537b9..57d2083c 100644 --- a/src/crypto/perlasm/readme +++ b/src/crypto/perlasm/readme @@ -7,7 +7,7 @@ and then include it. push(@INC,"perlasm","../../perlasm"); require "x86asm.pl"; -The first thing we do is setup the file and type of assember +The first thing we do is setup the file and type of assembler &asm_init($ARGV[0],$0); @@ -18,7 +18,7 @@ Argument 2 is the file name. The reciprocal function is &asm_finish() which should be called at the end. -There are 2 main 'packages'. x86ms.pl, which is the microsoft assembler, +There are 2 main 'packages'. x86ms.pl, which is the Microsoft assembler, and x86unix.pl which is the unix (gas) version. Functions of interest are: @@ -32,7 +32,7 @@ Functions of interest are: &function_begin(name,extra) Start a function with pushing of edi, esi, ebx and ebp. extra is extra win32 external info that may be required. 
-&function_begin_B(name,extra) Same as norma function_begin but no pushing. +&function_begin_B(name,extra) Same as normal function_begin but no pushing. &function_end(name) Call at end of function. &function_end_A(name) Standard pop and ret, for use inside functions &function_end_B(name) Call at end but with poping or 'ret'. diff --git a/src/crypto/perlasm/x86_64-xlate.pl b/src/crypto/perlasm/x86_64-xlate.pl index 16553f2a..6e487b8e 100755 --- a/src/crypto/perlasm/x86_64-xlate.pl +++ b/src/crypto/perlasm/x86_64-xlate.pl @@ -141,7 +141,7 @@ my %globals; if ($gas) { if ($self->{op} eq "movz") { # movz is pain... sprintf "%s%s%s",$self->{op},$self->{sz},shift; - } elsif ($self->{op} =~ /^set/) { + } elsif ($self->{op} =~ /^set/) { "$self->{op}"; } elsif ($self->{op} eq "ret") { my $epilogue = ""; @@ -168,7 +168,7 @@ my %globals; $self->{op} .= $self->{sz}; } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") { $self->{op} = "\tDQ"; - } + } $self->{op}; } } @@ -274,7 +274,7 @@ my %globals; } # if base register is %rbp or %r13, see if it's possible to - # flip base and ingex registers [for better performance] + # flip base and index registers [for better performance] if (!$self->{label} && $self->{index} && $self->{scale}==1 && $self->{base} =~ /(rbp|r13)/) { $self->{base} = $self->{index}; $self->{index} = $1; @@ -432,7 +432,7 @@ my %globals; } } } -{ package expr; # pick up expressioins +{ package expr; # pick up expressions sub re { my ($class, $line, $opcode) = @_; my $self = {}; @@ -460,6 +460,242 @@ my %globals; } } } +{ package cfi_directive; + # CFI directives annotate instructions that are significant for + # stack unwinding procedure compliant with DWARF specification, + # see http://dwarfstd.org/. Besides naturally expected for this + # script platform-specific filtering function, this module adds + # three auxiliary synthetic directives not recognized by [GNU] + # assembler: + # + # - .cfi_push to annotate push instructions in prologue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_offset; + # - .cfi_pop to annotate pop instructions in epilogue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_restore; + # - [and most notably] .cfi_cfa_expression which encodes + # DW_CFA_def_cfa_expression and passes it to .cfi_escape as + # byte vector; + # + # CFA expressions were introduced in DWARF specification version + # 3 and describe how to deduce CFA, Canonical Frame Address. This + # becomes handy if your stack frame is variable and you can't + # spare register for [previous] frame pointer. Suggested directive + # syntax is made-up mix of DWARF operator suffixes [subset of] + # and references to registers with optional bias. Following example + # describes offloaded *original* stack pointer at specific offset + # from *current* stack pointer: + # + # .cfi_cfa_expression %rsp+40,deref,+8 + # + # Final +8 has everything to do with the fact that CFA is defined + # as reference to top of caller's stack, and on x86_64 call to + # subroutine pushes 8-byte return address. In other words original + # stack pointer upon entry to a subroutine is 8 bytes off from CFA. + + # Below constants are taken from "DWARF Expressions" section of the + # DWARF specification, section is numbered 7.7 in versions 3 and 4. 
+ my %DW_OP_simple = ( # no-arg operators, mapped directly
+ deref => 0x06, dup => 0x12,
+ drop => 0x13, over => 0x14,
+ pick => 0x15, swap => 0x16,
+ rot => 0x17, xderef => 0x18,
+
+ abs => 0x19, and => 0x1a,
+ div => 0x1b, minus => 0x1c,
+ mod => 0x1d, mul => 0x1e,
+ neg => 0x1f, not => 0x20,
+ or => 0x21, plus => 0x22,
+ shl => 0x24, shr => 0x25,
+ shra => 0x26, xor => 0x27,
+ );
+
+ my %DW_OP_complex = ( # used in specific subroutines
+ constu => 0x10, # uleb128
+ consts => 0x11, # sleb128
+ plus_uconst => 0x23, # uleb128
+ lit0 => 0x30, # add 0-31 to opcode
+ reg0 => 0x50, # add 0-31 to opcode
+ breg0 => 0x70, # add 0-31 to opcode, sleb128
+ regx => 0x90, # uleb128
+ fbreg => 0x91, # sleb128
+ bregx => 0x92, # uleb128, sleb128
+ piece => 0x93, # uleb128
+ );
+
+ # Following constants are defined in x86_64 ABI supplement, for
+ # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf,
+ # see section 3.7 "Stack Unwind Algorithm".
+ my %DW_reg_idx = (
+ "%rax"=>0, "%rdx"=>1, "%rcx"=>2, "%rbx"=>3,
+ "%rsi"=>4, "%rdi"=>5, "%rbp"=>6, "%rsp"=>7,
+ "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11,
+ "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15
+ );
+
+ my ($cfa_reg, $cfa_rsp);
+
+ # [us]leb128 format is variable-length integer representation base
+ # 128, with most significant bit of each byte being 0 denoting
+ # *last* most significant digit. See "Variable Length Data" in the
+ # DWARF specification, numbered 7.6 at least in versions 3 and 4.
+ sub sleb128 {
+ use integer; # get right shift extend sign
+
+ my $val = shift;
+ my $sign = ($val < 0) ? -1 : 0;
+ my @ret = ();
+
+ while(1) {
+ push @ret, $val&0x7f;
+
+ # see if remaining bits are same and equal to most
+ # significant bit of the current digit, if so, it's
+ # last digit...
+ last if (($val>>6) == $sign);
+
+ @ret[-1] |= 0x80;
+ $val >>= 7;
+ }
+
+ return @ret;
+ }
+ sub uleb128 {
+ my $val = shift;
+ my @ret = ();
+
+ while(1) {
+ push @ret, $val&0x7f;
+
+ # see if it's last significant digit...
+ last if (($val >>= 7) == 0);
+
+ @ret[-1] |= 0x80;
+ }
+
+ return @ret;
+ }
+ sub const {
+ my $val = shift;
+
+ if ($val >= 0 && $val < 32) {
+ return ($DW_OP_complex{lit0}+$val);
+ }
+ return ($DW_OP_complex{consts}, sleb128($val));
+ }
+ sub reg {
+ my $val = shift;
+
+ return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/);
+
+ my $reg = $DW_reg_idx{$1};
+ my $off = eval ("0 $2 $3");
+
+ return (($DW_OP_complex{breg0} + $reg), sleb128($off));
+ # Yes, we use DW_OP_bregX+0 to push register value and not
+ # DW_OP_regX, because the latter would require even DW_OP_piece,
+ # which would be a waste under the circumstances. If you have
+ # to use DW_OP_reg, use "regx:N"...
+ }
+ sub cfa_expression {
+ my $line = shift;
+ my @ret;
+
+ foreach my $token (split(/,\s*/,$line)) {
+ if ($token =~ /^%r/) {
+ push @ret,reg($token);
+ } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) {
+ push @ret,reg("$2+$1");
+ } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) {
+ my $i = 1*eval($2);
+ push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i));
+ } elsif (my $i = 1*eval($token) or $token eq "0") {
+ if ($token =~ /^\+/) {
+ push @ret,$DW_OP_complex{plus_uconst},uleb128($i);
+ } else {
+ push @ret,const($i);
+ }
+ } else {
+ push @ret,$DW_OP_simple{$token};
+ }
+ }
+
+ # Finally we return DW_CFA_def_cfa_expression, 15, followed by
+ # length of the expression and of course the expression itself.
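As background for the encoding above (the return statement follows just below): LEB128 stores an integer as base-128 digits, least significant first, setting the high bit of every byte except the last. The unsigned case, restated as a C++ sketch independent of this script:

#include <stdint.h>
#include <vector>

// Unsigned LEB128: seven payload bits per byte, low digits first; a set
// high bit means "more digits follow".
static std::vector<uint8_t> Uleb128(uint64_t v) {
  std::vector<uint8_t> out;
  for (;;) {
    uint8_t b = v & 0x7f;
    v >>= 7;
    if (v == 0) {
      out.push_back(b);  // final digit: high bit clear
      return out;
    }
    out.push_back(b | 0x80);
  }
}
// Example: Uleb128(300) yields {0xac, 0x02}, since 300 = 2*128 + 44.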
+ return (15,scalar(@ret),@ret); + } + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) { + bless $self,$class; + $ret = $self; + undef $self->{value}; + my $dir = $1; + + SWITCH: for ($dir) { + # What is $cfa_rsp? Effectively it's difference between %rsp + # value and current CFA, Canonical Frame Address, which is + # why it starts with -8. Recall that CFA is top of caller's + # stack... + /startproc/ && do { ($cfa_reg, $cfa_rsp) = ("%rsp", -8); last; }; + /endproc/ && do { ($cfa_reg, $cfa_rsp) = ("%rsp", 0); last; }; + /def_cfa_register/ + && do { $cfa_reg = $$line; last; }; + /def_cfa_offset/ + && do { $cfa_rsp = -1*eval($$line) if ($cfa_reg eq "%rsp"); + last; + }; + /adjust_cfa_offset/ + && do { $cfa_rsp -= 1*eval($$line) if ($cfa_reg eq "%rsp"); + last; + }; + /def_cfa/ && do { if ($$line =~ /(%r\w+)\s*,\s*(.+)/) { + $cfa_reg = $1; + $cfa_rsp = -1*eval($2) if ($cfa_reg eq "%rsp"); + } + last; + }; + /push/ && do { $dir = undef; + $cfa_rsp -= 8; + if ($cfa_reg eq "%rsp") { + $self->{value} = ".cfi_adjust_cfa_offset\t8\n"; + } + $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp"; + last; + }; + /pop/ && do { $dir = undef; + $cfa_rsp += 8; + if ($cfa_reg eq "%rsp") { + $self->{value} = ".cfi_adjust_cfa_offset\t-8\n"; + } + $self->{value} .= ".cfi_restore\t$$line"; + last; + }; + /cfa_expression/ + && do { $dir = undef; + $self->{value} = ".cfi_escape\t" . + join(",", map(sprintf("0x%02x", $_), + cfa_expression($$line))); + last; + }; + } + + $self->{value} = ".cfi_$dir\t$$line" if ($dir); + + $$line = ""; + } + + return $ret; + } + sub out { + my $self = shift; + return ($elf ? $self->{value} : undef); + } +} { package directive; # pick up directives, which start with . sub re { my ($class, $line) = @_; @@ -467,6 +703,9 @@ my %globals; my $ret; my $dir; + # chain-call to cfi_directive + $ret = cfi_directive->re($line) and return $ret; + if ($$line =~ /^\s*(\.\w+)/) { bless $self,$class; $dir = $1; @@ -644,7 +883,7 @@ my %globals; if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva")) { $var=~s/([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; } $var; - }; + }; $sz =~ tr/bvlrq/BWDDQ/; $self->{value} = "\tD$sz\t"; @@ -654,7 +893,7 @@ my %globals; }; /\.byte/ && do { my @str=split(/,\s*/,$$line); map(s/(0b[0-1]+)/oct($1)/eig,@str); - map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm); + map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm); while ($#str>15) { $self->{value}.="DB\t" .join(",",@str[0..15])."\n"; @@ -810,7 +1049,7 @@ my $rdrand = sub { my @opcode=(); my $dst=$1; if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } - rex(\@opcode,0,$1,8); + rex(\@opcode,0,$dst,8); push @opcode,0x0f,0xc7,0xf0|($dst&7); @opcode; } else { @@ -823,7 +1062,7 @@ my $rdseed = sub { my @opcode=(); my $dst=$1; if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } - rex(\@opcode,0,$1,8); + rex(\@opcode,0,$dst,8); push @opcode,0x0f,0xc7,0xf8|($dst&7); @opcode; } else { @@ -912,7 +1151,7 @@ while(defined(my $line=<>)) { printf "%s",$directive->out(); } elsif (my $opcode=opcode->re(\$line)) { my $asm = eval("\$".$opcode->mnemonic()); - + if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) { print $gas?".byte\t":"DB\t",join(',',@bytes),"\n"; next; @@ -998,7 +1237,7 @@ close STDOUT; # %r13 - - # %r14 - - # %r15 - - -# +# # (*) volatile register # (-) preserved by callee # (#) Nth argument, volatile @@ -1021,7 +1260,7 @@ close STDOUT; # the area above user stack pointer in true asynchronous manner... 
# # All the above means that if assembler programmer adheres to Unix -# register and stack layout, but disregards the "red zone" existense, +# register and stack layout, but disregards the "red zone" existence, # it's possible to use following prologue and epilogue to "gear" from # Unix to Win64 ABI in leaf functions with not more than 6 arguments. # diff --git a/src/crypto/perlasm/x86nasm.pl b/src/crypto/perlasm/x86nasm.pl index d159514e..d3773b68 100644 --- a/src/crypto/perlasm/x86nasm.pl +++ b/src/crypto/perlasm/x86nasm.pl @@ -140,7 +140,7 @@ ___ grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out; push (@out,$comm) } - push (@out,$initseg) if ($initseg); + push (@out,$initseg) if ($initseg); } sub ::comment { foreach (@_) { push(@out,"\t; $_\n"); } } diff --git a/src/crypto/pkcs8/pkcs8.c b/src/crypto/pkcs8/pkcs8.c index efad81d4..64a2d021 100644 --- a/src/crypto/pkcs8/pkcs8.c +++ b/src/crypto/pkcs8/pkcs8.c @@ -426,26 +426,9 @@ err: return ret; } -PKCS8_PRIV_KEY_INFO *PKCS8_decrypt(X509_SIG *pkcs8, const char *pass, - int pass_len) { - uint8_t *pass_raw = NULL; - size_t pass_raw_len = 0; - if (!pass_to_pass_raw(OBJ_obj2nid(pkcs8->algor->algorithm), pass, pass_len, - &pass_raw, &pass_raw_len)) { - return NULL; - } - - PKCS8_PRIV_KEY_INFO *ret = PKCS8_decrypt_pbe(pkcs8, pass_raw, pass_raw_len); - - if (pass_raw) { - OPENSSL_cleanse(pass_raw, pass_raw_len); - OPENSSL_free(pass_raw); - } - return ret; -} - -PKCS8_PRIV_KEY_INFO *PKCS8_decrypt_pbe(X509_SIG *pkcs8, const uint8_t *pass_raw, - size_t pass_raw_len) { +static PKCS8_PRIV_KEY_INFO *pkcs8_decrypt_raw(X509_SIG *pkcs8, + const uint8_t *pass_raw, + size_t pass_raw_len) { PKCS8_PRIV_KEY_INFO *ret = NULL; uint8_t *in = NULL, *out = NULL; size_t out_len = 0; @@ -495,17 +478,16 @@ err: return ret; } -X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher, const char *pass, - int pass_len, const uint8_t *salt, size_t salt_len, - int iterations, PKCS8_PRIV_KEY_INFO *p8inf) { +PKCS8_PRIV_KEY_INFO *PKCS8_decrypt(X509_SIG *pkcs8, const char *pass, + int pass_len) { uint8_t *pass_raw = NULL; size_t pass_raw_len = 0; - if (!pass_to_pass_raw(pbe_nid, pass, pass_len, &pass_raw, &pass_raw_len)) { + if (!pass_to_pass_raw(OBJ_obj2nid(pkcs8->algor->algorithm), pass, pass_len, + &pass_raw, &pass_raw_len)) { return NULL; } - X509_SIG *ret = PKCS8_encrypt_pbe(pbe_nid, cipher, pass_raw, pass_raw_len, - salt, salt_len, iterations, p8inf); + PKCS8_PRIV_KEY_INFO *ret = pkcs8_decrypt_raw(pkcs8, pass_raw, pass_raw_len); if (pass_raw) { OPENSSL_cleanse(pass_raw, pass_raw_len); @@ -514,10 +496,10 @@ X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher, const char *pass, return ret; } -X509_SIG *PKCS8_encrypt_pbe(int pbe_nid, const EVP_CIPHER *cipher, - const uint8_t *pass_raw, size_t pass_raw_len, - const uint8_t *salt, size_t salt_len, - int iterations, PKCS8_PRIV_KEY_INFO *p8inf) { +static X509_SIG *pkcs8_encrypt_raw(int pbe_nid, const EVP_CIPHER *cipher, + const uint8_t *pass_raw, size_t pass_raw_len, + const uint8_t *salt, size_t salt_len, + int iterations, PKCS8_PRIV_KEY_INFO *p8inf) { X509_SIG *ret = NULL; uint8_t *plaintext = NULL, *salt_buf = NULL, *der = NULL; int plaintext_len = -1; @@ -609,6 +591,25 @@ err: return ret; } +X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher, const char *pass, + int pass_len, const uint8_t *salt, size_t salt_len, + int iterations, PKCS8_PRIV_KEY_INFO *p8inf) { + uint8_t *pass_raw = NULL; + size_t pass_raw_len = 0; + if (!pass_to_pass_raw(pbe_nid, pass, pass_len, &pass_raw, &pass_raw_len)) 
{ + return NULL; + } + + X509_SIG *ret = pkcs8_encrypt_raw(pbe_nid, cipher, pass_raw, pass_raw_len, + salt, salt_len, iterations, p8inf); + + if (pass_raw) { + OPENSSL_cleanse(pass_raw, pass_raw_len); + OPENSSL_free(pass_raw); + } + return ret; +} + EVP_PKEY *EVP_PKCS82PKEY(PKCS8_PRIV_KEY_INFO *p8) { uint8_t *der = NULL; int der_len = i2d_PKCS8_PRIV_KEY_INFO(p8, &der); @@ -758,7 +759,7 @@ static int PKCS12_handle_safe_bag(CBS *safe_bag, struct pkcs12_context *ctx) { } PKCS8_PRIV_KEY_INFO *pki = - PKCS8_decrypt_pbe(encrypted, ctx->password, ctx->password_len); + pkcs8_decrypt_raw(encrypted, ctx->password, ctx->password_len); X509_SIG_free(encrypted); if (pki == NULL) { return 0; diff --git a/src/crypto/rsa/CMakeLists.txt b/src/crypto/rsa/CMakeLists.txt index 969b753e..76937c1e 100644 --- a/src/crypto/rsa/CMakeLists.txt +++ b/src/crypto/rsa/CMakeLists.txt @@ -11,14 +11,3 @@ add_library( padding.c rsa_asn1.c ) - -add_executable( - rsa_test - - rsa_test.cc - - $<TARGET_OBJECTS:test_support> -) - -target_link_libraries(rsa_test crypto) -add_dependencies(all_tests rsa_test)
\ No newline at end of file diff --git a/src/crypto/rsa/rsa_test.cc b/src/crypto/rsa/rsa_test.cc index 306df7e3..401efdf4 100644 --- a/src/crypto/rsa/rsa_test.cc +++ b/src/crypto/rsa/rsa_test.cc @@ -59,6 +59,8 @@ #include <stdlib.h> #include <string.h> +#include <gtest/gtest.h> + #include <openssl/bn.h> #include <openssl/bytestring.h> #include <openssl/crypto.h> @@ -66,6 +68,7 @@ #include <openssl/nid.h> #include "../internal.h" +#include "../test/test_util.h" // kPlaintext is a sample plaintext. @@ -523,191 +526,172 @@ static const uint8_t kExponent1RSAKey[] = { 0xdd, 0x02, 0x01, 0x01, }; -static bool TestRSA(const uint8_t *der, size_t der_len, - const uint8_t *oaep_ciphertext, - size_t oaep_ciphertext_len) { - bssl::UniquePtr<RSA> key(RSA_private_key_from_bytes(der, der_len)); - if (!key) { - return false; - } +struct RSAEncryptParam { + const uint8_t *der; + size_t der_len; + const uint8_t *oaep_ciphertext; + size_t oaep_ciphertext_len; +} kRSAEncryptParams[] = { + {kKey1, sizeof(kKey1) - 1, kOAEPCiphertext1, sizeof(kOAEPCiphertext1) - 1}, + {kKey2, sizeof(kKey2) - 1, kOAEPCiphertext2, sizeof(kOAEPCiphertext2) - 1}, + {kKey3, sizeof(kKey3) - 1, kOAEPCiphertext3, sizeof(kOAEPCiphertext3) - 1}, +}; - if (!RSA_check_key(key.get())) { - fprintf(stderr, "RSA_check_key failed\n"); - return false; - } +class RSAEncryptTest : public testing::TestWithParam<RSAEncryptParam> {}; + +TEST_P(RSAEncryptTest, TestKey) { + const auto ¶m = GetParam(); + bssl::UniquePtr<RSA> key( + RSA_private_key_from_bytes(param.der, param.der_len)); + ASSERT_TRUE(key); + + EXPECT_TRUE(RSA_check_key(key.get())); uint8_t ciphertext[256]; + // Test that PKCS#1 v1.5 encryption round-trips. size_t ciphertext_len = 0; - if (!RSA_encrypt(key.get(), &ciphertext_len, ciphertext, sizeof(ciphertext), - kPlaintext, kPlaintextLen, RSA_PKCS1_PADDING) || - ciphertext_len != RSA_size(key.get())) { - fprintf(stderr, "PKCS#1 v1.5 encryption failed!\n"); - return false; - } + ASSERT_TRUE(RSA_encrypt(key.get(), &ciphertext_len, ciphertext, + sizeof(ciphertext), kPlaintext, kPlaintextLen, + RSA_PKCS1_PADDING)); + EXPECT_EQ(RSA_size(key.get()), ciphertext_len); uint8_t plaintext[256]; size_t plaintext_len = 0; - if (!RSA_decrypt(key.get(), &plaintext_len, plaintext, sizeof(plaintext), - ciphertext, ciphertext_len, RSA_PKCS1_PADDING) || - plaintext_len != kPlaintextLen || - OPENSSL_memcmp(plaintext, kPlaintext, plaintext_len) != 0) { - fprintf(stderr, "PKCS#1 v1.5 decryption failed!\n"); - return false; - } + ASSERT_TRUE(RSA_decrypt(key.get(), &plaintext_len, plaintext, + sizeof(plaintext), ciphertext, ciphertext_len, + RSA_PKCS1_PADDING)); + EXPECT_EQ(Bytes(kPlaintext, kPlaintextLen), Bytes(plaintext, plaintext_len)); + // Test that OAEP encryption round-trips. 
ciphertext_len = 0; - if (!RSA_encrypt(key.get(), &ciphertext_len, ciphertext, sizeof(ciphertext), - kPlaintext, kPlaintextLen, RSA_PKCS1_OAEP_PADDING) || - ciphertext_len != RSA_size(key.get())) { - fprintf(stderr, "OAEP encryption failed!\n"); - return false; - } + ASSERT_TRUE(RSA_encrypt(key.get(), &ciphertext_len, ciphertext, + sizeof(ciphertext), kPlaintext, kPlaintextLen, + RSA_PKCS1_OAEP_PADDING)); + EXPECT_EQ(RSA_size(key.get()), ciphertext_len); plaintext_len = 0; - if (!RSA_decrypt(key.get(), &plaintext_len, plaintext, sizeof(plaintext), - ciphertext, ciphertext_len, RSA_PKCS1_OAEP_PADDING) || - plaintext_len != kPlaintextLen || - OPENSSL_memcmp(plaintext, kPlaintext, plaintext_len) != 0) { - fprintf(stderr, "OAEP decryption (encrypted data) failed!\n"); - return false; - } + ASSERT_TRUE(RSA_decrypt(key.get(), &plaintext_len, plaintext, + sizeof(plaintext), ciphertext, ciphertext_len, + RSA_PKCS1_OAEP_PADDING)); + EXPECT_EQ(Bytes(kPlaintext, kPlaintextLen), Bytes(plaintext, plaintext_len)); // |oaep_ciphertext| should decrypt to |kPlaintext|. plaintext_len = 0; - if (!RSA_decrypt(key.get(), &plaintext_len, plaintext, sizeof(plaintext), - oaep_ciphertext, oaep_ciphertext_len, - RSA_PKCS1_OAEP_PADDING) || - plaintext_len != kPlaintextLen || - OPENSSL_memcmp(plaintext, kPlaintext, plaintext_len) != 0) { - fprintf(stderr, "OAEP decryption (test vector data) failed!\n"); - return false; - } + ASSERT_TRUE(RSA_decrypt(key.get(), &plaintext_len, plaintext, + sizeof(plaintext), param.oaep_ciphertext, + param.oaep_ciphertext_len, RSA_PKCS1_OAEP_PADDING)); + EXPECT_EQ(Bytes(kPlaintext, kPlaintextLen), Bytes(plaintext, plaintext_len)); // Try decrypting corrupted ciphertexts. - OPENSSL_memcpy(ciphertext, oaep_ciphertext, oaep_ciphertext_len); - for (size_t i = 0; i < oaep_ciphertext_len; i++) { + OPENSSL_memcpy(ciphertext, param.oaep_ciphertext, param.oaep_ciphertext_len); + for (size_t i = 0; i < param.oaep_ciphertext_len; i++) { + SCOPED_TRACE(i); ciphertext[i] ^= 1; - if (RSA_decrypt(key.get(), &plaintext_len, plaintext, sizeof(plaintext), - ciphertext, oaep_ciphertext_len, RSA_PKCS1_OAEP_PADDING)) { - fprintf(stderr, "Corrupt data decrypted!\n"); - return false; - } + EXPECT_FALSE(RSA_decrypt( + key.get(), &plaintext_len, plaintext, sizeof(plaintext), ciphertext, + param.oaep_ciphertext_len, RSA_PKCS1_OAEP_PADDING)); ERR_clear_error(); ciphertext[i] ^= 1; } // Test truncated ciphertexts. 
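SCOPED_TRACE, added in the corruption loop above and the truncation loop below, attaches context to any assertion that fails inside its scope, so a failure report names the offending iteration. A minimal sketch of the idiom:

#include <gtest/gtest.h>

TEST(Example, LoopWithTrace) {
  for (int i = 0; i < 4; i++) {
    // On failure, GTest prints this trace point, including |i|'s value.
    SCOPED_TRACE(i);
    EXPECT_GE(i, 0);
  }
}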
- for (size_t len = 0; len < oaep_ciphertext_len; len++) { - if (RSA_decrypt(key.get(), &plaintext_len, plaintext, sizeof(plaintext), - ciphertext, len, RSA_PKCS1_OAEP_PADDING)) { - fprintf(stderr, "Corrupt data decrypted!\n"); - return false; - } + for (size_t len = 0; len < param.oaep_ciphertext_len; len++) { + SCOPED_TRACE(len); + EXPECT_FALSE(RSA_decrypt(key.get(), &plaintext_len, plaintext, + sizeof(plaintext), ciphertext, len, + RSA_PKCS1_OAEP_PADDING)); ERR_clear_error(); } - - return true; } -static bool TestMultiPrimeKey(int nprimes, const uint8_t *der, size_t der_size, - const uint8_t *enc, size_t enc_size) { - bssl::UniquePtr<RSA> rsa(d2i_RSAPrivateKey(nullptr, &der, der_size)); - if (!rsa) { - fprintf(stderr, "%d-prime key failed to parse.\n", nprimes); - ERR_print_errors_fp(stderr); - return false; - } +INSTANTIATE_TEST_CASE_P(, RSAEncryptTest, testing::ValuesIn(kRSAEncryptParams)); + +struct RSAMultiPrimeParam { + const uint8_t *der; + size_t der_size; + const uint8_t *enc; + size_t enc_size; +} kRSAMultiPrimeParams[] = { + {kTwoPrimeKey, sizeof(kTwoPrimeKey) - 1, kTwoPrimeEncryptedMessage, + sizeof(kTwoPrimeEncryptedMessage)}, + {kThreePrimeKey, sizeof(kThreePrimeKey) - 1, kThreePrimeEncryptedMessage, + sizeof(kThreePrimeEncryptedMessage)}, + {kSixPrimeKey, sizeof(kSixPrimeKey) - 1, kSixPrimeEncryptedMessage, + sizeof(kSixPrimeEncryptedMessage)}, +}; - if (!RSA_check_key(rsa.get())) { - fprintf(stderr, "RSA_check_key failed for %d-prime key.\n", nprimes); - ERR_print_errors_fp(stderr); - return false; - } +class RSAMultiPrimeTest : public testing::TestWithParam<RSAMultiPrimeParam> {}; + +TEST_P(RSAMultiPrimeTest, TestDecrypt) { + const auto ¶m = GetParam(); + bssl::UniquePtr<RSA> rsa( + RSA_private_key_from_bytes(param.der, param.der_size)); + ASSERT_TRUE(rsa); + + EXPECT_TRUE(RSA_check_key(rsa.get())); uint8_t out[256]; size_t out_len; - if (!RSA_decrypt(rsa.get(), &out_len, out, sizeof(out), enc, enc_size, - RSA_PKCS1_PADDING) || - out_len != 11 || - OPENSSL_memcmp(out, "hello world", 11) != 0) { - fprintf(stderr, "%d-prime key failed to decrypt.\n", nprimes); - ERR_print_errors_fp(stderr); - return false; - } - - return true; + ASSERT_TRUE(RSA_decrypt(rsa.get(), &out_len, out, sizeof(out), param.enc, + param.enc_size, RSA_PKCS1_PADDING)); + EXPECT_EQ(Bytes("hello world"), Bytes(out, out_len)); } -static bool TestMultiPrimeKeygen() { - static const char kMessage[] = "Hello world."; - static const size_t kBits = 1024; - uint8_t encrypted[kBits / 8], decrypted[kBits / 8]; - size_t encrypted_len, decrypted_len; +INSTANTIATE_TEST_CASE_P(, RSAMultiPrimeTest, + testing::ValuesIn(kRSAMultiPrimeParams)); +TEST(RSATest, MultiPrimeKeygen) { bssl::UniquePtr<RSA> rsa(RSA_new()); bssl::UniquePtr<BIGNUM> e(BN_new()); - if (!rsa || !e || - !BN_set_word(e.get(), RSA_F4) || - !RSA_generate_multi_prime_key(rsa.get(), kBits, 3, e.get(), nullptr) || - !RSA_check_key(rsa.get()) || - !RSA_encrypt(rsa.get(), &encrypted_len, encrypted, sizeof(encrypted), - (const uint8_t *)kMessage, sizeof(kMessage), - RSA_PKCS1_PADDING) || - !RSA_decrypt(rsa.get(), &decrypted_len, decrypted, sizeof(decrypted), - encrypted, encrypted_len, RSA_PKCS1_PADDING) || - decrypted_len != sizeof(kMessage) || - OPENSSL_memcmp(decrypted, kMessage, sizeof(kMessage)) != 0) { - ERR_print_errors_fp(stderr); - return false; - } + ASSERT_TRUE(rsa); + ASSERT_TRUE(e); + ASSERT_TRUE(BN_set_word(e.get(), RSA_F4)); + + // Test key generation. 
+ static const size_t kBits = 1024; + ASSERT_TRUE( + RSA_generate_multi_prime_key(rsa.get(), kBits, 3, e.get(), nullptr)); + ASSERT_TRUE(RSA_check_key(rsa.get())); - return true; + // Test the key round-trips. + static const char kMessage[] = "Hello world."; + uint8_t encrypted[kBits / 8], decrypted[kBits / 8]; + size_t encrypted_len, decrypted_len; + ASSERT_TRUE(RSA_encrypt(rsa.get(), &encrypted_len, encrypted, + sizeof(encrypted), (const uint8_t *)kMessage, + sizeof(kMessage), RSA_PKCS1_PADDING)); + ASSERT_TRUE(RSA_decrypt(rsa.get(), &decrypted_len, decrypted, + sizeof(decrypted), encrypted, encrypted_len, + RSA_PKCS1_PADDING)); + EXPECT_EQ(Bytes((const uint8_t *)kMessage, sizeof(kMessage)), + Bytes(decrypted, decrypted_len)); } -static bool TestBadKey() { +TEST(RSATest, BadKey) { bssl::UniquePtr<RSA> key(RSA_new()); bssl::UniquePtr<BIGNUM> e(BN_new()); + ASSERT_TRUE(key); + ASSERT_TRUE(e); + ASSERT_TRUE(BN_set_word(e.get(), RSA_F4)); - if (!key || !e || !BN_set_word(e.get(), RSA_F4)) { - return false; - } + // Generate a bad key. + ASSERT_TRUE(RSA_generate_key_ex(key.get(), 512, e.get(), nullptr)); + ASSERT_TRUE(BN_add(key->p, key->p, BN_value_one())); - if (!RSA_generate_key_ex(key.get(), 512, e.get(), nullptr)) { - fprintf(stderr, "RSA_generate_key_ex failed.\n"); - ERR_print_errors_fp(stderr); - return false; - } - - if (!BN_add(key->p, key->p, BN_value_one())) { - fprintf(stderr, "BN error.\n"); - ERR_print_errors_fp(stderr); - return false; - } - - if (RSA_check_key(key.get())) { - fprintf(stderr, "RSA_check_key passed with invalid key!\n"); - return false; - } + // Bad keys are detected. + EXPECT_FALSE(RSA_check_key(key.get())); + // Bad keys may not be parsed. uint8_t *der; size_t der_len; - if (!RSA_private_key_to_bytes(&der, &der_len, key.get())) { - fprintf(stderr, "RSA_private_key_to_bytes failed to serialize bad key\n."); - return false; - } + ASSERT_TRUE(RSA_private_key_to_bytes(&der, &der_len, key.get())); bssl::UniquePtr<uint8_t> delete_der(der); - key.reset(RSA_private_key_from_bytes(der, der_len)); - if (key) { - fprintf(stderr, "RSA_private_key_from_bytes accepted bad key\n."); - } - - ERR_clear_error(); - return true; + EXPECT_FALSE(key); } -static bool TestOnlyDGiven() { +TEST(RSATest, OnlyDGiven) { static const char kN[] = "00e77bbf3889d4ef36a9a25d4d69f3f632eb4362214c74517da6d6aeaa9bd09ac42b2662" "1cd88f3a6eb013772fc3bf9f83914b6467231c630202c35b3e5808c659"; @@ -716,253 +700,134 @@ static bool TestOnlyDGiven() { "0365db9eb6d73b53b015c40cd8db4de7dd7035c68b5ac1bf786d7a4ee2cea316eaeca21a" "73ac365e58713195f2ae9849348525ca855386b6d028e437a9495a01"; - uint8_t buf[64]; - unsigned buf_len = sizeof(buf); bssl::UniquePtr<RSA> key(RSA_new()); - if (!key || - !BN_hex2bn(&key->n, kN) || - !BN_hex2bn(&key->e, kE) || - !BN_hex2bn(&key->d, kD) || - RSA_size(key.get()) > sizeof(buf)) { - return false; - } + ASSERT_TRUE(key); + ASSERT_TRUE(BN_hex2bn(&key->n, kN)); + ASSERT_TRUE(BN_hex2bn(&key->e, kE)); + ASSERT_TRUE(BN_hex2bn(&key->d, kD)); - if (!RSA_check_key(key.get())) { - fprintf(stderr, "RSA_check_key failed with only n, d, and e given.\n"); - ERR_print_errors_fp(stderr); - return false; - } + // Keys with only n, e, and d are functional. 
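Why a key carrying only n, e, and d can be functional: without p, q, and the CRT exponents, the private-key operation reduces to a single modular exponentiation, s = m^d mod n, which is slower than the CRT path but still correct. A sketch of that idea, not the library's actual code path:

#include <openssl/bn.h>
#include <openssl/rsa.h>

// Illustrative only: computes s = m^d mod n for a key that carries just
// |n| and |d|. The real implementation also handles blinding and padding.
static int RawPrivateOp(BIGNUM *s, const BIGNUM *m, const RSA *key) {
  bssl::UniquePtr<BN_CTX> ctx(BN_CTX_new());
  return ctx && BN_mod_exp(s, m, key->d, key->n, ctx.get());
}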
+ EXPECT_TRUE(RSA_check_key(key.get())); const uint8_t kDummyHash[16] = {0}; - - if (!RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, &buf_len, - key.get())) { - fprintf(stderr, "RSA_sign failed with only n, d, and e given.\n"); - ERR_print_errors_fp(stderr); - return false; - } - - if (!RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, buf_len, - key.get())) { - fprintf(stderr, "RSA_verify failed with only n, d, and e given.\n"); - ERR_print_errors_fp(stderr); - return false; - } + uint8_t buf[64]; + unsigned buf_len = sizeof(buf); + ASSERT_LE(RSA_size(key.get()), sizeof(buf)); + EXPECT_TRUE(RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, + &buf_len, key.get())); + EXPECT_TRUE(RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, + buf_len, key.get())); // Keys without the public exponent must continue to work when blinding is // disabled to support Java's RSAPrivateKeySpec API. See // https://bugs.chromium.org/p/boringssl/issues/detail?id=12. bssl::UniquePtr<RSA> key2(RSA_new()); - if (!key2 || - !BN_hex2bn(&key2->n, kN) || - !BN_hex2bn(&key2->d, kD)) { - return false; - } + ASSERT_TRUE(key2); + ASSERT_TRUE(BN_hex2bn(&key2->n, kN)); + ASSERT_TRUE(BN_hex2bn(&key2->d, kD)); key2->flags |= RSA_FLAG_NO_BLINDING; - if (RSA_size(key2.get()) > sizeof(buf)) { - return false; - } - - if (!RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, &buf_len, - key2.get())) { - fprintf(stderr, "RSA_sign failed with only n and d given.\n"); - ERR_print_errors_fp(stderr); - return false; - } + ASSERT_LE(RSA_size(key2.get()), sizeof(buf)); + EXPECT_TRUE(RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, + &buf_len, key2.get())); // Verify the signature with |key|. |key2| has no public exponent. - if (!RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, buf_len, - key.get())) { - fprintf(stderr, - "Could not verify signature produced from key with only n and d " - "given.\n"); - ERR_print_errors_fp(stderr); - return false; - } - - return true; + EXPECT_TRUE(RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, + buf_len, key.get())); } -static bool TestRecoverCRTParams() { +TEST(RSATest, RecoverCRTParams) { bssl::UniquePtr<BIGNUM> e(BN_new()); - if (!e || !BN_set_word(e.get(), RSA_F4)) { - return false; - } + ASSERT_TRUE(e); + ASSERT_TRUE(BN_set_word(e.get(), RSA_F4)); - ERR_clear_error(); + bssl::UniquePtr<RSA> key1(RSA_new()); + ASSERT_TRUE(key1); + ASSERT_TRUE(RSA_generate_key_ex(key1.get(), 512, e.get(), nullptr)); - for (unsigned i = 0; i < 1; i++) { - bssl::UniquePtr<RSA> key1(RSA_new()); - if (!key1 || - !RSA_generate_key_ex(key1.get(), 512, e.get(), nullptr)) { - fprintf(stderr, "RSA_generate_key_ex failed.\n"); - ERR_print_errors_fp(stderr); - return false; - } - - if (!RSA_check_key(key1.get())) { - fprintf(stderr, "RSA_check_key failed with original key.\n"); - ERR_print_errors_fp(stderr); - return false; - } - - bssl::UniquePtr<RSA> key2(RSA_new()); - if (!key2) { - return false; - } - key2->n = BN_dup(key1->n); - key2->e = BN_dup(key1->e); - key2->d = BN_dup(key1->d); - if (key2->n == nullptr || key2->e == nullptr || key2->d == nullptr) { - return false; - } - - if (!RSA_recover_crt_params(key2.get())) { - fprintf(stderr, "RSA_recover_crt_params failed.\n"); - ERR_print_errors_fp(stderr); - return false; - } - - uint8_t buf[128]; - unsigned buf_len = sizeof(buf); - if (RSA_size(key2.get()) > buf_len) { - return false; - } - - if (!RSA_check_key(key2.get())) { - fprintf(stderr, "RSA_check_key failed with recovered key.\n"); - 
ERR_print_errors_fp(stderr); - return false; - } - - const uint8_t kDummyHash[16] = {0}; - if (!RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, &buf_len, - key2.get())) { - fprintf(stderr, "RSA_sign failed with recovered key.\n"); - ERR_print_errors_fp(stderr); - return false; - } - - if (!RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, buf_len, - key2.get())) { - fprintf(stderr, "RSA_verify failed with recovered key.\n"); - ERR_print_errors_fp(stderr); - return false; - } - } + EXPECT_TRUE(RSA_check_key(key1.get())); + + // Create a copy of the key without CRT parameters. + bssl::UniquePtr<RSA> key2(RSA_new()); + ASSERT_TRUE(key2); + key2->n = BN_dup(key1->n); + key2->e = BN_dup(key1->e); + key2->d = BN_dup(key1->d); + ASSERT_TRUE(key2->n); + ASSERT_TRUE(key2->e); + ASSERT_TRUE(key2->d); + + ASSERT_TRUE(RSA_recover_crt_params(key2.get())); - return true; + // The recovered RSA parameters should work. + EXPECT_TRUE(RSA_check_key(key2.get())); + + uint8_t buf[128]; + unsigned buf_len = sizeof(buf); + ASSERT_LE(RSA_size(key2.get()), buf_len); + + const uint8_t kDummyHash[16] = {0}; + EXPECT_TRUE(RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, + &buf_len, key2.get())); + EXPECT_TRUE(RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, + buf_len, key2.get())); } -static bool TestASN1() { +TEST(RSATest, ASN1) { // Test that private keys may be decoded. - bssl::UniquePtr<RSA> rsa(RSA_private_key_from_bytes(kKey1, sizeof(kKey1) - 1)); - if (!rsa) { - return false; - } + bssl::UniquePtr<RSA> rsa( + RSA_private_key_from_bytes(kKey1, sizeof(kKey1) - 1)); + ASSERT_TRUE(rsa); // Test that the serialization round-trips. uint8_t *der; size_t der_len; - if (!RSA_private_key_to_bytes(&der, &der_len, rsa.get())) { - return false; - } + ASSERT_TRUE(RSA_private_key_to_bytes(&der, &der_len, rsa.get())); bssl::UniquePtr<uint8_t> delete_der(der); - if (der_len != sizeof(kKey1) - 1 || - OPENSSL_memcmp(der, kKey1, der_len) != 0) { - return false; - } + EXPECT_EQ(Bytes(kKey1, sizeof(kKey1) - 1), Bytes(der, der_len)); // Test that serializing public keys works. - if (!RSA_public_key_to_bytes(&der, &der_len, rsa.get())) { - return false; - } + ASSERT_TRUE(RSA_public_key_to_bytes(&der, &der_len, rsa.get())); delete_der.reset(der); // Public keys may be parsed back out. rsa.reset(RSA_public_key_from_bytes(der, der_len)); - if (!rsa || rsa->p != NULL || rsa->q != NULL) { - return false; - } + ASSERT_TRUE(rsa); + EXPECT_FALSE(rsa->p); + EXPECT_FALSE(rsa->q); // Serializing the result round-trips. uint8_t *der2; size_t der2_len; - if (!RSA_public_key_to_bytes(&der2, &der2_len, rsa.get())) { - return false; - } + ASSERT_TRUE(RSA_public_key_to_bytes(&der2, &der2_len, rsa.get())); bssl::UniquePtr<uint8_t> delete_der2(der2); - if (der_len != der2_len || OPENSSL_memcmp(der, der2, der_len) != 0) { - return false; - } + EXPECT_EQ(Bytes(der, der_len), Bytes(der2, der2_len)); // Public keys cannot be serialized as private keys. - if (RSA_private_key_to_bytes(&der, &der_len, rsa.get())) { + int ok = RSA_private_key_to_bytes(&der, &der_len, rsa.get()); + if (ok) { OPENSSL_free(der); - return false; } + EXPECT_FALSE(ok); ERR_clear_error(); // Public keys with negative moduli are invalid. rsa.reset(RSA_public_key_from_bytes(kEstonianRSAKey, sizeof(kEstonianRSAKey))); - if (rsa) { - return false; - } + EXPECT_FALSE(rsa); ERR_clear_error(); // But |RSA_parse_public_key_buggy| will accept it. 
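The CBS byte-string API used just below wraps a buffer and advances through it as fields are parsed; the standard idiom is to check CBS_len() afterwards so trailing garbage is rejected. A sketch of that idiom using the non-buggy parser (assuming <openssl/bytestring.h> and <openssl/rsa.h>):

#include <openssl/bytestring.h>
#include <openssl/rsa.h>

// Parse a DER RSAPublicKey and insist the input was consumed exactly.
static bssl::UniquePtr<RSA> ParsePublicKeyStrict(const uint8_t *der,
                                                 size_t der_len) {
  CBS cbs;
  CBS_init(&cbs, der, der_len);
  bssl::UniquePtr<RSA> rsa(RSA_parse_public_key(&cbs));
  if (!rsa || CBS_len(&cbs) != 0) {
    return nullptr;  // parse error or trailing bytes
  }
  return rsa;
}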
CBS cbs; CBS_init(&cbs, kEstonianRSAKey, sizeof(kEstonianRSAKey)); rsa.reset(RSA_parse_public_key_buggy(&cbs)); - if (!rsa || CBS_len(&cbs) != 0) { - return false; - } - - return true; + EXPECT_TRUE(rsa); + EXPECT_EQ(0u, CBS_len(&cbs)); } -static bool TestBadExponent() { - bssl::UniquePtr<RSA> rsa(RSA_public_key_from_bytes(kExponent1RSAKey, - sizeof(kExponent1RSAKey))); - - if (rsa) { - fprintf(stderr, "kExponent1RSAKey parsed but should have failed.\n"); - return false; - } - +TEST(RSATest, BadExponent) { + bssl::UniquePtr<RSA> rsa( + RSA_public_key_from_bytes(kExponent1RSAKey, sizeof(kExponent1RSAKey))); + EXPECT_FALSE(rsa); ERR_clear_error(); - return true; -} - -int main(int argc, char *argv[]) { - CRYPTO_library_init(); - - if (!TestRSA(kKey1, sizeof(kKey1) - 1, kOAEPCiphertext1, - sizeof(kOAEPCiphertext1) - 1) || - !TestRSA(kKey2, sizeof(kKey2) - 1, kOAEPCiphertext2, - sizeof(kOAEPCiphertext2) - 1) || - !TestRSA(kKey3, sizeof(kKey3) - 1, kOAEPCiphertext3, - sizeof(kOAEPCiphertext3) - 1) || - !TestOnlyDGiven() || - !TestRecoverCRTParams() || - !TestBadKey() || - !TestMultiPrimeKey(2, kTwoPrimeKey, sizeof(kTwoPrimeKey) - 1, - kTwoPrimeEncryptedMessage, - sizeof(kTwoPrimeEncryptedMessage)) || - !TestMultiPrimeKey(3, kThreePrimeKey, sizeof(kThreePrimeKey) - 1, - kThreePrimeEncryptedMessage, - sizeof(kThreePrimeEncryptedMessage)) || - !TestMultiPrimeKey(6, kSixPrimeKey, sizeof(kSixPrimeKey) - 1, - kSixPrimeEncryptedMessage, - sizeof(kSixPrimeEncryptedMessage)) || - !TestMultiPrimeKeygen() || - !TestASN1() || - !TestBadExponent()) { - return 1; - } - - printf("PASS\n"); - return 0; } diff --git a/src/crypto/sha/asm/sha1-586.pl b/src/crypto/sha/asm/sha1-586.pl index e815e2b5..acf383d4 100644 --- a/src/crypto/sha/asm/sha1-586.pl +++ b/src/crypto/sha/asm/sha1-586.pl @@ -97,10 +97,12 @@ # Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73% # Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53% # Haswell 6.5 4.3/+51% 4.1(**)/+58% +# Skylake 6.4 4.1/+55% 4.1(**)/+55% # Bulldozer 11.6 6.0/+92% # VIA Nano 10.6 7.5/+41% # Atom 12.5 9.3(*)/+35% # Silvermont 14.5 9.9(*)/+46% +# Goldmont 8.8 6.7/+30% 1.7(***)/+415% # # (*) Loop is 1056 instructions long and expected result is ~8.25. # The discrepancy is because of front-end limitations, so @@ -108,6 +110,8 @@ # limited parallelism. # # (**) As per above comment, the result is for AVX *plus* sh[rl]d. 
+# +# (***) SHAEXT result $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); diff --git a/src/crypto/sha/asm/sha1-x86_64.pl b/src/crypto/sha/asm/sha1-x86_64.pl index ff960bb9..9a13f6c5 100644..100755 --- a/src/crypto/sha/asm/sha1-x86_64.pl +++ b/src/crypto/sha/asm/sha1-x86_64.pl @@ -73,13 +73,16 @@ # Sandy Bridge 7.70 6.10/+26% 4.99/+54% # Ivy Bridge 6.06 4.67/+30% 4.60/+32% # Haswell 5.45 4.15/+31% 3.57/+53% +# Skylake 5.18 4.06/+28% 3.54/+46% # Bulldozer 9.11 5.95/+53% # VIA Nano 9.32 7.15/+30% # Atom 10.3 9.17/+12% # Silvermont 13.1(*) 9.37/+40% +# Goldmont 8.13 6.42/+27% 1.70/+380%(**) # # (*) obviously suboptimal result, nothing was done about it, # because SSSE3 code is compiled unconditionally; +# (**) SHAEXT result $flavour = shift; $output = shift; @@ -246,7 +249,7 @@ sha1_block_data_order: jz .Lialu ___ $code.=<<___ if ($shaext); - test \$`1<<29`,%r10d # check SHA bit + test \$`1<<29`,%r10d # check SHA bit jnz _shaext_shortcut ___ $code.=<<___ if ($avx>1); @@ -444,7 +447,8 @@ my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization my @T=("%esi","%edi"); my $j=0; my $rx=0; -my $K_XX_XX="%r11"; +my $K_XX_XX="%r14"; +my $fp="%r11"; my $_rol=sub { &rol(@_) }; my $_ror=sub { &ror(@_) }; @@ -465,7 +469,7 @@ $code.=<<___; .align 16 sha1_block_data_order_ssse3: _ssse3_shortcut: - mov %rsp,%rax + mov %rsp,$fp # frame pointer push %rbx push %rbp push %r12 @@ -474,16 +478,15 @@ _ssse3_shortcut: lea `-64-($win64?6*16:0)`(%rsp),%rsp ___ $code.=<<___ if ($win64); - movaps %xmm6,-40-6*16(%rax) - movaps %xmm7,-40-5*16(%rax) - movaps %xmm8,-40-4*16(%rax) - movaps %xmm9,-40-3*16(%rax) - movaps %xmm10,-40-2*16(%rax) - movaps %xmm11,-40-1*16(%rax) + movaps %xmm6,-40-6*16($fp) + movaps %xmm7,-40-5*16($fp) + movaps %xmm8,-40-4*16($fp) + movaps %xmm9,-40-3*16($fp) + movaps %xmm10,-40-2*16($fp) + movaps %xmm11,-40-1*16($fp) .Lprologue_ssse3: ___ $code.=<<___; - mov %rax,%r14 # original %rsp and \$-64,%rsp mov %rdi,$ctx # reassigned argument mov %rsi,$inp # reassigned argument @@ -890,21 +893,20 @@ $code.=<<___; mov $E,16($ctx) ___ $code.=<<___ if ($win64); - movaps -40-6*16(%r14),%xmm6 - movaps -40-5*16(%r14),%xmm7 - movaps -40-4*16(%r14),%xmm8 - movaps -40-3*16(%r14),%xmm9 - movaps -40-2*16(%r14),%xmm10 - movaps -40-1*16(%r14),%xmm11 + movaps -40-6*16($fp),%xmm6 + movaps -40-5*16($fp),%xmm7 + movaps -40-4*16($fp),%xmm8 + movaps -40-3*16($fp),%xmm9 + movaps -40-2*16($fp),%xmm10 + movaps -40-1*16($fp),%xmm11 ___ $code.=<<___; - lea (%r14),%rsi - mov -40(%rsi),%r14 - mov -32(%rsi),%r13 - mov -24(%rsi),%r12 - mov -16(%rsi),%rbp - mov -8(%rsi),%rbx - lea (%rsi),%rsp + mov -40($fp),%r14 + mov -32($fp),%r13 + mov -24($fp),%r12 + mov -16($fp),%rbp + mov -8($fp),%rbx + lea ($fp),%rsp .Lepilogue_ssse3: ret .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 @@ -927,7 +929,7 @@ $code.=<<___; .align 16 sha1_block_data_order_avx: _avx_shortcut: - mov %rsp,%rax + mov %rsp,$fp push %rbx push %rbp push %r12 @@ -937,16 +939,15 @@ _avx_shortcut: vzeroupper ___ $code.=<<___ if ($win64); - vmovaps %xmm6,-40-6*16(%rax) - vmovaps %xmm7,-40-5*16(%rax) - vmovaps %xmm8,-40-4*16(%rax) - vmovaps %xmm9,-40-3*16(%rax) - vmovaps %xmm10,-40-2*16(%rax) - vmovaps %xmm11,-40-1*16(%rax) + vmovaps %xmm6,-40-6*16($fp) + vmovaps %xmm7,-40-5*16($fp) + vmovaps %xmm8,-40-4*16($fp) + vmovaps %xmm9,-40-3*16($fp) + vmovaps %xmm10,-40-2*16($fp) + vmovaps %xmm11,-40-1*16($fp) .Lprologue_avx: ___ $code.=<<___; - mov %rax,%r14 # original %rsp and \$-64,%rsp mov %rdi,$ctx # 
reassigned argument mov %rsi,$inp # reassigned argument @@ -1254,21 +1255,20 @@ $code.=<<___; mov $E,16($ctx) ___ $code.=<<___ if ($win64); - movaps -40-6*16(%r14),%xmm6 - movaps -40-5*16(%r14),%xmm7 - movaps -40-4*16(%r14),%xmm8 - movaps -40-3*16(%r14),%xmm9 - movaps -40-2*16(%r14),%xmm10 - movaps -40-1*16(%r14),%xmm11 + movaps -40-6*16($fp),%xmm6 + movaps -40-5*16($fp),%xmm7 + movaps -40-4*16($fp),%xmm8 + movaps -40-3*16($fp),%xmm9 + movaps -40-2*16($fp),%xmm10 + movaps -40-1*16($fp),%xmm11 ___ $code.=<<___; - lea (%r14),%rsi - mov -40(%rsi),%r14 - mov -32(%rsi),%r13 - mov -24(%rsi),%r12 - mov -16(%rsi),%rbp - mov -8(%rsi),%rbx - lea (%rsi),%rsp + mov -40($fp),%r14 + mov -32($fp),%r13 + mov -24($fp),%r12 + mov -16($fp),%rbp + mov -8($fp),%rbx + lea ($fp),%rsp .Lepilogue_avx: ret .size sha1_block_data_order_avx,.-sha1_block_data_order_avx @@ -1294,7 +1294,7 @@ $code.=<<___; .align 16 sha1_block_data_order_avx2: _avx2_shortcut: - mov %rsp,%rax + mov %rsp,$fp push %rbx push %rbp push %r12 @@ -1304,16 +1304,15 @@ _avx2_shortcut: ___ $code.=<<___ if ($win64); lea -6*16(%rsp),%rsp - vmovaps %xmm6,-40-6*16(%rax) - vmovaps %xmm7,-40-5*16(%rax) - vmovaps %xmm8,-40-4*16(%rax) - vmovaps %xmm9,-40-3*16(%rax) - vmovaps %xmm10,-40-2*16(%rax) - vmovaps %xmm11,-40-1*16(%rax) + vmovaps %xmm6,-40-6*16($fp) + vmovaps %xmm7,-40-5*16($fp) + vmovaps %xmm8,-40-4*16($fp) + vmovaps %xmm9,-40-3*16($fp) + vmovaps %xmm10,-40-2*16($fp) + vmovaps %xmm11,-40-1*16($fp) .Lprologue_avx2: ___ $code.=<<___; - mov %rax,%r14 # original %rsp mov %rdi,$ctx # reassigned argument mov %rsi,$inp # reassigned argument mov %rdx,$num # reassigned argument @@ -1733,21 +1732,20 @@ $code.=<<___; vzeroupper ___ $code.=<<___ if ($win64); - movaps -40-6*16(%r14),%xmm6 - movaps -40-5*16(%r14),%xmm7 - movaps -40-4*16(%r14),%xmm8 - movaps -40-3*16(%r14),%xmm9 - movaps -40-2*16(%r14),%xmm10 - movaps -40-1*16(%r14),%xmm11 + movaps -40-6*16($fp),%xmm6 + movaps -40-5*16($fp),%xmm7 + movaps -40-4*16($fp),%xmm8 + movaps -40-3*16($fp),%xmm9 + movaps -40-2*16($fp),%xmm10 + movaps -40-1*16($fp),%xmm11 ___ $code.=<<___; - lea (%r14),%rsi - mov -40(%rsi),%r14 - mov -32(%rsi),%r13 - mov -24(%rsi),%r12 - mov -16(%rsi),%rbp - mov -8(%rsi),%rbx - lea (%rsi),%rsp + mov -40($fp),%r14 + mov -32($fp),%r13 + mov -24($fp),%r12 + mov -16($fp),%rbp + mov -8($fp),%rbx + lea ($fp),%rsp .Lepilogue_avx2: ret .size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 @@ -1890,15 +1888,13 @@ ssse3_handler: cmp %r10,%rbx # context->Rip<prologue label jb .Lcommon_seh_tail - mov 152($context),%rax # pull context->Rsp + mov 208($context),%rax # pull context->R11 mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail - mov 232($context),%rax # pull context->R14 - lea -40-6*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$12,%ecx diff --git a/src/crypto/sha/asm/sha256-586.pl b/src/crypto/sha/asm/sha256-586.pl index 8f4311b6..d85004c8 100644 --- a/src/crypto/sha/asm/sha256-586.pl +++ b/src/crypto/sha/asm/sha256-586.pl @@ -40,7 +40,7 @@ # # Performance in clock cycles per processed byte (less is better): # -# gcc icc x86 asm(*) SIMD x86_64 asm(**) +# gcc icc x86 asm(*) SIMD x86_64 asm(**) # Pentium 46 57 40/38 - - # PIII 36 33 27/24 - - # P4 41 38 28 - 17.3 @@ -50,14 +50,17 @@ # Sandy Bridge 25 - 15.9 12.4 11.6 # Ivy Bridge 24 - 15.0 11.4 10.3 # Haswell 22 - 13.9 9.46 7.80 +# Skylake 20 - 14.9 9.50 7.70 # Bulldozer 36 - 27/22 17.0 13.6 # VIA Nano 36 - 25/22 16.8 16.5 # Atom 50 - 
30/25 21.9 18.9 # Silvermont 40 - 34/31 22.9 20.6 +# Goldmont 29 - 20 16.3(***) # # (*) numbers after slash are for unrolled loop, where applicable; # (**) x86_64 assembly performance is presented for reference # purposes, results are best-available; +# (***) SHAEXT result is 4.1, strangely enough better than 64-bit one; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); @@ -263,7 +266,7 @@ my $suffix=shift; &mov ($Coff,"ecx"); &mov ($Doff,"edi"); &mov (&DWP(0,"esp"),"ebx"); # magic - &mov ($E,&DWP(16,"esi")); + &mov ($E,&DWP(16,"esi")); &mov ("ebx",&DWP(20,"esi")); &mov ("ecx",&DWP(24,"esi")); &mov ("edi",&DWP(28,"esi")); @@ -372,7 +375,7 @@ my @AH=($A,$K256); &xor ($AH[1],"ecx"); # magic &mov (&DWP(8,"esp"),"ecx"); &mov (&DWP(12,"esp"),"ebx"); - &mov ($E,&DWP(16,"esi")); + &mov ($E,&DWP(16,"esi")); &mov ("ebx",&DWP(20,"esi")); &mov ("ecx",&DWP(24,"esi")); &mov ("esi",&DWP(28,"esi")); diff --git a/src/crypto/sha/asm/sha512-586.pl b/src/crypto/sha/asm/sha512-586.pl index d0f91010..6d909eda 100644 --- a/src/crypto/sha/asm/sha512-586.pl +++ b/src/crypto/sha/asm/sha512-586.pl @@ -25,10 +25,12 @@ # Sandy Bridge 58 - 35 11.9 11.2 # Ivy Bridge 50 - 33 11.5 8.17 # Haswell 46 - 29 11.3 7.66 +# Skylake 40 - 26 13.3 7.25 # Bulldozer 121 - 50 14.0 13.5 # VIA Nano 91 - 52 33 14.7 # Atom 126 - 68 48(***) 14.7 # Silvermont 97 - 58 42(***) 17.5 +# Goldmont 80 - 48 19.5 12.0 # # (*) whichever best applicable. # (**) x86_64 assembler performance is presented for reference @@ -376,7 +378,7 @@ if ($sse2) { &set_label("16_79_sse2",16); for ($j=0;$j<2;$j++) { # 2x unroll - #&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15 + #&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15 &movq ("mm5",&QWP(8*(9+16-14),"esp")); &movq ("mm1","mm7"); &psrlq ("mm7",1); diff --git a/src/crypto/sha/asm/sha512-armv8.pl b/src/crypto/sha/asm/sha512-armv8.pl index 75d40431..494e6335 100644 --- a/src/crypto/sha/asm/sha512-armv8.pl +++ b/src/crypto/sha/asm/sha512-armv8.pl @@ -18,7 +18,7 @@ # Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) # Denver 2.01 10.5 (+26%) 6.70 (+8%) # X-Gene 20.0 (+100%) 12.8 (+300%(***)) -# +# # (*) Software SHA256 results are of lesser relevance, presented # mostly for informational purposes. # (**) The result is a trade-off: it's possible to improve it by diff --git a/src/crypto/sha/asm/sha512-x86_64.pl b/src/crypto/sha/asm/sha512-x86_64.pl index 186aa9aa..5716791d 100644..100755 --- a/src/crypto/sha/asm/sha512-x86_64.pl +++ b/src/crypto/sha/asm/sha512-x86_64.pl @@ -34,7 +34,7 @@ # level parallelism, on a given CPU implementation in this case. # # Special note on Intel EM64T. While Opteron CPU exhibits perfect -# perfromance ratio of 1.5 between 64- and 32-bit flavors [see above], +# performance ratio of 1.5 between 64- and 32-bit flavors [see above], # [currently available] EM64T CPUs apparently are far from it. 
On the # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit # sha256_block:-( This is presumably because 64-bit shifts/rotates @@ -86,12 +86,14 @@ # Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) # Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%) # Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%) +# Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%) # Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%) # VIA Nano 23.0 16.5(+39%) - 14.7 - # Atom 23.0 18.9(+22%) - 14.7 - # Silvermont 27.4 20.6(+33%) - 17.5 - +# Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 - # -# (*) whichever best applicable; +# (*) whichever best applicable, including SHAEXT; # (**) switch from ror to shrd stands for fair share of improvement; # (***) execution time is fully determined by remaining integer-only # part, body_00_15; reducing the amount of SIMD instructions @@ -284,13 +286,13 @@ $code.=<<___ if ($SZ==4); jnz .Lssse3_shortcut ___ $code.=<<___; + mov %rsp,%rax # copy %rsp push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 - mov %rsp,%r11 # copy %rsp shl \$4,%rdx # num*16 sub \$$framesz,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ @@ -298,7 +300,7 @@ $code.=<<___; mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp + mov %rax,$_rsp # save copy of %rsp .Lprologue: mov $SZ*0($ctx),$A @@ -365,13 +367,13 @@ $code.=<<___; jb .Lloop mov $_rsp,%rsi - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lepilogue: ret .size $func,.-$func @@ -744,13 +746,13 @@ $code.=<<___; .align 64 ${func}_ssse3: .Lssse3_shortcut: + mov %rsp,%rax # copy %rsp push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 - mov %rsp,%r11 # copy %rsp shl \$4,%rdx # num*16 sub \$`$framesz+$win64*16*4`,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ @@ -758,7 +760,7 @@ ${func}_ssse3: mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp + mov %rax,$_rsp # save copy of %rsp ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) @@ -1065,13 +1067,13 @@ $code.=<<___ if ($win64); movaps 16*$SZ+80(%rsp),%xmm9 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lepilogue_ssse3: ret .size ${func}_ssse3,.-${func}_ssse3 @@ -1088,13 +1090,13 @@ $code.=<<___; .align 64 ${func}_xop: .Lxop_shortcut: + mov %rsp,%rax # copy %rsp push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 - mov %rsp,%r11 # copy %rsp shl \$4,%rdx # num*16 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ @@ -1102,7 +1104,7 @@ ${func}_xop: mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp + mov %rax,$_rsp # save copy of %rsp ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) @@ -1442,13 +1444,13 @@ $code.=<<___ if ($win64 && $SZ>4); movaps 16*$SZ+112(%rsp),%xmm11 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - 
mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lepilogue_xop: ret .size ${func}_xop,.-${func}_xop @@ -1464,13 +1466,13 @@ $code.=<<___; .align 64 ${func}_avx: .Lavx_shortcut: + mov %rsp,%rax # copy %rsp push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 - mov %rsp,%r11 # copy %rsp shl \$4,%rdx # num*16 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ @@ -1478,7 +1480,7 @@ ${func}_avx: mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp + mov %rax,$_rsp # save copy of %rsp ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) @@ -1750,13 +1752,13 @@ $code.=<<___ if ($win64 && $SZ>4); movaps 16*$SZ+112(%rsp),%xmm11 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lepilogue_avx: ret .size ${func}_avx,.-${func}_avx @@ -1766,7 +1768,7 @@ if ($avx>1) {{ ###################################################################### # AVX2+BMI code path # -my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp +my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp my $PUSH8=8*2*$SZ; use integer; @@ -1815,13 +1817,13 @@ $code.=<<___; .align 64 ${func}_avx2: .Lavx2_shortcut: + mov %rsp,%rax # copy %rsp push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 - mov %rsp,%r11 # copy %rsp sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp shl \$4,%rdx # num*16 and \$-256*$SZ,%rsp # align stack frame @@ -1830,7 +1832,7 @@ ${func}_avx2: mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp + mov %rax,$_rsp # save copy of %rsp ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) @@ -2124,13 +2126,13 @@ $code.=<<___ if ($win64 && $SZ>4); movaps 16*$SZ+112(%rsp),%xmm11 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lepilogue_avx2: ret .size ${func}_avx2,.-${func}_avx2 @@ -2192,7 +2194,6 @@ ___ $code.=<<___; mov %rax,%rsi # put aside Rsp mov 16*$SZ+3*8(%rax),%rax # pull $_rsp - lea 48(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp diff --git a/src/crypto/test/test_util.h b/src/crypto/test/test_util.h index d834973e..1447bf69 100644 --- a/src/crypto/test/test_util.h +++ b/src/crypto/test/test_util.h @@ -18,6 +18,7 @@ #include <stddef.h> #include <stdint.h> #include <stdio.h> +#include <string.h> #include <iosfwd> @@ -34,6 +35,9 @@ struct Bytes { Bytes(const uint8_t *data_arg, size_t len_arg) : data(data_arg), len(len_arg) {} + Bytes(const char *str) + : data(reinterpret_cast<const uint8_t *>(str)), len(strlen(str)) {} + template <size_t N> Bytes(const uint8_t (&array)[N]) : data(array), len(N) {} diff --git a/src/crypto/x509/x_name.c b/src/crypto/x509/x_name.c index f97081dc..4abdc916 100644 --- a/src/crypto/x509/x_name.c +++ b/src/crypto/x509/x_name.c @@ -229,12 +229,11 @@ static int 
x509_name_ex_d2i(ASN1_VALUE **val, if (*val) x509_name_ex_free(val, NULL); + if (!x509_name_ex_new(&nm.a, NULL)) + goto err; /* We've decoded it: now cache encoding */ - if (!x509_name_ex_new(&nm.a, NULL) || !BUF_MEM_grow(nm.x->bytes, p - q)) { - sk_STACK_OF_X509_NAME_ENTRY_pop_free(intname.s, - local_sk_X509_NAME_ENTRY_pop_free); + if (!BUF_MEM_grow(nm.x->bytes, p - q)) goto err; - } OPENSSL_memcpy(nm.x->bytes->data, q, p - q); /* Convert internal representation to X509_NAME structure */ @@ -245,13 +244,14 @@ static int x509_name_ex_d2i(ASN1_VALUE **val, entry->set = i; if (!sk_X509_NAME_ENTRY_push(nm.x->entries, entry)) goto err; + sk_X509_NAME_ENTRY_set(entries, j, NULL); } - sk_X509_NAME_ENTRY_free(entries); } - sk_STACK_OF_X509_NAME_ENTRY_free(intname.s); ret = x509_name_canon(nm.x); if (!ret) goto err; + sk_STACK_OF_X509_NAME_ENTRY_pop_free(intname.s, + local_sk_X509_NAME_ENTRY_free); nm.x->modified = 0; *val = nm.a; *in = p; @@ -259,6 +259,8 @@ static int x509_name_ex_d2i(ASN1_VALUE **val, err: if (nm.x != NULL) X509_NAME_free(nm.x); + sk_STACK_OF_X509_NAME_ENTRY_pop_free(intname.s, + local_sk_X509_NAME_ENTRY_pop_free); OPENSSL_PUT_ERROR(X509, ERR_R_ASN1_LIB); return 0; } @@ -307,8 +309,10 @@ static int x509_name_encode(X509_NAME *a) entries = sk_X509_NAME_ENTRY_new_null(); if (!entries) goto memerr; - if (!sk_STACK_OF_X509_NAME_ENTRY_push(intname.s, entries)) + if (!sk_STACK_OF_X509_NAME_ENTRY_push(intname.s, entries)) { + sk_X509_NAME_ENTRY_free(entries); goto memerr; + } set = entry->set; } if (!sk_X509_NAME_ENTRY_push(entries, entry)) diff --git a/src/include/openssl/ecdsa.h b/src/include/openssl/ecdsa.h index 38907447..8a158b87 100644 --- a/src/include/openssl/ecdsa.h +++ b/src/include/openssl/ecdsa.h @@ -75,7 +75,7 @@ extern "C" { * zero otherwise. */ OPENSSL_EXPORT int ECDSA_sign(int type, const uint8_t *digest, size_t digest_len, uint8_t *sig, - unsigned int *sig_len, EC_KEY *key); + unsigned int *sig_len, const EC_KEY *key); /* ECDSA_verify verifies that |sig_len| bytes from |sig| constitute a valid * signature by |key| of |digest|. (The |type| argument should be zero.) It @@ -83,7 +83,7 @@ OPENSSL_EXPORT int ECDSA_sign(int type, const uint8_t *digest, * occurred. */ OPENSSL_EXPORT int ECDSA_verify(int type, const uint8_t *digest, size_t digest_len, const uint8_t *sig, - size_t sig_len, EC_KEY *key); + size_t sig_len, const EC_KEY *key); /* ECDSA_size returns the maximum size of an ECDSA signature using |key|. It * returns zero on error. */ @@ -109,13 +109,13 @@ OPENSSL_EXPORT void ECDSA_SIG_free(ECDSA_SIG *sig); /* ECDSA_do_sign signs |digest_len| bytes from |digest| with |key| and returns * the resulting signature structure, or NULL on error. */ OPENSSL_EXPORT ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, - size_t digest_len, EC_KEY *key); + size_t digest_len, const EC_KEY *key); /* ECDSA_do_verify verifies that |sig| constitutes a valid signature by |key| * of |digest|. It returns one on success or zero if the signature is invalid * or on error. */ OPENSSL_EXPORT int ECDSA_do_verify(const uint8_t *digest, size_t digest_len, - const ECDSA_SIG *sig, EC_KEY *key); + const ECDSA_SIG *sig, const EC_KEY *key); /* Signing with precomputation. @@ -128,22 +128,22 @@ OPENSSL_EXPORT int ECDSA_do_verify(const uint8_t *digest, size_t digest_len, /* ECDSA_sign_setup precomputes parts of an ECDSA signing operation. It sets * |*kinv| and |*rp| to the precomputed values and uses the |ctx| argument, if * not NULL. It returns one on success and zero otherwise. 
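The theme of this header change is const-correctness: signing and verifying no longer take a mutable |EC_KEY|, so a single key can be shared across callers without casts. A minimal sketch under the new signatures, with an illustrative helper name and a caller-supplied 32-byte digest:

    #include <openssl/ec_key.h>
    #include <openssl/ecdsa.h>

    static bool SignAndVerify(const EC_KEY *key, const uint8_t digest[32]) {
      uint8_t sig[256];
      unsigned sig_len = sizeof(sig);
      // ECDSA_sign requires a buffer of at least ECDSA_size(key) bytes.
      if (ECDSA_size(key) > sizeof(sig) ||
          !ECDSA_sign(0, digest, 32, sig, &sig_len, key)) {
        return false;
      }
      return ECDSA_verify(0, digest, 32, sig, sig_len, key) == 1;
    }

Note both calls take the same const pointer, which is the point of the hunk.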
*/ -OPENSSL_EXPORT int ECDSA_sign_setup(EC_KEY *eckey, BN_CTX *ctx, BIGNUM **kinv, - BIGNUM **rp); +OPENSSL_EXPORT int ECDSA_sign_setup(const EC_KEY *eckey, BN_CTX *ctx, + BIGNUM **kinv, BIGNUM **rp); /* ECDSA_do_sign_ex is the same as |ECDSA_do_sign| but takes precomputed values * as generated by |ECDSA_sign_setup|. */ OPENSSL_EXPORT ECDSA_SIG *ECDSA_do_sign_ex(const uint8_t *digest, size_t digest_len, const BIGNUM *kinv, const BIGNUM *rp, - EC_KEY *eckey); + const EC_KEY *eckey); /* ECDSA_sign_ex is the same as |ECDSA_sign| but takes precomputed values as * generated by |ECDSA_sign_setup|. */ OPENSSL_EXPORT int ECDSA_sign_ex(int type, const uint8_t *digest, size_t digest_len, uint8_t *sig, unsigned int *sig_len, const BIGNUM *kinv, - const BIGNUM *rp, EC_KEY *eckey); + const BIGNUM *rp, const EC_KEY *eckey); /* ASN.1 functions. */ diff --git a/src/include/openssl/pkcs8.h b/src/include/openssl/pkcs8.h index 141ed8d0..70d6f495 100644 --- a/src/include/openssl/pkcs8.h +++ b/src/include/openssl/pkcs8.h @@ -66,45 +66,42 @@ extern "C" { #endif -/* PKCS8_encrypt_pbe serializes and encrypts a PKCS8_PRIV_KEY_INFO with PBES1 or +/* PKCS8_encrypt serializes and encrypts a PKCS8_PRIV_KEY_INFO with PBES1 or * PBES2 as defined in PKCS #5. Only pbeWithSHAAnd128BitRC4, * pbeWithSHAAnd3-KeyTripleDES-CBC and pbeWithSHA1And40BitRC2, defined in PKCS * #12, and PBES2, are supported. PBES2 is selected by setting |cipher| and * passing -1 for |pbe_nid|. Otherwise, PBES1 is used and |cipher| is ignored. * - * The |pass_raw_len| bytes pointed to by |pass_raw| are used as the password. - * Note that any conversions from the password as supplied in a text string - * (such as those specified in B.1 of PKCS #12) must be performed by the caller. + * |pass| is used as the password. If a PBES1 scheme from PKCS #12 is used, this + * will be converted to a raw byte string as specified in B.1 of PKCS #12. If + * |pass| is NULL, it will be encoded as the empty byte string rather than two + * zero bytes, the PKCS #12 encoding of the empty string. * * If |salt| is NULL, a random salt of |salt_len| bytes is generated. If * |salt_len| is zero, a default salt length is used instead. * - * The resulting structure is stored in an X509_SIG which must be freed by the - * caller. - * - * TODO(davidben): Really? An X509_SIG? OpenSSL probably did that because it has - * the same structure as EncryptedPrivateKeyInfo. */ -OPENSSL_EXPORT X509_SIG *PKCS8_encrypt_pbe(int pbe_nid, - const EVP_CIPHER *cipher, - const uint8_t *pass_raw, - size_t pass_raw_len, - const uint8_t *salt, size_t salt_len, - int iterations, - PKCS8_PRIV_KEY_INFO *p8inf); + * The resulting structure is stored in an |X509_SIG| which must be freed by the + * caller. */ +OPENSSL_EXPORT X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher, + const char *pass, int pass_len, + const uint8_t *salt, size_t salt_len, + int iterations, + PKCS8_PRIV_KEY_INFO *p8inf); -/* PKCS8_decrypt_pbe decrypts and decodes a PKCS8_PRIV_KEY_INFO with PBES1 or - * PBES2 as defined in PKCS #5. Only pbeWithSHAAnd128BitRC4, +/* PKCS8_decrypt decrypts and decodes a PKCS8_PRIV_KEY_INFO with PBES1 or PBES2 + * as defined in PKCS #5. Only pbeWithSHAAnd128BitRC4, * pbeWithSHAAnd3-KeyTripleDES-CBC and pbeWithSHA1And40BitRC2, and PBES2, * defined in PKCS #12, are supported. * - * The |pass_raw_len| bytes pointed to by |pass_raw| are used as the password. 
- * Note that any conversions from the password as supplied in a text string - * (such as those specified in B.1 of PKCS #12) must be performed by the caller. + * |pass| is used as the password. If a PBES1 scheme from PKCS #12 is used, this + * will be converted to a raw byte string as specified in B.1 of PKCS #12. If + * |pass| is NULL, it will be encoded as the empty byte string rather than two + * zero bytes, the PKCS #12 encoding of the empty string. * * The resulting structure must be freed by the caller. */ -OPENSSL_EXPORT PKCS8_PRIV_KEY_INFO *PKCS8_decrypt_pbe(X509_SIG *pkcs8, - const uint8_t *pass_raw, - size_t pass_raw_len); +OPENSSL_EXPORT PKCS8_PRIV_KEY_INFO *PKCS8_decrypt(X509_SIG *pkcs8, + const char *pass, + int pass_len); /* PKCS12_get_key_and_certs parses a PKCS#12 structure from |in|, authenticates * and decrypts it using |password|, sets |*out_key| to the included private @@ -117,24 +114,6 @@ OPENSSL_EXPORT int PKCS12_get_key_and_certs(EVP_PKEY **out_key, /* Deprecated functions. */ -/* PKCS8_encrypt calls |PKCS8_encrypt_pbe| after (in the PKCS#12 case) treating - * |pass| as an ASCII string, appending U+0000, and converting to UCS-2. (So the - * empty password encodes as two NUL bytes.) In the PBES2 case, the password is - * unchanged. */ -OPENSSL_EXPORT X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher, - const char *pass, int pass_len, - const uint8_t *salt, size_t salt_len, - int iterations, - PKCS8_PRIV_KEY_INFO *p8inf); - -/* PKCS8_decrypt calls PKCS8_decrypt_pbe after (in the PKCS#12 case) treating - * |pass| as an ASCII string, appending U+0000, and converting to UCS-2. (So the - * empty password encodes as two NUL bytes.) In the PBES2 case, the password is - * unchanged. */ -OPENSSL_EXPORT PKCS8_PRIV_KEY_INFO *PKCS8_decrypt(X509_SIG *pkcs8, - const char *pass, - int pass_len); - /* PKCS12_PBE_add does nothing. It exists for compatibility with OpenSSL. */ OPENSSL_EXPORT void PKCS12_PBE_add(void); diff --git a/src/include/openssl/ssl.h b/src/include/openssl/ssl.h index 497093db..23e5e9b5 100644 --- a/src/include/openssl/ssl.h +++ b/src/include/openssl/ssl.h @@ -2241,11 +2241,11 @@ OPENSSL_EXPORT void SSL_CTX_set_cert_verify_callback( /* SSL_enable_signed_cert_timestamps causes |ssl| (which must be the client end * of a connection) to request SCTs from the server. See - * https://tools.ietf.org/html/rfc6962. It returns one. + * https://tools.ietf.org/html/rfc6962. * * Call |SSL_get0_signed_cert_timestamp_list| to recover the SCT after the * handshake. */ -OPENSSL_EXPORT int SSL_enable_signed_cert_timestamps(SSL *ssl); +OPENSSL_EXPORT void SSL_enable_signed_cert_timestamps(SSL *ssl); /* SSL_CTX_enable_signed_cert_timestamps enables SCT requests on all client SSL * objects created from |ctx|. @@ -2255,12 +2255,11 @@ OPENSSL_EXPORT int SSL_enable_signed_cert_timestamps(SSL *ssl); OPENSSL_EXPORT void SSL_CTX_enable_signed_cert_timestamps(SSL_CTX *ctx); /* SSL_enable_ocsp_stapling causes |ssl| (which must be the client end of a - * connection) to request a stapled OCSP response from the server. It returns - * one. + * connection) to request a stapled OCSP response from the server. * * Call |SSL_get0_ocsp_response| to recover the OCSP response after the * handshake. */ -OPENSSL_EXPORT int SSL_enable_ocsp_stapling(SSL *ssl); +OPENSSL_EXPORT void SSL_enable_ocsp_stapling(SSL *ssl); /* SSL_CTX_enable_ocsp_stapling enables OCSP stapling on all client SSL objects * created from |ctx|. 
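The return-type change is possible because these setters can no longer fail; each simply flips a flag on |ssl|. A minimal client-side sketch of the flow the comments describe (the handshake itself and its error handling are elided):

    #include <openssl/ssl.h>

    static void RequestAndReadOcsp(SSL *ssl) {
      // Before the handshake: ask the server to staple an OCSP response.
      SSL_enable_ocsp_stapling(ssl);  // Now returns void, per the change above.

      // ... run the handshake ...

      // After the handshake: read back whatever the server stapled, if
      // anything. The buffer is owned by |ssl| and must not be freed.
      const uint8_t *resp;
      size_t resp_len;
      SSL_get0_ocsp_response(ssl, &resp, &resp_len);
    }

The SCT variant is symmetric: |SSL_enable_signed_cert_timestamps| before the handshake, |SSL_get0_signed_cert_timestamp_list| after.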
@@ -3043,7 +3042,6 @@ OPENSSL_EXPORT void SSL_CTX_set_dos_protection_cb( #define SSL_ST_OK 0x03 #define SSL_ST_RENEGOTIATE (0x04 | SSL_ST_INIT) #define SSL_ST_TLS13 (0x05 | SSL_ST_INIT) -#define SSL_ST_ERROR (0x06| SSL_ST_INIT) /* SSL_CB_* are possible values for the |type| parameter in the info * callback and the bitmasks that make them up. */ @@ -3086,8 +3084,7 @@ OPENSSL_EXPORT void SSL_CTX_set_dos_protection_cb( * * |SSL_CB_ACCEPT_LOOP| (respectively, |SSL_CB_CONNECT_LOOP|) is signaled when * a server (respectively, client) handshake progresses. The |value| argument - * is always one. For the duration of the callback, |SSL_state| will return the - * previous state. + * is always one. * * |SSL_CB_ACCEPT_EXIT| (respectively, |SSL_CB_CONNECT_EXIT|) is signaled when * a server (respectively, client) handshake completes, fails, or is paused. @@ -3589,7 +3586,10 @@ OPENSSL_EXPORT const char *SSL_alert_desc_string(int value); typedef struct ssl_conf_ctx_st SSL_CONF_CTX; -/* SSL_state returns the current state of the handshake state machine. */ +/* SSL_state returns |SSL_ST_INIT| if a handshake is in progress and |SSL_ST_OK| + * otherwise. + * + * Use |SSL_is_init| instead. */ OPENSSL_EXPORT int SSL_state(const SSL *ssl); #define SSL_get_state(ssl) SSL_state(ssl) @@ -3805,6 +3805,12 @@ struct ssl_session_st { * early data. If zero, 0-RTT is disallowed. */ uint32_t ticket_max_early_data; + /* early_alpn is the ALPN protocol from the initial handshake. This is only + * stored for TLS 1.3 and above in order to enforce ALPN matching for 0-RTT + * resumptions. */ + uint8_t *early_alpn; + size_t early_alpn_len; + /* extended_master_secret is true if the master secret in this session was * generated using EMS and thus isn't vulnerable to the Triple Handshake * attack. */ @@ -3965,8 +3971,6 @@ struct ssl_ctx_st { void *msg_callback_arg; int verify_mode; - uint8_t sid_ctx_length; - uint8_t sid_ctx[SSL_MAX_SID_CTX_LENGTH]; int (*default_verify_callback)( int ok, X509_STORE_CTX *ctx); /* called 'verify_callback' in the SSL */ @@ -4061,12 +4065,6 @@ struct ssl_ctx_st { /* The client's Channel ID private key. */ EVP_PKEY *tlsext_channel_id_private; - /* Signed certificate timestamp list to be sent to the client, if requested */ - CRYPTO_BUFFER *signed_cert_timestamp_list; - - /* OCSP response to be sent to the client, if requested. */ - CRYPTO_BUFFER *ocsp_response; - /* keylog_callback, if not NULL, is the key logging callback. See * |SSL_CTX_set_keylog_callback|. */ void (*keylog_callback)(const SSL *ssl, const char *line); @@ -4107,9 +4105,6 @@ struct ssl_ctx_st { /* short_header_enabled is one if a short record header in TLS 1.3 may * be negotiated and zero otherwise. */ unsigned short_header_enabled:1; - - /* TODO(agl): remove once node.js no longer references this. 
*/ - int freelist_max_len; }; diff --git a/src/include/openssl/ssl3.h b/src/include/openssl/ssl3.h index 6a03d1be..fcaeb2df 100644 --- a/src/include/openssl/ssl3.h +++ b/src/include/openssl/ssl3.h @@ -307,6 +307,7 @@ OPENSSL_COMPILE_ASSERT( #define SSL3_ST_CW_FLUSH (0x100 | SSL_ST_CONNECT) #define SSL3_ST_FALSE_START (0x101 | SSL_ST_CONNECT) #define SSL3_ST_VERIFY_SERVER_CERT (0x102 | SSL_ST_CONNECT) +#define SSL3_ST_FINISH_CLIENT_HANDSHAKE (0x103 | SSL_ST_CONNECT) /* write to server */ #define SSL3_ST_CW_CLNT_HELLO_A (0x110 | SSL_ST_CONNECT) /* read from server */ diff --git a/src/include/openssl/time_support.h b/src/include/openssl/time_support.h deleted file mode 100644 index 274b17d1..00000000 --- a/src/include/openssl/time_support.h +++ /dev/null @@ -1,91 +0,0 @@ -/* Written by Richard Levitte (richard@levitte.org) for the OpenSSL - * project 2001. - * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL - * project 2008. - */ -/* ==================================================================== - * Copyright (c) 2001 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). 
This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#ifndef OPENSSL_HEADER_TIME_SUPPORT_H -#define OPENSSL_HEADER_TIME_SUPPORT_H - -#include <openssl/base.h> - -#include <time.h> - -#if defined(__cplusplus) -extern "C" { -#endif - - -/* Wrapper functions for time functions. */ - - -/* OPENSSL_gmtime wraps |gmtime_r|. See the manual page for that function. */ -struct tm *OPENSSL_gmtime(const time_t *timer, struct tm *result); - -/* OPENSSL_gmtime_adj updates |tm| by adding |offset_day| days and |offset_sec| - * seconds. */ -int OPENSSL_gmtime_adj(struct tm *tm, int offset_day, long offset_sec); - -/* OPENSSL_gmtime_diff calculates the difference between |from| and |to| and - * outputs the difference as a number of days and seconds in |*out_days| and - * |*out_secs|. */ -int OPENSSL_gmtime_diff(int *out_days, int *out_secs, const struct tm *from, - const struct tm *to); - - -#if defined(__cplusplus) -} /* extern C */ -#endif - -#endif /* OPENSSL_HEADER_TIME_SUPPORT_H */ diff --git a/src/ssl/handshake_client.c b/src/ssl/handshake_client.c index 427213c2..c4f5e8e9 100644 --- a/src/ssl/handshake_client.c +++ b/src/ssl/handshake_client.c @@ -190,21 +190,15 @@ static int ssl3_get_new_session_ticket(SSL_HANDSHAKE *hs); int ssl3_connect(SSL_HANDSHAKE *hs) { SSL *const ssl = hs->ssl; int ret = -1; - int state, skip = 0; assert(ssl->handshake_func == ssl3_connect); assert(!ssl->server); for (;;) { - state = hs->state; + int state = hs->state; switch (hs->state) { case SSL_ST_INIT: - hs->state = SSL_ST_CONNECT; - skip = 1; - break; - - case SSL_ST_CONNECT: ssl_do_info_callback(ssl, SSL_CB_HANDSHAKE_START, 1); hs->state = SSL3_ST_CW_CLNT_HELLO_A; break; @@ -254,13 +248,11 @@ int ssl3_connect(SSL_HANDSHAKE *hs) { break; case SSL3_ST_CR_CERT_A: - if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) { + if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) { ret = ssl3_get_server_certificate(hs); if (ret <= 0) { goto end; } - } else { - skip = 1; } hs->state = SSL3_ST_CR_CERT_STATUS_A; break; @@ -271,20 +263,16 @@ int ssl3_connect(SSL_HANDSHAKE *hs) { if (ret <= 0) { goto end; } - } else { - skip = 1; } hs->state = SSL3_ST_VERIFY_SERVER_CERT; break; case SSL3_ST_VERIFY_SERVER_CERT: - if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) { + if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) { ret = ssl3_verify_server_cert(hs); if (ret <= 0) { goto end; } - } else { - skip = 1; } hs->state = SSL3_ST_CR_KEY_EXCH_A; break; @@ -298,13 +286,11 @@ int ssl3_connect(SSL_HANDSHAKE *hs) { break; case SSL3_ST_CR_CERT_REQ_A: - if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) { + if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) { ret = ssl3_get_certificate_request(hs); if (ret <= 0) { goto end; } - } else { - skip = 1; } hs->state = SSL3_ST_CR_SRVR_DONE_A; break; @@ -324,8 +310,6 @@ int ssl3_connect(SSL_HANDSHAKE *hs) { if (ret <= 0) { goto end; } - } else { - skip = 1; } hs->state = SSL3_ST_CW_KEY_EXCH_A; break; @@ -345,8 +329,6 @@ int ssl3_connect(SSL_HANDSHAKE *hs) { if (ret <= 0) { goto end; } - } else { - skip = 1; } hs->state = SSL3_ST_CW_CHANGE; break; @@ -367,8 +349,6 @@ int ssl3_connect(SSL_HANDSHAKE *hs) { if (ret <= 0) { goto end; } - } else { - skip = 1; } hs->state = SSL3_ST_CW_CHANNEL_ID_A; break; @@ -379,8 +359,6 @@ int ssl3_connect(SSL_HANDSHAKE *hs) { if (ret <= 0) { goto end; } - } else { - skip = 1; } hs->state = SSL3_ST_CW_FINISHED_A; break; @@ -393,7 +371,7 @@ int ssl3_connect(SSL_HANDSHAKE *hs) { hs->state = 
SSL3_ST_CW_FLUSH; if (ssl->session != NULL) { - hs->next_state = SSL_ST_OK; + hs->next_state = SSL3_ST_FINISH_CLIENT_HANDSHAKE; } else { /* This is a non-resumption handshake. If it involves ChannelID, then * record the handshake hashes at this point in the session so that @@ -427,8 +405,6 @@ int ssl3_connect(SSL_HANDSHAKE *hs) { if (ret <= 0) { goto end; } - } else { - skip = 1; } hs->state = SSL3_ST_CR_CHANGE; break; @@ -456,7 +432,7 @@ int ssl3_connect(SSL_HANDSHAKE *hs) { if (ssl->session != NULL) { hs->state = SSL3_ST_CW_CHANGE; } else { - hs->state = SSL_ST_OK; + hs->state = SSL3_ST_FINISH_CLIENT_HANDSHAKE; } break; @@ -466,7 +442,7 @@ int ssl3_connect(SSL_HANDSHAKE *hs) { goto end; } hs->state = hs->next_state; - if (hs->state != SSL_ST_OK) { + if (hs->state != SSL3_ST_FINISH_CLIENT_HANDSHAKE) { ssl->method->expect_flight(ssl); } break; @@ -476,10 +452,10 @@ int ssl3_connect(SSL_HANDSHAKE *hs) { if (ret <= 0) { goto end; } - hs->state = SSL_ST_OK; + hs->state = SSL3_ST_FINISH_CLIENT_HANDSHAKE; break; - case SSL_ST_OK: + case SSL3_ST_FINISH_CLIENT_HANDSHAKE: ssl->method->release_current_message(ssl, 1 /* free_buffer */); SSL_SESSION_free(ssl->s3->established_session); @@ -491,21 +467,21 @@ int ssl3_connect(SSL_HANDSHAKE *hs) { * of the new established_session due to False Start. The caller may * have taken a reference to the temporary session. */ ssl->s3->established_session = - SSL_SESSION_dup(ssl->s3->new_session, SSL_SESSION_DUP_ALL); + SSL_SESSION_dup(hs->new_session, SSL_SESSION_DUP_ALL); if (ssl->s3->established_session == NULL) { - /* Do not stay in SSL_ST_OK, to avoid confusing |SSL_in_init| - * callers. */ - hs->state = SSL_ST_ERROR; - skip = 1; ret = -1; goto end; } ssl->s3->established_session->not_resumable = 0; - SSL_SESSION_free(ssl->s3->new_session); - ssl->s3->new_session = NULL; + SSL_SESSION_free(hs->new_session); + hs->new_session = NULL; } + hs->state = SSL_ST_OK; + break; + + case SSL_ST_OK: { const int is_initial_handshake = !ssl->s3->initial_handshake_complete; ssl->s3->initial_handshake_complete = 1; if (is_initial_handshake) { @@ -516,11 +492,7 @@ int ssl3_connect(SSL_HANDSHAKE *hs) { ret = 1; ssl_do_info_callback(ssl, SSL_CB_HANDSHAKE_DONE, 1); goto end; - - case SSL_ST_ERROR: - OPENSSL_PUT_ERROR(SSL, SSL_R_SSL_HANDSHAKE_FAILURE); - ret = -1; - goto end; + } default: OPENSSL_PUT_ERROR(SSL, SSL_R_UNKNOWN_STATE); @@ -528,13 +500,9 @@ int ssl3_connect(SSL_HANDSHAKE *hs) { goto end; } - if (!ssl->s3->tmp.reuse_message && !skip && hs->state != state) { - int new_state = hs->state; - hs->state = state; + if (hs->state != state) { ssl_do_info_callback(ssl, SSL_CB_CONNECT_LOOP, 1); - hs->state = new_state; } - skip = 0; } end: @@ -944,9 +912,9 @@ static int ssl3_get_server_hello(SSL_HANDSHAKE *hs) { goto f_err; } /* Note: session_id could be empty. */ - ssl->s3->new_session->session_id_length = CBS_len(&session_id); - OPENSSL_memcpy(ssl->s3->new_session->session_id, CBS_data(&session_id), - CBS_len(&session_id)); + hs->new_session->session_id_length = CBS_len(&session_id); + OPENSSL_memcpy(hs->new_session->session_id, CBS_data(&session_id), + CBS_len(&session_id)); } const SSL_CIPHER *c = SSL_get_cipher_by_value(cipher_suite); @@ -988,9 +956,9 @@ static int ssl3_get_server_hello(SSL_HANDSHAKE *hs) { goto f_err; } } else { - ssl->s3->new_session->cipher = c; + hs->new_session->cipher = c; } - ssl->s3->tmp.new_cipher = c; + hs->new_cipher = c; /* Now that the cipher is known, initialize the handshake hash and hash the * ServerHello. 
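Stepping back, the rework of |ssl3_connect| above replaces the old |skip| bookkeeping with a simpler invariant: record the state at the top of each loop iteration and fire the progress callback only when the iteration actually advanced. A distilled, self-contained sketch of that pattern (the states and types are stand-ins, not the real ones):

    // Stand-ins for hs->state and the per-state handlers in ssl3_connect.
    enum State { kInit, kWriteHello, kDone };

    struct Handshake {
      State state = kInit;
    };

    static int Connect(Handshake *hs, void (*on_progress)(void)) {
      for (;;) {
        State state = hs->state;  // Remember where this iteration started.
        switch (hs->state) {
          case kInit:
            hs->state = kWriteHello;
            break;
          case kWriteHello:
            hs->state = kDone;
            break;
          case kDone:
            return 1;
        }
        if (hs->state != state) {
          on_progress();  // Previously gated on a separate |skip| flag.
        }
      }
    }

Because states that have nothing to do now fall straight through to the next state without setting |skip|, the "did we advance" comparison alone decides whether |SSL_CB_CONNECT_LOOP| fires.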
*/ @@ -1004,7 +972,7 @@ static int ssl3_get_server_hello(SSL_HANDSHAKE *hs) { * which requires hashing the handshake transcript. Otherwise, the handshake * buffer may be released. */ if (ssl->session != NULL || - !ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) { + !ssl_cipher_uses_certificate_auth(hs->new_cipher)) { SSL_TRANSCRIPT_free_buffer(&hs->transcript); } @@ -1030,8 +998,7 @@ static int ssl3_get_server_hello(SSL_HANDSHAKE *hs) { } if (ssl->session != NULL && - ssl->s3->tmp.extended_master_secret != - ssl->session->extended_master_secret) { + hs->extended_master_secret != ssl->session->extended_master_secret) { al = SSL_AD_HANDSHAKE_FAILURE; if (ssl->session->extended_master_secret) { OPENSSL_PUT_ERROR(SSL, SSL_R_RESUMED_EMS_SESSION_WITHOUT_EMS_EXTENSION); @@ -1065,27 +1032,27 @@ static int ssl3_get_server_certificate(SSL_HANDSHAKE *hs) { CBS_init(&cbs, ssl->init_msg, ssl->init_num); uint8_t alert = SSL_AD_DECODE_ERROR; - sk_CRYPTO_BUFFER_pop_free(ssl->s3->new_session->certs, CRYPTO_BUFFER_free); + sk_CRYPTO_BUFFER_pop_free(hs->new_session->certs, CRYPTO_BUFFER_free); EVP_PKEY_free(hs->peer_pubkey); hs->peer_pubkey = NULL; - ssl->s3->new_session->certs = ssl_parse_cert_chain( - &alert, &hs->peer_pubkey, NULL, &cbs, ssl->ctx->pool); - if (ssl->s3->new_session->certs == NULL) { + hs->new_session->certs = ssl_parse_cert_chain(&alert, &hs->peer_pubkey, NULL, + &cbs, ssl->ctx->pool); + if (hs->new_session->certs == NULL) { ssl3_send_alert(ssl, SSL3_AL_FATAL, alert); return -1; } - if (sk_CRYPTO_BUFFER_num(ssl->s3->new_session->certs) == 0 || + if (sk_CRYPTO_BUFFER_num(hs->new_session->certs) == 0 || CBS_len(&cbs) != 0 || - !ssl->ctx->x509_method->session_cache_objects(ssl->s3->new_session)) { + !ssl->ctx->x509_method->session_cache_objects(hs->new_session)) { OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR); ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_DECODE_ERROR); return -1; } if (!ssl_check_leaf_certificate( - ssl, hs->peer_pubkey, - sk_CRYPTO_BUFFER_value(ssl->s3->new_session->certs, 0))) { + hs, hs->peer_pubkey, + sk_CRYPTO_BUFFER_value(hs->new_session->certs, 0))) { ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_ILLEGAL_PARAMETER); return -1; } @@ -1126,8 +1093,8 @@ static int ssl3_get_cert_status(SSL_HANDSHAKE *hs) { goto f_err; } - if (!CBS_stow(&ocsp_response, &ssl->s3->new_session->ocsp_response, - &ssl->s3->new_session->ocsp_response_length)) { + if (!CBS_stow(&ocsp_response, &hs->new_session->ocsp_response, + &hs->new_session->ocsp_response_length)) { al = SSL_AD_INTERNAL_ERROR; OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE); goto f_err; @@ -1141,8 +1108,8 @@ f_err: static int ssl3_verify_server_cert(SSL_HANDSHAKE *hs) { SSL *const ssl = hs->ssl; - if (!ssl_verify_cert_chain(ssl, &ssl->s3->new_session->verify_result, - ssl->s3->new_session->x509_chain)) { + if (!ssl_verify_cert_chain(ssl, &hs->new_session->verify_result, + hs->new_session->x509_chain)) { return -1; } @@ -1163,7 +1130,7 @@ static int ssl3_get_server_key_exchange(SSL_HANDSHAKE *hs) { if (ssl->s3->tmp.message_type != SSL3_MT_SERVER_KEY_EXCHANGE) { /* Some ciphers (pure PSK) have an optional ServerKeyExchange message. 
*/ - if (ssl_cipher_requires_server_key_exchange(ssl->s3->tmp.new_cipher)) { + if (ssl_cipher_requires_server_key_exchange(hs->new_cipher)) { OPENSSL_PUT_ERROR(SSL, SSL_R_UNEXPECTED_MESSAGE); ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_UNEXPECTED_MESSAGE); return -1; @@ -1182,8 +1149,8 @@ static int ssl3_get_server_key_exchange(SSL_HANDSHAKE *hs) { CBS_init(&server_key_exchange, ssl->init_msg, ssl->init_num); CBS server_key_exchange_orig = server_key_exchange; - uint32_t alg_k = ssl->s3->tmp.new_cipher->algorithm_mkey; - uint32_t alg_a = ssl->s3->tmp.new_cipher->algorithm_auth; + uint32_t alg_k = hs->new_cipher->algorithm_mkey; + uint32_t alg_a = hs->new_cipher->algorithm_auth; if (alg_a & SSL_aPSK) { CBS psk_identity_hint; @@ -1279,7 +1246,7 @@ static int ssl3_get_server_key_exchange(SSL_HANDSHAKE *hs) { OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR); goto f_err; } - ssl->s3->new_session->group_id = group_id; + hs->new_session->group_id = group_id; /* Ensure the group is consistent with preferences. */ if (!tls1_check_group_id(ssl, group_id)) { @@ -1307,7 +1274,7 @@ static int ssl3_get_server_key_exchange(SSL_HANDSHAKE *hs) { CBS_len(&server_key_exchange_orig) - CBS_len(&server_key_exchange)); /* ServerKeyExchange should be signed by the server's public key. */ - if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) { + if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) { uint16_t signature_algorithm = 0; if (ssl3_protocol_version(ssl) >= TLS1_2_VERSION) { if (!CBS_get_u16(&server_key_exchange, &signature_algorithm)) { @@ -1318,7 +1285,7 @@ static int ssl3_get_server_key_exchange(SSL_HANDSHAKE *hs) { if (!tls12_check_peer_sigalg(ssl, &al, signature_algorithm)) { goto f_err; } - ssl->s3->new_session->peer_signature_algorithm = signature_algorithm; + hs->new_session->peer_signature_algorithm = signature_algorithm; } else if (hs->peer_pubkey->type == EVP_PKEY_RSA) { signature_algorithm = SSL_SIGN_RSA_PKCS1_MD5_SHA1; } else if (hs->peer_pubkey->type == EVP_PKEY_EC) { @@ -1527,8 +1494,8 @@ static int ssl3_send_client_key_exchange(SSL_HANDSHAKE *hs) { goto err; } - uint32_t alg_k = ssl->s3->tmp.new_cipher->algorithm_mkey; - uint32_t alg_a = ssl->s3->tmp.new_cipher->algorithm_auth; + uint32_t alg_k = hs->new_cipher->algorithm_mkey; + uint32_t alg_a = hs->new_cipher->algorithm_auth; /* If using a PSK key exchange, prepare the pre-shared key. 
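On the client, the pre-shared key prepared here comes from the application via |SSL_CTX_set_psk_client_callback|. A minimal sketch with a fixed, illustrative identity and key; a real deployment would derive both from the connection:

    #include <string.h>
    #include <openssl/ssl.h>

    static unsigned PskClientCallback(SSL *ssl, const char *hint,
                                      char *identity, unsigned max_identity_len,
                                      uint8_t *psk, unsigned max_psk_len) {
      static const char kIdentity[] = "client1";   // Illustrative.
      static const uint8_t kKey[] = {1, 2, 3, 4};  // Illustrative.
      if (max_identity_len < sizeof(kIdentity) || max_psk_len < sizeof(kKey)) {
        return 0;  // Fail the handshake.
      }
      memcpy(identity, kIdentity, sizeof(kIdentity));  // Keeps the NUL.
      memcpy(psk, kKey, sizeof(kKey));
      return sizeof(kKey);  // Number of PSK bytes written.
    }

    // Registration: SSL_CTX_set_psk_client_callback(ctx, PskClientCallback);

The identity string returned by the callback is what ends up stowed in |psk_identity| in the code below.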
*/ unsigned psk_len = 0; @@ -1551,9 +1518,9 @@ static int ssl3_send_client_key_exchange(SSL_HANDSHAKE *hs) { } assert(psk_len <= PSK_MAX_PSK_LEN); - OPENSSL_free(ssl->s3->new_session->psk_identity); - ssl->s3->new_session->psk_identity = BUF_strdup(identity); - if (ssl->s3->new_session->psk_identity == NULL) { + OPENSSL_free(hs->new_session->psk_identity); + hs->new_session->psk_identity = BUF_strdup(identity); + if (hs->new_session->psk_identity == NULL) { OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE); goto err; } @@ -1676,13 +1643,12 @@ static int ssl3_send_client_key_exchange(SSL_HANDSHAKE *hs) { goto err; } - ssl->s3->new_session->master_key_length = tls1_generate_master_secret( - hs, ssl->s3->new_session->master_key, pms, pms_len); - if (ssl->s3->new_session->master_key_length == 0) { + hs->new_session->master_key_length = tls1_generate_master_secret( + hs, hs->new_session->master_key, pms, pms_len); + if (hs->new_session->master_key_length == 0) { goto err; } - ssl->s3->new_session->extended_master_secret = - ssl->s3->tmp.extended_master_secret; + hs->new_session->extended_master_secret = hs->extended_master_secret; OPENSSL_cleanse(pms, pms_len); OPENSSL_free(pms); @@ -1740,9 +1706,9 @@ static int ssl3_send_cert_verify(SSL_HANDSHAKE *hs) { uint8_t digest[EVP_MAX_MD_SIZE]; size_t digest_len; - if (!SSL_TRANSCRIPT_ssl3_cert_verify_hash( - &hs->transcript, digest, &digest_len, ssl->s3->new_session, - signature_algorithm)) { + if (!SSL_TRANSCRIPT_ssl3_cert_verify_hash(&hs->transcript, digest, + &digest_len, hs->new_session, + signature_algorithm)) { goto err; } @@ -1870,7 +1836,7 @@ static int ssl3_get_new_session_ticket(SSL_HANDSHAKE *hs) { } int session_renewed = ssl->session != NULL; - SSL_SESSION *session = ssl->s3->new_session; + SSL_SESSION *session = hs->new_session; if (session_renewed) { /* The server is sending a new ticket for an existing session. Sessions are * immutable once established, so duplicate all but the ticket of the diff --git a/src/ssl/handshake_server.c b/src/ssl/handshake_server.c index c352dd95..51338e22 100644 --- a/src/ssl/handshake_server.c +++ b/src/ssl/handshake_server.c @@ -202,21 +202,15 @@ int ssl3_accept(SSL_HANDSHAKE *hs) { SSL *const ssl = hs->ssl; uint32_t alg_a; int ret = -1; - int state, skip = 0; assert(ssl->handshake_func == ssl3_accept); assert(ssl->server); for (;;) { - state = hs->state; + int state = hs->state; switch (hs->state) { case SSL_ST_INIT: - hs->state = SSL_ST_ACCEPT; - skip = 1; - break; - - case SSL_ST_ACCEPT: ssl_do_info_callback(ssl, SSL_CB_HANDSHAKE_START, 1); hs->state = SSL3_ST_SR_CLNT_HELLO_A; break; @@ -269,13 +263,11 @@ int ssl3_accept(SSL_HANDSHAKE *hs) { break; case SSL3_ST_SW_CERT_A: - if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) { + if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) { ret = ssl3_send_server_certificate(hs); if (ret <= 0) { goto end; } - } else { - skip = 1; } hs->state = SSL3_ST_SW_CERT_STATUS_A; break; @@ -286,25 +278,21 @@ int ssl3_accept(SSL_HANDSHAKE *hs) { if (ret <= 0) { goto end; } - } else { - skip = 1; } hs->state = SSL3_ST_SW_KEY_EXCH_A; break; case SSL3_ST_SW_KEY_EXCH_A: case SSL3_ST_SW_KEY_EXCH_B: - alg_a = ssl->s3->tmp.new_cipher->algorithm_auth; + alg_a = hs->new_cipher->algorithm_auth; /* PSK ciphers send ServerKeyExchange if there is an identity hint. 
*/ - if (ssl_cipher_requires_server_key_exchange(ssl->s3->tmp.new_cipher) || + if (ssl_cipher_requires_server_key_exchange(hs->new_cipher) || ((alg_a & SSL_aPSK) && ssl->psk_identity_hint)) { ret = ssl3_send_server_key_exchange(hs); if (ret <= 0) { goto end; } - } else { - skip = 1; } hs->state = SSL3_ST_SW_CERT_REQ_A; @@ -316,8 +304,6 @@ int ssl3_accept(SSL_HANDSHAKE *hs) { if (ret <= 0) { goto end; } - } else { - skip = 1; } hs->state = SSL3_ST_SW_SRVR_DONE_A; break; @@ -379,8 +365,6 @@ int ssl3_accept(SSL_HANDSHAKE *hs) { if (ret <= 0) { goto end; } - } else { - skip = 1; } hs->state = SSL3_ST_SR_CHANNEL_ID_A; break; @@ -391,8 +375,6 @@ int ssl3_accept(SSL_HANDSHAKE *hs) { if (ret <= 0) { goto end; } - } else { - skip = 1; } hs->state = SSL3_ST_SR_FINISHED_A; break; @@ -411,7 +393,7 @@ int ssl3_accept(SSL_HANDSHAKE *hs) { } /* If this is a full handshake with ChannelID then record the handshake - * hashes in |ssl->s3->new_session| in case we need them to verify a + * hashes in |hs->new_session| in case we need them to verify a * ChannelID signature on a resumption of this session in the future. */ if (ssl->session == NULL && ssl->s3->tlsext_channel_id_valid) { ret = tls1_record_handshake_hashes_for_channel_id(hs); @@ -427,8 +409,6 @@ int ssl3_accept(SSL_HANDSHAKE *hs) { if (ret <= 0) { goto end; } - } else { - skip = 1; } hs->state = SSL3_ST_SW_CHANGE; break; @@ -481,12 +461,11 @@ int ssl3_accept(SSL_HANDSHAKE *hs) { /* If we aren't retaining peer certificates then we can discard it * now. */ - if (ssl->s3->new_session != NULL && + if (hs->new_session != NULL && ssl->retain_only_sha256_of_client_certs) { - sk_CRYPTO_BUFFER_pop_free(ssl->s3->new_session->certs, - CRYPTO_BUFFER_free); - ssl->s3->new_session->certs = NULL; - ssl->ctx->x509_method->session_clear(ssl->s3->new_session); + sk_CRYPTO_BUFFER_pop_free(hs->new_session->certs, CRYPTO_BUFFER_free); + hs->new_session->certs = NULL; + ssl->ctx->x509_method->session_clear(hs->new_session); } SSL_SESSION_free(ssl->s3->established_session); @@ -494,9 +473,9 @@ int ssl3_accept(SSL_HANDSHAKE *hs) { SSL_SESSION_up_ref(ssl->session); ssl->s3->established_session = ssl->session; } else { - ssl->s3->established_session = ssl->s3->new_session; + ssl->s3->established_session = hs->new_session; ssl->s3->established_session->not_resumable = 0; - ssl->s3->new_session = NULL; + hs->new_session = NULL; } if (hs->v2_clienthello) { @@ -518,13 +497,9 @@ int ssl3_accept(SSL_HANDSHAKE *hs) { goto end; } - if (!ssl->s3->tmp.reuse_message && !skip && hs->state != state) { - int new_state = hs->state; - hs->state = state; + if (hs->state != state) { ssl_do_info_callback(ssl, SSL_CB_ACCEPT_LOOP, 1); - hs->state = new_state; } - skip = 0; } end: @@ -921,9 +896,9 @@ static int ssl3_select_certificate(SSL_HANDSHAKE *hs) { /* Negotiate the cipher suite. This must be done after |cert_cb| so the * certificate is finalized. 
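The ordering constraint in this comment is why |cert_cb| exists as a distinct hook: it lets a server pick or load a certificate per connection before the cipher is chosen, so certificate-dependent cipher suites are considered against the final certificate. A minimal sketch; |LoadCert| and |LoadKey| are hypothetical helpers:

    #include <openssl/ssl.h>

    // Hypothetical loaders; a real server would select based on, e.g., SNI.
    X509 *LoadCert(void);
    EVP_PKEY *LoadKey(void);

    static int CertCallback(SSL *ssl, void *arg) {
      if (!SSL_use_certificate(ssl, LoadCert()) ||
          !SSL_use_PrivateKey(ssl, LoadKey())) {
        return 0;  // Fatal: abort the handshake.
      }
      return 1;    // Done; -1 would pause the handshake for an async lookup.
    }

    // Registration: SSL_CTX_set_cert_cb(ctx, CertCallback, nullptr);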
*/ - ssl->s3->tmp.new_cipher = + hs->new_cipher = ssl3_choose_cipher(hs, &client_hello, ssl_get_cipher_preferences(ssl)); - if (ssl->s3->tmp.new_cipher == NULL) { + if (hs->new_cipher == NULL) { OPENSSL_PUT_ERROR(SSL, SSL_R_NO_SHARED_CIPHER); ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE); return -1; @@ -958,8 +933,7 @@ static int ssl3_select_parameters(SSL_HANDSHAKE *hs) { } if (session != NULL) { - if (session->extended_master_secret && - !ssl->s3->tmp.extended_master_secret) { + if (session->extended_master_secret && !hs->extended_master_secret) { /* A ClientHello without EMS that attempts to resume a session with EMS * is fatal to the connection. */ al = SSL_AD_HANDSHAKE_FAILURE; @@ -967,11 +941,10 @@ static int ssl3_select_parameters(SSL_HANDSHAKE *hs) { goto f_err; } - if (!ssl_session_is_resumable(ssl, session) || + if (!ssl_session_is_resumable(hs, session) || /* If the client offers the EMS extension, but the previous session * didn't use it, then negotiate a new session. */ - ssl->s3->tmp.extended_master_secret != - session->extended_master_secret) { + hs->extended_master_secret != session->extended_master_secret) { SSL_SESSION_free(session); session = NULL; } @@ -992,7 +965,7 @@ static int ssl3_select_parameters(SSL_HANDSHAKE *hs) { /* Clear the session ID if we want the session to be single-use. */ if (!(ssl->ctx->session_cache_mode & SSL_SESS_CACHE_SERVER)) { - ssl->s3->new_session->session_id_length = 0; + hs->new_session->session_id_length = 0; } } @@ -1005,13 +978,13 @@ static int ssl3_select_parameters(SSL_HANDSHAKE *hs) { } if (ssl->session == NULL) { - ssl->s3->new_session->cipher = ssl->s3->tmp.new_cipher; + hs->new_session->cipher = hs->new_cipher; /* On new sessions, stash the SNI value in the session. */ if (hs->hostname != NULL) { - OPENSSL_free(ssl->s3->new_session->tlsext_hostname); - ssl->s3->new_session->tlsext_hostname = BUF_strdup(hs->hostname); - if (ssl->s3->new_session->tlsext_hostname == NULL) { + OPENSSL_free(hs->new_session->tlsext_hostname); + hs->new_session->tlsext_hostname = BUF_strdup(hs->hostname); + if (hs->new_session->tlsext_hostname == NULL) { al = SSL_AD_INTERNAL_ERROR; goto f_err; } @@ -1025,14 +998,14 @@ static int ssl3_select_parameters(SSL_HANDSHAKE *hs) { hs->cert_request = 0; } /* CertificateRequest may only be sent in certificate-based ciphers. */ - if (!ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) { + if (!ssl_cipher_uses_certificate_auth(hs->new_cipher)) { hs->cert_request = 0; } if (!hs->cert_request) { /* OpenSSL returns X509_V_OK when no certificates are requested. This is * classed by them as a bug, but it's assumed by at least NGINX. */ - ssl->s3->new_session->verify_result = X509_V_OK; + hs->new_session->verify_result = X509_V_OK; } } @@ -1045,7 +1018,7 @@ static int ssl3_select_parameters(SSL_HANDSHAKE *hs) { /* Now that all parameters are known, initialize the handshake hash and hash * the ClientHello. */ if (!SSL_TRANSCRIPT_init_hash(&hs->transcript, ssl3_protocol_version(ssl), - ssl->s3->tmp.new_cipher->algorithm_prf) || + hs->new_cipher->algorithm_prf) || !ssl_hash_current_message(hs)) { goto f_err; } @@ -1073,7 +1046,7 @@ static int ssl3_send_server_hello(SSL_HANDSHAKE *hs) { /* We only accept ChannelIDs on connections with ECDHE in order to avoid a * known attack while we fix ChannelID itself. 
*/ if (ssl->s3->tlsext_channel_id_valid && - (ssl->s3->tmp.new_cipher->algorithm_mkey & SSL_kECDHE) == 0) { + (hs->new_cipher->algorithm_mkey & SSL_kECDHE) == 0) { ssl->s3->tlsext_channel_id_valid = 0; } @@ -1098,7 +1071,7 @@ static int ssl3_send_server_hello(SSL_HANDSHAKE *hs) { /* TODO(davidben): Implement the TLS 1.1 and 1.2 downgrade sentinels once TLS * 1.3 is finalized and we are not implementing a draft version. */ - const SSL_SESSION *session = ssl->s3->new_session; + const SSL_SESSION *session = hs->new_session; if (ssl->session != NULL) { session = ssl->session; } @@ -1110,7 +1083,7 @@ static int ssl3_send_server_hello(SSL_HANDSHAKE *hs) { !CBB_add_u8_length_prefixed(&body, &session_id) || !CBB_add_bytes(&session_id, session->session_id, session->session_id_length) || - !CBB_add_u16(&body, ssl_cipher_get_value(ssl->s3->tmp.new_cipher)) || + !CBB_add_u16(&body, ssl_cipher_get_value(hs->new_cipher)) || !CBB_add_u8(&body, 0 /* no compression */) || !ssl_add_serverhello_tlsext(hs, &body) || !ssl_add_message_cbb(ssl, &cbb)) { @@ -1142,8 +1115,9 @@ static int ssl3_send_certificate_status(SSL_HANDSHAKE *hs) { SSL3_MT_CERTIFICATE_STATUS) || !CBB_add_u8(&body, TLSEXT_STATUSTYPE_ocsp) || !CBB_add_u24_length_prefixed(&body, &ocsp_response) || - !CBB_add_bytes(&ocsp_response, CRYPTO_BUFFER_data(ssl->ocsp_response), - CRYPTO_BUFFER_len(ssl->ocsp_response)) || + !CBB_add_bytes(&ocsp_response, + CRYPTO_BUFFER_data(ssl->cert->ocsp_response), + CRYPTO_BUFFER_len(ssl->cert->ocsp_response)) || !ssl_add_message_cbb(ssl, &cbb)) { OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR); CBB_cleanup(&cbb); @@ -1160,8 +1134,8 @@ static int ssl3_send_server_key_exchange(SSL_HANDSHAKE *hs) { /* Put together the parameters. */ if (hs->state == SSL3_ST_SW_KEY_EXCH_A) { - uint32_t alg_k = ssl->s3->tmp.new_cipher->algorithm_mkey; - uint32_t alg_a = ssl->s3->tmp.new_cipher->algorithm_auth; + uint32_t alg_k = hs->new_cipher->algorithm_mkey; + uint32_t alg_a = hs->new_cipher->algorithm_auth; /* Pre-allocate enough room to comfortably fit an ECDHE public key. */ if (!CBB_init(&cbb, 128)) { @@ -1214,7 +1188,7 @@ static int ssl3_send_server_key_exchange(SSL_HANDSHAKE *hs) { ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE); goto err; } - ssl->s3->new_session->group_id = group_id; + hs->new_session->group_id = group_id; /* Set up ECDH, generate a key, and emit the public half. */ if (!SSL_ECDH_CTX_init(&hs->ecdh_ctx, group_id) || @@ -1242,7 +1216,7 @@ static int ssl3_send_server_key_exchange(SSL_HANDSHAKE *hs) { } /* Add a signature. */ - if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) { + if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) { if (!ssl_has_private_key(ssl)) { ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR); goto err; @@ -1439,7 +1413,7 @@ static int ssl3_get_client_certificate(SSL_HANDSHAKE *hs) { /* OpenSSL returns X509_V_OK when no certificates are received. This is * classed by them as a bug, but it's assumed by at least NGINX. 
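The X509_V_OK convention noted in this function is visible to applications through |SSL_get_verify_result|: a server that never requested, or never received, a client certificate still reports success. A short sketch of the post-handshake check, assuming certificate verification was configured on the connection:

    #include <openssl/ssl.h>
    #include <openssl/x509.h>

    static bool PeerVerified(const SSL *ssl) {
      // X509_V_OK is also returned when no certificate was requested or
      // received, matching the comment above, so check for a peer
      // certificate as well.
      bssl::UniquePtr<X509> peer(SSL_get_peer_certificate(ssl));
      return peer != nullptr && SSL_get_verify_result(ssl) == X509_V_OK;
    }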
*/ - ssl->s3->new_session->verify_result = X509_V_OK; + hs->new_session->verify_result = X509_V_OK; ssl->s3->tmp.reuse_message = 1; return 1; } @@ -1456,29 +1430,28 @@ static int ssl3_get_client_certificate(SSL_HANDSHAKE *hs) { CBS certificate_msg; CBS_init(&certificate_msg, ssl->init_msg, ssl->init_num); - sk_CRYPTO_BUFFER_pop_free(ssl->s3->new_session->certs, CRYPTO_BUFFER_free); + sk_CRYPTO_BUFFER_pop_free(hs->new_session->certs, CRYPTO_BUFFER_free); EVP_PKEY_free(hs->peer_pubkey); hs->peer_pubkey = NULL; uint8_t alert = SSL_AD_DECODE_ERROR; - ssl->s3->new_session->certs = - ssl_parse_cert_chain(&alert, &hs->peer_pubkey, - ssl->retain_only_sha256_of_client_certs - ? ssl->s3->new_session->peer_sha256 - : NULL, - &certificate_msg, ssl->ctx->pool); - if (ssl->s3->new_session->certs == NULL) { + hs->new_session->certs = ssl_parse_cert_chain( + &alert, &hs->peer_pubkey, + ssl->retain_only_sha256_of_client_certs ? hs->new_session->peer_sha256 + : NULL, + &certificate_msg, ssl->ctx->pool); + if (hs->new_session->certs == NULL) { ssl3_send_alert(ssl, SSL3_AL_FATAL, alert); return -1; } if (CBS_len(&certificate_msg) != 0 || - !ssl->ctx->x509_method->session_cache_objects(ssl->s3->new_session)) { + !ssl->ctx->x509_method->session_cache_objects(hs->new_session)) { OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR); ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_DECODE_ERROR); return -1; } - if (sk_CRYPTO_BUFFER_num(ssl->s3->new_session->certs) == 0) { + if (sk_CRYPTO_BUFFER_num(hs->new_session->certs) == 0) { /* No client certificate so the handshake buffer may be discarded. */ SSL_TRANSCRIPT_free_buffer(&hs->transcript); @@ -1499,17 +1472,17 @@ static int ssl3_get_client_certificate(SSL_HANDSHAKE *hs) { /* OpenSSL returns X509_V_OK when no certificates are received. This is * classed by them as a bug, but it's assumed by at least NGINX. */ - ssl->s3->new_session->verify_result = X509_V_OK; + hs->new_session->verify_result = X509_V_OK; return 1; } /* The hash will have been filled in. */ if (ssl->retain_only_sha256_of_client_certs) { - ssl->s3->new_session->peer_sha256_valid = 1; + hs->new_session->peer_sha256_valid = 1; } - if (!ssl_verify_cert_chain(ssl, &ssl->s3->new_session->verify_result, - ssl->s3->new_session->x509_chain)) { + if (!ssl_verify_cert_chain(ssl, &hs->new_session->verify_result, + hs->new_session->x509_chain)) { return -1; } return 1; @@ -1541,8 +1514,8 @@ static int ssl3_get_client_key_exchange(SSL_HANDSHAKE *hs) { } CBS_init(&client_key_exchange, ssl->init_msg, ssl->init_num); - alg_k = ssl->s3->tmp.new_cipher->algorithm_mkey; - alg_a = ssl->s3->tmp.new_cipher->algorithm_auth; + alg_k = hs->new_cipher->algorithm_mkey; + alg_a = hs->new_cipher->algorithm_auth; /* If using a PSK key exchange, prepare the pre-shared key. */ if (alg_a & SSL_aPSK) { @@ -1570,15 +1543,15 @@ static int ssl3_get_client_key_exchange(SSL_HANDSHAKE *hs) { goto f_err; } - if (!CBS_strdup(&psk_identity, &ssl->s3->new_session->psk_identity)) { + if (!CBS_strdup(&psk_identity, &hs->new_session->psk_identity)) { al = SSL_AD_INTERNAL_ERROR; OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE); goto f_err; } /* Look up the key for the identity. 
*/ - psk_len = ssl->psk_server_callback(ssl, ssl->s3->new_session->psk_identity, - psk, sizeof(psk)); + psk_len = ssl->psk_server_callback(ssl, hs->new_session->psk_identity, psk, + sizeof(psk)); if (psk_len > PSK_MAX_PSK_LEN) { OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR); al = SSL_AD_INTERNAL_ERROR; @@ -1763,14 +1736,12 @@ static int ssl3_get_client_key_exchange(SSL_HANDSHAKE *hs) { } /* Compute the master secret */ - ssl->s3->new_session->master_key_length = - tls1_generate_master_secret(hs, ssl->s3->new_session->master_key, - premaster_secret, premaster_secret_len); - if (ssl->s3->new_session->master_key_length == 0) { + hs->new_session->master_key_length = tls1_generate_master_secret( + hs, hs->new_session->master_key, premaster_secret, premaster_secret_len); + if (hs->new_session->master_key_length == 0) { goto err; } - ssl->s3->new_session->extended_master_secret = - ssl->s3->tmp.extended_master_secret; + hs->new_session->extended_master_secret = hs->extended_master_secret; OPENSSL_cleanse(premaster_secret, premaster_secret_len); OPENSSL_free(premaster_secret); @@ -1823,7 +1794,7 @@ static int ssl3_get_cert_verify(SSL_HANDSHAKE *hs) { if (!tls12_check_peer_sigalg(ssl, &al, signature_algorithm)) { goto f_err; } - ssl->s3->new_session->peer_signature_algorithm = signature_algorithm; + hs->new_session->peer_signature_algorithm = signature_algorithm; } else if (hs->peer_pubkey->type == EVP_PKEY_RSA) { signature_algorithm = SSL_SIGN_RSA_PKCS1_MD5_SHA1; } else if (hs->peer_pubkey->type == EVP_PKEY_EC) { @@ -1849,7 +1820,7 @@ static int ssl3_get_cert_verify(SSL_HANDSHAKE *hs) { uint8_t digest[EVP_MAX_MD_SIZE]; size_t digest_len; if (!SSL_TRANSCRIPT_ssl3_cert_verify_hash(&hs->transcript, digest, - &digest_len, ssl->s3->new_session, + &digest_len, hs->new_session, signature_algorithm)) { goto err; } @@ -1946,8 +1917,8 @@ static int ssl3_send_new_session_ticket(SSL_HANDSHAKE *hs) { SSL_SESSION *session_copy = NULL; if (ssl->session == NULL) { /* Fix the timeout to measure from the ticket issuance time. */ - ssl_session_rebase_time(ssl, ssl->s3->new_session); - session = ssl->s3->new_session; + ssl_session_rebase_time(ssl, hs->new_session); + session = hs->new_session; } else { /* We are renewing an existing session. Duplicate the session to adjust the * timeout. */ diff --git a/src/ssl/internal.h b/src/ssl/internal.h index 5b93f475..b2c9fcd4 100644 --- a/src/ssl/internal.h +++ b/src/ssl/internal.h @@ -854,9 +854,9 @@ STACK_OF(X509_NAME) * int ssl_add_client_CA_list(SSL *ssl, CBB *cbb); /* ssl_check_leaf_certificate returns one if |pkey| and |leaf| are suitable as - * a server's leaf certificate for |ssl|. Otherwise, it returns zero and pushes + * a server's leaf certificate for |hs|. Otherwise, it returns zero and pushes * an error on the error queue. */ -int ssl_check_leaf_certificate(SSL *ssl, EVP_PKEY *pkey, +int ssl_check_leaf_certificate(SSL_HANDSHAKE *hs, EVP_PKEY *pkey, const CRYPTO_BUFFER *leaf); @@ -1049,6 +1049,13 @@ struct ssl_handshake_st { /* peer_pubkey is the public key parsed from the peer's leaf certificate. */ EVP_PKEY *peer_pubkey; + /* new_session is the new mutable session being established by the current + * handshake. It should not be cached. */ + SSL_SESSION *new_session; + + /* new_cipher is the cipher being negotiated in this handshake. */ + const SSL_CIPHER *new_cipher; + /* key_block is the record-layer key block for TLS 1.2 and earlier. 
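The master-secret hunk above selects between two PRF derivations based on |hs->extended_master_secret|. A sketch of the difference (RFC 7627 versus RFC 5246); |tls12_prf| is a hypothetical stand-in for the cipher's negotiated PRF, not a BoringSSL function.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical PRF stand-in, declared only so the sketch is complete. */
int tls12_prf(uint8_t *out, size_t out_len, const uint8_t *secret,
              size_t secret_len, const char *label, const uint8_t *seed,
              size_t seed_len);

static int derive_master_secret(uint8_t out[48], int extended_master_secret,
                                const uint8_t *premaster, size_t premaster_len,
                                const uint8_t *session_hash, size_t hash_len,
                                const uint8_t randoms[64]) {
  if (extended_master_secret) {
    /* RFC 7627: bind the secret to the whole handshake transcript. */
    return tls12_prf(out, 48, premaster, premaster_len,
                     "extended master secret", session_hash, hash_len);
  }
  /* RFC 5246: bind only client_random || server_random. */
  return tls12_prf(out, 48, premaster, premaster_len, "master secret",
                   randoms, 64);
}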
*/ uint8_t *key_block; uint8_t key_block_len; @@ -1100,6 +1107,10 @@ struct ssl_handshake_st { /* v2_clienthello is one if we received a V2ClientHello. */ unsigned v2_clienthello:1; + /* extended_master_secret is one if the extended master secret extension is + * negotiated in this handshake. */ + unsigned extended_master_secret:1; + /* client_version is the value sent or received in the ClientHello version. */ uint16_t client_version; } /* SSL_HANDSHAKE */; @@ -1323,6 +1334,17 @@ typedef struct cert_st { /* Optional X509_STORE for certificate validation. If NULL the parent SSL_CTX * store is used instead. */ X509_STORE *verify_store; + + /* Signed certificate timestamp list to be sent to the client, if requested */ + CRYPTO_BUFFER *signed_cert_timestamp_list; + + /* OCSP response to be sent to the client, if requested. */ + CRYPTO_BUFFER *ocsp_response; + + /* sid_ctx partitions the session space within a shared session cache or + * ticket key. Only sessions with a matching value will be accepted. */ + uint8_t sid_ctx_length; + uint8_t sid_ctx[SSL_MAX_SID_CTX_LENGTH]; } CERT; /* SSL_METHOD is a compatibility structure to support the legacy version-locked @@ -1594,9 +1616,6 @@ typedef struct ssl3_state_st { * TODO(davidben): Move everything not needed after the handshake completes to * |hs| and remove this. */ struct { - /* used to hold the new cipher we are going to use */ - const SSL_CIPHER *new_cipher; - int message_type; int reuse_message; @@ -1604,20 +1623,8 @@ typedef struct ssl3_state_st { uint8_t new_mac_secret_len; uint8_t new_key_len; uint8_t new_fixed_iv_len; - - /* extended_master_secret indicates whether the extended master secret - * computation is used in this handshake. Note that this is different from - * whether it was used for the current session. If this is a resumption - * handshake then EMS might be negotiated in the client and server hello - * messages, but it doesn't matter if the session that's being resumed - * didn't use it to create the master secret initially. */ - char extended_master_secret; } tmp; - /* new_session is the new mutable session being established by the current - * handshake. It should not be cached. */ - SSL_SESSION *new_session; - /* established_session is the session established by the connection. This * session is only filled upon the completion of the handshake and is * immutable. */ @@ -1798,11 +1805,6 @@ struct ssl_st { * milliseconds. It's used to initialize the timer any time it's restarted. */ unsigned initial_timeout_duration_ms; - /* the session_id_context is used to ensure sessions are only reused - * in the appropriate context */ - uint8_t sid_ctx_length; - uint8_t sid_ctx[SSL_MAX_SID_CTX_LENGTH]; - /* session is the configured session to be offered by the client. This session * is immutable. */ SSL_SESSION *session; @@ -1887,12 +1889,6 @@ struct ssl_st { * hash of the peer's certificate and then discard it to save memory and * session space. Only effective on the server side. */ unsigned retain_only_sha256_of_client_certs:1; - - /* Signed certificate timestamp list to be sent to the client, if requested */ - CRYPTO_BUFFER *signed_cert_timestamp_list; - - /* OCSP response to be sent to the client, if requested. */ - CRYPTO_BUFFER *ocsp_response; }; /* From draft-ietf-tls-tls13-18, used in determining PSK modes. */ @@ -1936,9 +1932,10 @@ int ssl_session_is_context_valid(const SSL *ssl, const SSL_SESSION *session); * it has expired. 
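The |sid_ctx| fields added to CERT above partition a shared session cache, as the struct comment notes. A short usage sketch of the public setter; the context string is an arbitrary application-chosen value, and "app-a:v1" here is a placeholder.

#include <string.h>
#include <openssl/ssl.h>

/* Sketch: two applications sharing one session cache or ticket key can
 * partition it so sessions created by one are never resumed by the other. */
static int partition_cache(SSL_CTX *ctx) {
  static const char kSidCtx[] = "app-a:v1";
  return SSL_CTX_set_session_id_context(ctx, (const uint8_t *)kSidCtx,
                                        strlen(kSidCtx));
}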
*/ int ssl_session_is_time_valid(const SSL *ssl, const SSL_SESSION *session); -/* ssl_session_is_resumable returns one if |session| is resumable for |ssl| and +/* ssl_session_is_resumable returns one if |session| is resumable for |hs| and * zero otherwise. */ -int ssl_session_is_resumable(const SSL *ssl, const SSL_SESSION *session); +int ssl_session_is_resumable(const SSL_HANDSHAKE *hs, + const SSL_SESSION *session); /* SSL_SESSION_get_digest returns the digest used in |session|. If the digest is * invalid, it returns NULL. */ diff --git a/src/ssl/s3_both.c b/src/ssl/s3_both.c index d3f9421b..7fd09c65 100644 --- a/src/ssl/s3_both.c +++ b/src/ssl/s3_both.c @@ -167,6 +167,7 @@ void ssl_handshake_free(SSL_HANDSHAKE *hs) { OPENSSL_free(hs->cookie); OPENSSL_free(hs->key_share_bytes); OPENSSL_free(hs->public_key); + SSL_SESSION_free(hs->new_session); OPENSSL_free(hs->peer_sigalgs); OPENSSL_free(hs->peer_supported_group_list); OPENSSL_free(hs->peer_key); @@ -678,7 +679,6 @@ static int read_v2_client_hello(SSL *ssl) { } int ssl3_get_message(SSL *ssl) { -again: /* Re-create the handshake buffer if needed. */ if (ssl->init_buf == NULL) { ssl->init_buf = BUF_MEM_new(); @@ -733,16 +733,6 @@ again: ssl->s3->tmp.message_type = ((const uint8_t *)ssl->init_buf->data)[0]; ssl->init_msg = (uint8_t*)ssl->init_buf->data + SSL3_HM_HEADER_LENGTH; ssl->init_num = ssl->init_buf->length - SSL3_HM_HEADER_LENGTH; - - /* Ignore stray HelloRequest messages in the handshake before TLS 1.3. Per RFC - * 5246, section 7.4.1.1, the server may send HelloRequest at any time. */ - if (!ssl->server && SSL_in_init(ssl) && - (!ssl->s3->have_version || ssl3_protocol_version(ssl) < TLS1_3_VERSION) && - ssl->s3->tmp.message_type == SSL3_MT_HELLO_REQUEST && - ssl->init_num == 0) { - goto again; - } - return 1; } diff --git a/src/ssl/s3_lib.c b/src/ssl/s3_lib.c index 1c723cd2..57a27c70 100644 --- a/src/ssl/s3_lib.c +++ b/src/ssl/s3_lib.c @@ -197,7 +197,6 @@ void ssl3_free(SSL *ssl) { ssl_read_buffer_clear(ssl); ssl_write_buffer_clear(ssl); - SSL_SESSION_free(ssl->s3->new_session); SSL_SESSION_free(ssl->s3->established_session); ssl_handshake_free(ssl->s3->hs); OPENSSL_free(ssl->s3->next_proto_negotiated); diff --git a/src/ssl/ssl_asn1.c b/src/ssl/ssl_asn1.c index 3582864e..3533225a 100644 --- a/src/ssl/ssl_asn1.c +++ b/src/ssl/ssl_asn1.c @@ -130,6 +130,7 @@ * peerSignatureAlgorithm [23] INTEGER OPTIONAL, * ticketMaxEarlyData [24] INTEGER OPTIONAL, * authTimeout [25] INTEGER OPTIONAL, -- defaults to timeout + * earlyALPN [26] OCTET STRING OPTIONAL, * } * * Note: historically this serialization has included other optional @@ -186,6 +187,8 @@ static const int kTicketMaxEarlyDataTag = CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 24; static const int kAuthTimeoutTag = CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 25; +static const int kEarlyALPNTag = + CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 26; static int SSL_SESSION_to_bytes_full(const SSL_SESSION *in, uint8_t **out_data, size_t *out_len, int for_ticket) { @@ -412,6 +415,16 @@ static int SSL_SESSION_to_bytes_full(const SSL_SESSION *in, uint8_t **out_data, goto err; } + if (in->early_alpn) { + if (!CBB_add_asn1(&session, &child, kEarlyALPNTag) || + !CBB_add_asn1(&child, &child2, CBS_ASN1_OCTETSTRING) || + !CBB_add_bytes(&child2, (const uint8_t *)in->early_alpn, + in->early_alpn_len)) { + OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE); + goto err; + } + } + if (!CBB_finish(&cbb, out_data, out_len)) { OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE); goto err; @@ -800,6 
+813,8 @@ SSL_SESSION *SSL_SESSION_parse(CBS *cbs, const SSL_X509_METHOD *x509_method, kTicketMaxEarlyDataTag, 0) || !SSL_SESSION_parse_long(&session, &ret->auth_timeout, kAuthTimeoutTag, ret->timeout) || + !SSL_SESSION_parse_octet_string(&session, &ret->early_alpn, + &ret->early_alpn_len, kEarlyALPNTag) || CBS_len(&session) != 0) { OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_SSL_SESSION); goto err; diff --git a/src/ssl/ssl_cert.c b/src/ssl/ssl_cert.c index 4177a482..c60c6fa2 100644 --- a/src/ssl/ssl_cert.c +++ b/src/ssl/ssl_cert.c @@ -203,6 +203,19 @@ CERT *ssl_cert_dup(CERT *cert) { ret->verify_store = cert->verify_store; } + if (cert->signed_cert_timestamp_list != NULL) { + CRYPTO_BUFFER_up_ref(cert->signed_cert_timestamp_list); + ret->signed_cert_timestamp_list = cert->signed_cert_timestamp_list; + } + + if (cert->ocsp_response != NULL) { + CRYPTO_BUFFER_up_ref(cert->ocsp_response); + ret->ocsp_response = cert->ocsp_response; + } + + ret->sid_ctx_length = cert->sid_ctx_length; + OPENSSL_memcpy(ret->sid_ctx, cert->sid_ctx, sizeof(ret->sid_ctx)); + return ret; err: @@ -235,6 +248,8 @@ void ssl_cert_free(CERT *c) { ssl_cert_clear_certs(c); OPENSSL_free(c->sigalgs); X509_STORE_free(c->verify_store); + CRYPTO_BUFFER_free(c->signed_cert_timestamp_list); + CRYPTO_BUFFER_free(c->ocsp_response); OPENSSL_free(c); } @@ -883,20 +898,20 @@ void SSL_set_cert_cb(SSL *ssl, int (*cb)(SSL *ssl, void *arg), void *arg) { ssl_cert_set_cert_cb(ssl->cert, cb, arg); } -int ssl_check_leaf_certificate(SSL *ssl, EVP_PKEY *pkey, +int ssl_check_leaf_certificate(SSL_HANDSHAKE *hs, EVP_PKEY *pkey, const CRYPTO_BUFFER *leaf) { + SSL *const ssl = hs->ssl; assert(ssl3_protocol_version(ssl) < TLS1_3_VERSION); /* Check the certificate's type matches the cipher. */ - const SSL_CIPHER *cipher = ssl->s3->tmp.new_cipher; - int expected_type = ssl_cipher_get_key_type(cipher); + int expected_type = ssl_cipher_get_key_type(hs->new_cipher); assert(expected_type != EVP_PKEY_NONE); if (pkey->type != expected_type) { OPENSSL_PUT_ERROR(SSL, SSL_R_WRONG_CERTIFICATE_TYPE); return 0; } - if (cipher->algorithm_auth & SSL_aECDSA) { + if (hs->new_cipher->algorithm_auth & SSL_aECDSA) { CBS leaf_cbs; CBS_init(&leaf_cbs, CRYPTO_BUFFER_data(leaf), CRYPTO_BUFFER_len(leaf)); /* ECDSA and ECDH certificates use the same public key format. 
Instead, @@ -956,3 +971,42 @@ void SSL_CTX_set_client_cert_cb(SSL_CTX *ctx, int (*cb)(SSL *ssl, SSL_CTX_set_cert_cb(ctx, do_client_cert_cb, NULL); ctx->client_cert_cb = cb; } + +static int set_signed_cert_timestamp_list(CERT *cert, const uint8_t *list, + size_t list_len) { + CBS sct_list; + CBS_init(&sct_list, list, list_len); + if (!ssl_is_sct_list_valid(&sct_list)) { + OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_SCT_LIST); + return 0; + } + + CRYPTO_BUFFER_free(cert->signed_cert_timestamp_list); + cert->signed_cert_timestamp_list = + CRYPTO_BUFFER_new(CBS_data(&sct_list), CBS_len(&sct_list), NULL); + return cert->signed_cert_timestamp_list != NULL; +} + +int SSL_CTX_set_signed_cert_timestamp_list(SSL_CTX *ctx, const uint8_t *list, + size_t list_len) { + return set_signed_cert_timestamp_list(ctx->cert, list, list_len); +} + +int SSL_set_signed_cert_timestamp_list(SSL *ssl, const uint8_t *list, + size_t list_len) { + return set_signed_cert_timestamp_list(ssl->cert, list, list_len); +} + +int SSL_CTX_set_ocsp_response(SSL_CTX *ctx, const uint8_t *response, + size_t response_len) { + CRYPTO_BUFFER_free(ctx->cert->ocsp_response); + ctx->cert->ocsp_response = CRYPTO_BUFFER_new(response, response_len, NULL); + return ctx->cert->ocsp_response != NULL; +} + +int SSL_set_ocsp_response(SSL *ssl, const uint8_t *response, + size_t response_len) { + CRYPTO_BUFFER_free(ssl->cert->ocsp_response); + ssl->cert->ocsp_response = CRYPTO_BUFFER_new(response, response_len, NULL); + return ssl->cert->ocsp_response != NULL; +} diff --git a/src/ssl/ssl_lib.c b/src/ssl/ssl_lib.c index c946b77b..d0151bb5 100644 --- a/src/ssl/ssl_lib.c +++ b/src/ssl/ssl_lib.c @@ -363,8 +363,6 @@ void SSL_CTX_free(SSL_CTX *ctx) { OPENSSL_free(ctx->psk_identity_hint); OPENSSL_free(ctx->supported_group_list); OPENSSL_free(ctx->alpn_client_proto_list); - CRYPTO_BUFFER_free(ctx->signed_cert_timestamp_list); - CRYPTO_BUFFER_free(ctx->ocsp_response); EVP_PKEY_free(ctx->tlsext_channel_id_private); OPENSSL_free(ctx); @@ -405,9 +403,6 @@ SSL *SSL_new(SSL_CTX *ctx) { ssl->msg_callback = ctx->msg_callback; ssl->msg_callback_arg = ctx->msg_callback_arg; ssl->verify_mode = ctx->verify_mode; - ssl->sid_ctx_length = ctx->sid_ctx_length; - assert(ssl->sid_ctx_length <= sizeof ssl->sid_ctx); - OPENSSL_memcpy(&ssl->sid_ctx, &ctx->sid_ctx, sizeof(ssl->sid_ctx)); ssl->verify_callback = ctx->default_verify_callback; ssl->retain_only_sha256_of_client_certs = ctx->retain_only_sha256_of_client_certs; @@ -472,18 +467,6 @@ SSL *SSL_new(SSL_CTX *ctx) { ssl->signed_cert_timestamps_enabled = ctx->signed_cert_timestamps_enabled; ssl->ocsp_stapling_enabled = ctx->ocsp_stapling_enabled; - /* If the context has an SCT list, use it. */ - if (ctx->signed_cert_timestamp_list != NULL) { - CRYPTO_BUFFER_up_ref(ctx->signed_cert_timestamp_list); - ssl->signed_cert_timestamp_list = ctx->signed_cert_timestamp_list; - } - - /* If the context has an OCSP response, use it. 
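The relocated SCT setter above validates the list before storing it. The following CBS sketch shows the RFC 6962 SignedCertificateTimestampList shape such a check enforces; it is an illustration, not the literal |ssl_is_sct_list_valid|.

#include <openssl/bytestring.h>

/* Illustrative shape check: a u16-length-prefixed list of u16-length-prefixed,
 * non-empty SCTs, with no trailing data. */
static int sct_list_looks_valid(const uint8_t *in, size_t in_len) {
  CBS cbs, list;
  CBS_init(&cbs, in, in_len);
  if (!CBS_get_u16_length_prefixed(&cbs, &list) ||
      CBS_len(&cbs) != 0 ||
      CBS_len(&list) == 0) {
    return 0;
  }
  while (CBS_len(&list) > 0) {
    CBS sct;
    if (!CBS_get_u16_length_prefixed(&list, &sct) || CBS_len(&sct) == 0) {
      return 0;
    }
  }
  return 1;
}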
*/ - if (ctx->ocsp_response != NULL) { - CRYPTO_BUFFER_up_ref(ctx->ocsp_response); - ssl->ocsp_response = ctx->ocsp_response; - } - return ssl; err: @@ -522,8 +505,6 @@ void SSL_free(SSL *ssl) { OPENSSL_free(ssl->psk_identity_hint); sk_X509_NAME_pop_free(ssl->client_CA, X509_NAME_free); sk_SRTP_PROTECTION_PROFILE_free(ssl->srtp_profiles); - CRYPTO_BUFFER_free(ssl->signed_cert_timestamp_list); - CRYPTO_BUFFER_free(ssl->ocsp_response); if (ssl->method != NULL) { ssl->method->ssl_free(ssl); @@ -800,10 +781,11 @@ int SSL_shutdown(SSL *ssl) { return -1; } - /* We can't shutdown properly if we are in the middle of a handshake. */ + /* If we are in the middle of a handshake, silently succeed. Consumers often + * call this function before |SSL_free|, whether the handshake succeeded or + * not. We assume the caller has already handled failed handshakes. */ if (SSL_in_init(ssl)) { - OPENSSL_PUT_ERROR(SSL, SSL_R_SHUTDOWN_WHILE_IN_INIT); - return -1; + return 1; } if (ssl->quiet_shutdown) { @@ -1088,37 +1070,32 @@ err: return 0; } -int SSL_CTX_set_session_id_context(SSL_CTX *ctx, const uint8_t *sid_ctx, +static int set_session_id_context(CERT *cert, const uint8_t *sid_ctx, size_t sid_ctx_len) { - if (sid_ctx_len > sizeof(ctx->sid_ctx)) { + if (sid_ctx_len > sizeof(cert->sid_ctx)) { OPENSSL_PUT_ERROR(SSL, SSL_R_SSL_SESSION_ID_CONTEXT_TOO_LONG); return 0; } - assert(sizeof(ctx->sid_ctx) < 256); - ctx->sid_ctx_length = (uint8_t)sid_ctx_len; - OPENSSL_memcpy(ctx->sid_ctx, sid_ctx, sid_ctx_len); - + OPENSSL_COMPILE_ASSERT(sizeof(cert->sid_ctx) < 256, sid_ctx_too_large); + cert->sid_ctx_length = (uint8_t)sid_ctx_len; + OPENSSL_memcpy(cert->sid_ctx, sid_ctx, sid_ctx_len); return 1; } +int SSL_CTX_set_session_id_context(SSL_CTX *ctx, const uint8_t *sid_ctx, + size_t sid_ctx_len) { + return set_session_id_context(ctx->cert, sid_ctx, sid_ctx_len); +} + int SSL_set_session_id_context(SSL *ssl, const uint8_t *sid_ctx, size_t sid_ctx_len) { - if (sid_ctx_len > sizeof(ssl->sid_ctx)) { - OPENSSL_PUT_ERROR(SSL, SSL_R_SSL_SESSION_ID_CONTEXT_TOO_LONG); - return 0; - } - - assert(sizeof(ssl->sid_ctx) < 256); - ssl->sid_ctx_length = (uint8_t)sid_ctx_len; - OPENSSL_memcpy(ssl->sid_ctx, sid_ctx, sid_ctx_len); - - return 1; + return set_session_id_context(ssl->cert, sid_ctx, sid_ctx_len); } const uint8_t *SSL_get0_session_id_context(const SSL *ssl, size_t *out_len) { - *out_len = ssl->sid_ctx_length; - return ssl->sid_ctx; + *out_len = ssl->cert->sid_ctx_length; + return ssl->cert->sid_ctx; } void ssl_cipher_preference_list_free( @@ -1247,11 +1224,26 @@ size_t SSL_get_peer_finished(const SSL *ssl, void *buf, size_t count) { int SSL_get_verify_mode(const SSL *ssl) { return ssl->verify_mode; } int SSL_get_extms_support(const SSL *ssl) { + /* TLS 1.3 does not require extended master secret and always reports as + * supporting it. */ if (!ssl->s3->have_version) { return 0; } - return ssl3_protocol_version(ssl) >= TLS1_3_VERSION || - ssl->s3->tmp.extended_master_secret == 1; + if (ssl3_protocol_version(ssl) >= TLS1_3_VERSION) { + return 1; + } + + /* If the initial handshake completed, query the established session. */ + if (ssl->s3->established_session != NULL) { + return ssl->s3->established_session->extended_master_secret; + } + + /* Otherwise, query the in-progress handshake. 
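The SSL_shutdown change above is worth a usage note: tearing down a connection whose handshake never completed no longer produces SSL_R_SHUTDOWN_WHILE_IN_INIT. A minimal teardown sketch under the new semantics:

#include <openssl/ssl.h>

/* Sketch: a common teardown path. SSL_shutdown now returns 1 even when still
 * in the handshake, so callers need not special-case failed handshakes before
 * SSL_free. */
static void teardown(SSL *ssl) {
  SSL_shutdown(ssl);  /* best-effort close_notify; safe mid-handshake */
  SSL_free(ssl);
}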
*/ + if (ssl->s3->hs != NULL) { + return ssl->s3->hs->extended_master_secret; + } + assert(0); + return 0; } int SSL_CTX_get_read_ahead(const SSL_CTX *ctx) { return 0; } @@ -1583,18 +1575,16 @@ void SSL_CTX_enable_signed_cert_timestamps(SSL_CTX *ctx) { ctx->signed_cert_timestamps_enabled = 1; } -int SSL_enable_signed_cert_timestamps(SSL *ssl) { +void SSL_enable_signed_cert_timestamps(SSL *ssl) { ssl->signed_cert_timestamps_enabled = 1; - return 1; } void SSL_CTX_enable_ocsp_stapling(SSL_CTX *ctx) { ctx->ocsp_stapling_enabled = 1; } -int SSL_enable_ocsp_stapling(SSL *ssl) { +void SSL_enable_ocsp_stapling(SSL *ssl) { ssl->ocsp_stapling_enabled = 1; - return 1; } void SSL_get0_signed_cert_timestamp_list(const SSL *ssl, const uint8_t **out, @@ -1624,52 +1614,6 @@ void SSL_get0_ocsp_response(const SSL *ssl, const uint8_t **out, *out_len = session->ocsp_response_length; } -int SSL_CTX_set_signed_cert_timestamp_list(SSL_CTX *ctx, const uint8_t *list, - size_t list_len) { - CBS sct_list; - CBS_init(&sct_list, list, list_len); - if (!ssl_is_sct_list_valid(&sct_list)) { - OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_SCT_LIST); - return 0; - } - - CRYPTO_BUFFER_free(ctx->signed_cert_timestamp_list); - ctx->signed_cert_timestamp_list = CRYPTO_BUFFER_new(CBS_data(&sct_list), - CBS_len(&sct_list), - NULL); - return ctx->signed_cert_timestamp_list != NULL; -} - -int SSL_set_signed_cert_timestamp_list(SSL *ssl, const uint8_t *list, - size_t list_len) { - CBS sct_list; - CBS_init(&sct_list, list, list_len); - if (!ssl_is_sct_list_valid(&sct_list)) { - OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_SCT_LIST); - return 0; - } - - CRYPTO_BUFFER_free(ssl->signed_cert_timestamp_list); - ssl->signed_cert_timestamp_list = CRYPTO_BUFFER_new(CBS_data(&sct_list), - CBS_len(&sct_list), - NULL); - return ssl->signed_cert_timestamp_list != NULL; -} - -int SSL_CTX_set_ocsp_response(SSL_CTX *ctx, const uint8_t *response, - size_t response_len) { - CRYPTO_BUFFER_free(ctx->ocsp_response); - ctx->ocsp_response = CRYPTO_BUFFER_new(response, response_len, NULL); - return ctx->ocsp_response != NULL; -} - -int SSL_set_ocsp_response(SSL *ssl, const uint8_t *response, - size_t response_len) { - CRYPTO_BUFFER_free(ssl->ocsp_response); - ssl->ocsp_response = CRYPTO_BUFFER_new(response, response_len, NULL); - return ssl->ocsp_response != NULL; -} - int SSL_set_tlsext_host_name(SSL *ssl, const char *name) { OPENSSL_free(ssl->tlsext_hostname); ssl->tlsext_hostname = NULL; @@ -2076,10 +2020,6 @@ SSL_CTX *SSL_set_SSL_CTX(SSL *ssl, SSL_CTX *ctx) { SSL_CTX_free(ssl->ctx); ssl->ctx = ctx; - ssl->sid_ctx_length = ctx->sid_ctx_length; - assert(ssl->sid_ctx_length <= sizeof(ssl->sid_ctx)); - OPENSSL_memcpy(ssl->sid_ctx, ctx->sid_ctx, sizeof(ssl->sid_ctx)); - return ssl->ctx; } @@ -2094,12 +2034,7 @@ void (*SSL_get_info_callback(const SSL *ssl))(const SSL *ssl, int type, } int SSL_state(const SSL *ssl) { - if (ssl->s3->hs == NULL) { - assert(ssl->s3->initial_handshake_complete); - return SSL_ST_OK; - } - - return ssl->s3->hs->state; + return SSL_in_init(ssl) ? 
SSL_ST_INIT : SSL_ST_OK; } void SSL_set_state(SSL *ssl, int state) { } @@ -2345,11 +2280,12 @@ int ssl_log_secret(const SSL *ssl, const char *label, const uint8_t *secret, } int SSL_is_init_finished(const SSL *ssl) { - return SSL_state(ssl) == SSL_ST_OK; + return !SSL_in_init(ssl); } int SSL_in_init(const SSL *ssl) { - return (SSL_state(ssl) & SSL_ST_INIT) != 0; + SSL_HANDSHAKE *hs = ssl->s3->hs; + return hs != NULL && hs->state != SSL_ST_OK; } int SSL_in_false_start(const SSL *ssl) { @@ -2575,10 +2511,11 @@ size_t SSL_get_server_random(const SSL *ssl, uint8_t *out, size_t max_out) { } const SSL_CIPHER *SSL_get_pending_cipher(const SSL *ssl) { - if (!SSL_in_init(ssl)) { + SSL_HANDSHAKE *hs = ssl->s3->hs; + if (hs == NULL) { return NULL; } - return ssl->s3->tmp.new_cipher; + return hs->new_cipher; } void SSL_set_retain_only_sha256_of_client_certs(SSL *ssl, int enabled) { diff --git a/src/ssl/ssl_session.c b/src/ssl/ssl_session.c index b71b994c..bbe88c36 100644 --- a/src/ssl/ssl_session.c +++ b/src/ssl/ssl_session.c @@ -280,6 +280,15 @@ SSL_SESSION *SSL_SESSION_dup(SSL_SESSION *session, int dup_flags) { new_session->ticket_age_add = session->ticket_age_add; new_session->ticket_max_early_data = session->ticket_max_early_data; new_session->extended_master_secret = session->extended_master_secret; + + if (session->early_alpn != NULL) { + new_session->early_alpn = + BUF_memdup(session->early_alpn, session->early_alpn_len); + if (new_session->early_alpn == NULL) { + goto err; + } + } + new_session->early_alpn_len = session->early_alpn_len; } /* Copy the ticket. */ @@ -373,6 +382,7 @@ void SSL_SESSION_free(SSL_SESSION *session) { OPENSSL_free(session->tlsext_signed_cert_timestamp_list); OPENSSL_free(session->ocsp_response); OPENSSL_free(session->psk_identity); + OPENSSL_free(session->early_alpn); OPENSSL_cleanse(session, sizeof(*session)); OPENSSL_free(session); } @@ -458,8 +468,8 @@ SSL_SESSION *SSL_get_session(const SSL *ssl) { if (!SSL_in_init(ssl)) { return ssl->s3->established_session; } - if (ssl->s3->new_session != NULL) { - return ssl->s3->new_session; + if (ssl->s3->hs->new_session != NULL) { + return ssl->s3->hs->new_session; } return ssl->session; } @@ -550,19 +560,20 @@ int ssl_get_new_session(SSL_HANDSHAKE *hs, int is_server) { session->session_id_length = 0; } - if (ssl->sid_ctx_length > sizeof(session->sid_ctx)) { + if (ssl->cert->sid_ctx_length > sizeof(session->sid_ctx)) { OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR); goto err; } - OPENSSL_memcpy(session->sid_ctx, ssl->sid_ctx, ssl->sid_ctx_length); - session->sid_ctx_length = ssl->sid_ctx_length; + OPENSSL_memcpy(session->sid_ctx, ssl->cert->sid_ctx, + ssl->cert->sid_ctx_length); + session->sid_ctx_length = ssl->cert->sid_ctx_length; /* The session is marked not resumable until it is completely filled in. 
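The |early_alpn| copy in SSL_SESSION_dup above follows a common pattern for optional byte fields: duplicate only when present and keep the length in sync. A generic sketch of that pattern; the helper name is illustrative.

#include <openssl/buf.h>
#include <openssl/mem.h>

/* Illustrative helper: replace an optional (pointer, length) field with a
 * copy of |src|, treating NULL as absent. Returns 0 on allocation failure. */
static int copy_optional_bytes(uint8_t **out, size_t *out_len,
                               const uint8_t *src, size_t src_len) {
  OPENSSL_free(*out);
  *out = NULL;
  *out_len = 0;
  if (src == NULL) {
    return 1;
  }
  *out = BUF_memdup(src, src_len);
  if (*out == NULL) {
    return 0;
  }
  *out_len = src_len;
  return 1;
}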
*/ session->not_resumable = 1; session->verify_result = X509_V_ERR_INVALID_CALL; - SSL_SESSION_free(ssl->s3->new_session); - ssl->s3->new_session = session; + SSL_SESSION_free(hs->new_session); + hs->new_session = session; ssl_set_session(ssl, NULL); return 1; @@ -668,9 +679,9 @@ int ssl_session_is_context_valid(const SSL *ssl, const SSL_SESSION *session) { return 0; } - return session->sid_ctx_length == ssl->sid_ctx_length && - OPENSSL_memcmp(session->sid_ctx, ssl->sid_ctx, ssl->sid_ctx_length) == - 0; + return session->sid_ctx_length == ssl->cert->sid_ctx_length && + OPENSSL_memcmp(session->sid_ctx, ssl->cert->sid_ctx, + ssl->cert->sid_ctx_length) == 0; } int ssl_session_is_time_valid(const SSL *ssl, const SSL_SESSION *session) { @@ -689,18 +700,20 @@ int ssl_session_is_time_valid(const SSL *ssl, const SSL_SESSION *session) { return session->timeout > (long)now.tv_sec - session->time; } -int ssl_session_is_resumable(const SSL *ssl, const SSL_SESSION *session) { +int ssl_session_is_resumable(const SSL_HANDSHAKE *hs, + const SSL_SESSION *session) { + const SSL *const ssl = hs->ssl; return ssl_session_is_context_valid(ssl, session) && /* The session must have been created by the same type of end point as * we're now using it with. */ - session->is_server == ssl->server && + ssl->server == session->is_server && /* The session must not be expired. */ ssl_session_is_time_valid(ssl, session) && /* Only resume if the session's version matches the negotiated * version. */ ssl->version == session->ssl_version && /* Only resume if the session's cipher matches the negotiated one. */ - ssl->s3->tmp.new_cipher == session->cipher && + hs->new_cipher == session->cipher && /* If the session contains a client certificate (either the full * certificate or just the hash) then require that the form of the * certificate matches the current configuration. */ @@ -898,7 +911,9 @@ static int remove_session_lock(SSL_CTX *ctx, SSL_SESSION *session, int lock) { int SSL_set_session(SSL *ssl, SSL_SESSION *session) { /* SSL_set_session may only be called before the handshake has started. */ - if (SSL_state(ssl) != SSL_ST_INIT || ssl->s3->initial_handshake_complete) { + if (ssl->s3->initial_handshake_complete || + ssl->s3->hs == NULL || + ssl->s3->hs->state != SSL_ST_INIT) { abort(); } diff --git a/src/ssl/ssl_stat.c b/src/ssl/ssl_stat.c index 479288a2..571b4a9a 100644 --- a/src/ssl/ssl_stat.c +++ b/src/ssl/ssl_stat.c @@ -83,11 +83,22 @@ #include <openssl/ssl.h> +#include <assert.h> + #include "internal.h" +static int ssl_state(const SSL *ssl) { + if (ssl->s3->hs == NULL) { + assert(ssl->s3->initial_handshake_complete); + return SSL_ST_OK; + } + + return ssl->s3->hs->state; +} + const char *SSL_state_string_long(const SSL *ssl) { - switch (SSL_state(ssl)) { + switch (ssl_state(ssl)) { case SSL_ST_ACCEPT: return "before accept initialization"; @@ -203,7 +214,7 @@ const char *SSL_state_string_long(const SSL *ssl) { } const char *SSL_state_string(const SSL *ssl) { - switch (SSL_state(ssl)) { + switch (ssl_state(ssl)) { case SSL_ST_ACCEPT: return "AINIT "; diff --git a/src/ssl/ssl_test.cc b/src/ssl/ssl_test.cc index dfab9769..4e0c2747 100644 --- a/src/ssl/ssl_test.cc +++ b/src/ssl/ssl_test.cc @@ -2415,6 +2415,9 @@ static bool TestSNICallback(bool is_dtls, const SSL_METHOD *method, // Test that switching the |SSL_CTX| at the SNI callback behaves correctly. 
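For readers following the SNI test that follows, the production-side shape of a servername callback that swaps contexts looks roughly like this; the names are hypothetical, and SwitchContext in the test plays the same role.

#include <string.h>
#include <openssl/ssl.h>

/* Sketch of an SNI callback that switches to an alternate SSL_CTX for one
 * hostname. |arg| is the alternate context, installed with
 * SSL_CTX_set_tlsext_servername_arg. With this change, the second context's
 * SCT list and OCSP response follow the switch along with the certificate. */
static int select_certificate(SSL *ssl, int *out_alert, void *arg) {
  (void)out_alert;  /* unused: this callback never raises an alert */
  const char *name = SSL_get_servername(ssl, TLSEXT_NAMETYPE_host_name);
  if (name != NULL && strcmp(name, "b.example") == 0) {
    SSL_set_SSL_CTX(ssl, (SSL_CTX *)arg);
  }
  return SSL_TLSEXT_ERR_OK;
}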
static const uint16_t kECDSAWithSHA256 = SSL_SIGN_ECDSA_SECP256R1_SHA256; + static const uint8_t kSCTList[] = {0, 6, 0, 4, 5, 6, 7, 8}; + static const uint8_t kOCSPResponse[] = {1, 2, 3, 4}; + bssl::UniquePtr<SSL_CTX> server_ctx(SSL_CTX_new(method)); bssl::UniquePtr<SSL_CTX> server_ctx2(SSL_CTX_new(method)); bssl::UniquePtr<SSL_CTX> client_ctx(SSL_CTX_new(method)); @@ -2423,6 +2426,10 @@ static bool TestSNICallback(bool is_dtls, const SSL_METHOD *method, !SSL_CTX_use_PrivateKey(server_ctx.get(), key.get()) || !SSL_CTX_use_certificate(server_ctx2.get(), cert2.get()) || !SSL_CTX_use_PrivateKey(server_ctx2.get(), key2.get()) || + !SSL_CTX_set_signed_cert_timestamp_list(server_ctx2.get(), kSCTList, + sizeof(kSCTList)) || + !SSL_CTX_set_ocsp_response(server_ctx2.get(), kOCSPResponse, + sizeof(kOCSPResponse)) || // Historically signing preferences would be lost in some cases with the // SNI callback, which triggers the TLS 1.2 SHA-1 default. To ensure // this doesn't happen when |version| is TLS 1.2, configure the private @@ -2441,6 +2448,9 @@ static bool TestSNICallback(bool is_dtls, const SSL_METHOD *method, SSL_CTX_set_tlsext_servername_callback(server_ctx.get(), SwitchContext); SSL_CTX_set_tlsext_servername_arg(server_ctx.get(), server_ctx2.get()); + SSL_CTX_enable_signed_cert_timestamps(client_ctx.get()); + SSL_CTX_enable_ocsp_stapling(client_ctx.get()); + bssl::UniquePtr<SSL> client, server; if (!ConnectClientAndServer(&client, &server, client_ctx.get(), server_ctx.get(), nullptr)) { @@ -2455,6 +2465,22 @@ static bool TestSNICallback(bool is_dtls, const SSL_METHOD *method, return false; } + // The client should have received |server_ctx2|'s SCT list. + const uint8_t *data; + size_t len; + SSL_get0_signed_cert_timestamp_list(client.get(), &data, &len); + if (Bytes(kSCTList) != Bytes(data, len)) { + fprintf(stderr, "Incorrect SCT list received.\n"); + return false; + } + + // The client should have received |server_ctx2|'s OCSP response. + SSL_get0_ocsp_response(client.get(), &data, &len); + if (Bytes(kOCSPResponse) != Bytes(data, len)) { + fprintf(stderr, "Incorrect OCSP response received.\n"); + return false; + } + return true; } diff --git a/src/ssl/t1_enc.c b/src/ssl/t1_enc.c index d01992e7..9f11e056 100644 --- a/src/ssl/t1_enc.c +++ b/src/ssl/t1_enc.c @@ -330,8 +330,8 @@ static int tls1_setup_key_block(SSL_HANDSHAKE *hs) { } SSL_SESSION *session = ssl->session; - if (ssl->s3->new_session != NULL) { - session = ssl->s3->new_session; + if (hs->new_session != NULL) { + session = hs->new_session; } const EVP_AEAD *aead = NULL; @@ -427,10 +427,9 @@ int tls1_change_cipher_state(SSL_HANDSHAKE *hs, int which) { iv = server_write_iv; } - SSL_AEAD_CTX *aead_ctx = - SSL_AEAD_CTX_new(is_read ? evp_aead_open : evp_aead_seal, - ssl3_protocol_version(ssl), ssl->s3->tmp.new_cipher, key, - key_len, mac_secret, mac_secret_len, iv, iv_len); + SSL_AEAD_CTX *aead_ctx = SSL_AEAD_CTX_new( + is_read ? 
evp_aead_open : evp_aead_seal, ssl3_protocol_version(ssl), + hs->new_cipher, key, key_len, mac_secret, mac_secret_len, iv, iv_len); if (aead_ctx == NULL) { return 0; } @@ -474,7 +473,7 @@ int tls1_generate_master_secret(SSL_HANDSHAKE *hs, uint8_t *out, const uint8_t *premaster, size_t premaster_len) { const SSL *ssl = hs->ssl; - if (ssl->s3->tmp.extended_master_secret) { + if (hs->extended_master_secret) { uint8_t digests[EVP_MAX_MD_SIZE]; size_t digests_len; if (!SSL_TRANSCRIPT_get_hash(&hs->transcript, digests, &digests_len) || diff --git a/src/ssl/t1_lib.c b/src/ssl/t1_lib.c index 7723ccd3..d6ef1ffd 100644 --- a/src/ssl/t1_lib.c +++ b/src/ssl/t1_lib.c @@ -616,9 +616,9 @@ static int ext_sni_parse_serverhello(SSL_HANDSHAKE *hs, uint8_t *out_alert, assert(ssl->tlsext_hostname != NULL); if (ssl->session == NULL) { - OPENSSL_free(ssl->s3->new_session->tlsext_hostname); - ssl->s3->new_session->tlsext_hostname = BUF_strdup(ssl->tlsext_hostname); - if (!ssl->s3->new_session->tlsext_hostname) { + OPENSSL_free(hs->new_session->tlsext_hostname); + hs->new_session->tlsext_hostname = BUF_strdup(ssl->tlsext_hostname); + if (!hs->new_session->tlsext_hostname) { *out_alert = SSL_AD_INTERNAL_ERROR; return 0; } @@ -870,38 +870,32 @@ static int ext_ems_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) { static int ext_ems_parse_serverhello(SSL_HANDSHAKE *hs, uint8_t *out_alert, CBS *contents) { SSL *const ssl = hs->ssl; - /* Whether EMS is negotiated may not change on renegotation. */ - if (ssl->s3->initial_handshake_complete) { - if ((contents != NULL) != ssl->s3->tmp.extended_master_secret) { - OPENSSL_PUT_ERROR(SSL, SSL_R_RENEGOTIATION_EMS_MISMATCH); - *out_alert = SSL_AD_ILLEGAL_PARAMETER; + + if (contents != NULL) { + if (ssl3_protocol_version(ssl) >= TLS1_3_VERSION || + ssl->version == SSL3_VERSION || + CBS_len(contents) != 0) { return 0; } - return 1; - } - - if (contents == NULL) { - return 1; - } - - if (ssl3_protocol_version(ssl) >= TLS1_3_VERSION || - ssl->version == SSL3_VERSION) { - return 0; + hs->extended_master_secret = 1; } - if (CBS_len(contents) != 0) { + /* Whether EMS is negotiated may not change on renegotiation. */ + if (ssl->s3->established_session != NULL && + hs->extended_master_secret != + ssl->s3->established_session->extended_master_secret) { + OPENSSL_PUT_ERROR(SSL, SSL_R_RENEGOTIATION_EMS_MISMATCH); + *out_alert = SSL_AD_ILLEGAL_PARAMETER; return 0; } - ssl->s3->tmp.extended_master_secret = 1; return 1; } static int ext_ems_parse_clienthello(SSL_HANDSHAKE *hs, uint8_t *out_alert, CBS *contents) { - SSL *const ssl = hs->ssl; - uint16_t version = ssl3_protocol_version(ssl); + uint16_t version = ssl3_protocol_version(hs->ssl); if (version >= TLS1_3_VERSION || version == SSL3_VERSION) { return 1; @@ -915,12 +909,12 @@ static int ext_ems_parse_clienthello(SSL_HANDSHAKE *hs, uint8_t *out_alert, return 0; } - ssl->s3->tmp.extended_master_secret = 1; + hs->extended_master_secret = 1; return 1; } static int ext_ems_add_serverhello(SSL_HANDSHAKE *hs, CBB *out) { - if (!hs->ssl->s3->tmp.extended_master_secret) { + if (!hs->extended_master_secret) { return 1; } @@ -1118,7 +1112,7 @@ static int ext_ocsp_parse_serverhello(SSL_HANDSHAKE *hs, uint8_t *out_alert, /* OCSP stapling is forbidden on non-certificate ciphers. 
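The rewritten EMS parser above relies on a convention used throughout the extension callbacks: |contents| is NULL when the peer omitted the extension and non-NULL (possibly empty) when it was present. A skeletal flag-extension parser showing that shape, as an illustration rather than any specific callback:

#include <openssl/bytestring.h>
#include <openssl/ssl.h>

/* Skeleton of the parse-callback convention: NULL |contents| means the
 * extension was absent; a present flag extension must have an empty body. */
static int parse_flag_extension(int *out_negotiated, uint8_t *out_alert,
                                const CBS *contents) {
  *out_negotiated = 0;
  if (contents == NULL) {
    return 1;  /* absent: nothing to negotiate */
  }
  if (CBS_len(contents) != 0) {
    *out_alert = SSL_AD_DECODE_ERROR;  /* flag extensions carry no body */
    return 0;
  }
  *out_negotiated = 1;
  return 1;
}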
*/ if (CBS_len(contents) != 0 || - !ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) { + !ssl_cipher_uses_certificate_auth(hs->new_cipher)) { return 0; } @@ -1152,9 +1146,9 @@ static int ext_ocsp_add_serverhello(SSL_HANDSHAKE *hs, CBB *out) { SSL *const ssl = hs->ssl; if (ssl3_protocol_version(ssl) >= TLS1_3_VERSION || !hs->ocsp_stapling_requested || - ssl->ocsp_response == NULL || + ssl->cert->ocsp_response == NULL || ssl->s3->session_reused || - !ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) { + !ssl_cipher_uses_certificate_auth(hs->new_cipher)) { return 1; } @@ -1341,10 +1335,8 @@ static int ext_sct_parse_serverhello(SSL_HANDSHAKE *hs, uint8_t *out_alert, * * TODO(davidben): Enforce this anyway. */ if (!ssl->s3->session_reused && - !CBS_stow( - contents, - &ssl->s3->new_session->tlsext_signed_cert_timestamp_list, - &ssl->s3->new_session->tlsext_signed_cert_timestamp_list_length)) { + !CBS_stow(contents, &hs->new_session->tlsext_signed_cert_timestamp_list, + &hs->new_session->tlsext_signed_cert_timestamp_list_length)) { *out_alert = SSL_AD_INTERNAL_ERROR; return 0; } @@ -1371,16 +1363,17 @@ static int ext_sct_add_serverhello(SSL_HANDSHAKE *hs, CBB *out) { /* The extension shouldn't be sent when resuming sessions. */ if (ssl3_protocol_version(ssl) >= TLS1_3_VERSION || ssl->s3->session_reused || - ssl->signed_cert_timestamp_list == NULL) { + ssl->cert->signed_cert_timestamp_list == NULL) { return 1; } CBB contents; return CBB_add_u16(out, TLSEXT_TYPE_certificate_timestamp) && CBB_add_u16_length_prefixed(out, &contents) && - CBB_add_bytes(&contents, - CRYPTO_BUFFER_data(ssl->signed_cert_timestamp_list), - CRYPTO_BUFFER_len(ssl->signed_cert_timestamp_list)) && + CBB_add_bytes( + &contents, + CRYPTO_BUFFER_data(ssl->cert->signed_cert_timestamp_list), + CRYPTO_BUFFER_len(ssl->cert->signed_cert_timestamp_list)) && CBB_flush(out); } @@ -1852,8 +1845,8 @@ static int ext_ec_point_add_serverhello(SSL_HANDSHAKE *hs, CBB *out) { return 1; } - const uint32_t alg_k = ssl->s3->tmp.new_cipher->algorithm_mkey; - const uint32_t alg_a = ssl->s3->tmp.new_cipher->algorithm_auth; + const uint32_t alg_k = hs->new_cipher->algorithm_mkey; + const uint32_t alg_a = hs->new_cipher->algorithm_auth; const int using_ecc = (alg_k & SSL_kECDHE) || (alg_a & SSL_aECDSA); if (!using_ecc) { @@ -2218,7 +2211,6 @@ static int ext_key_share_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) { int ssl_ext_key_share_parse_serverhello(SSL_HANDSHAKE *hs, uint8_t **out_secret, size_t *out_secret_len, uint8_t *out_alert, CBS *contents) { - SSL *const ssl = hs->ssl; CBS peer_key; uint16_t group_id; if (!CBS_get_u16(contents, &group_id) || @@ -2240,7 +2232,7 @@ int ssl_ext_key_share_parse_serverhello(SSL_HANDSHAKE *hs, uint8_t **out_secret, return 0; } - ssl->s3->new_session->group_id = group_id; + hs->new_session->group_id = group_id; SSL_ECDH_CTX_cleanup(&hs->ecdh_ctx); return 1; } @@ -2322,7 +2314,6 @@ int ssl_ext_key_share_parse_clienthello(SSL_HANDSHAKE *hs, int *out_found, } int ssl_ext_key_share_add_serverhello(SSL_HANDSHAKE *hs, CBB *out) { - SSL *const ssl = hs->ssl; uint16_t group_id; CBB kse_bytes, public_key; if (!tls1_get_shared_group(hs, &group_id) || @@ -2339,7 +2330,7 @@ int ssl_ext_key_share_add_serverhello(SSL_HANDSHAKE *hs, CBB *out) { hs->public_key = NULL; hs->public_key_len = 0; - ssl->s3->new_session->group_id = group_id; + hs->new_session->group_id = group_id; return 1; } @@ -3518,7 +3509,7 @@ int tls1_channel_id_hash(SSL_HANDSHAKE *hs, uint8_t *out, size_t *out_len) { } /* 
tls1_record_handshake_hashes_for_channel_id records the current handshake - * hashes in |ssl->s3->new_session| so that Channel ID resumptions can sign that + * hashes in |hs->new_session| so that Channel ID resumptions can sign that * data. */ int tls1_record_handshake_hashes_for_channel_id(SSL_HANDSHAKE *hs) { SSL *const ssl = hs->ssl; @@ -3530,18 +3521,18 @@ int tls1_record_handshake_hashes_for_channel_id(SSL_HANDSHAKE *hs) { } OPENSSL_COMPILE_ASSERT( - sizeof(ssl->s3->new_session->original_handshake_hash) == EVP_MAX_MD_SIZE, + sizeof(hs->new_session->original_handshake_hash) == EVP_MAX_MD_SIZE, original_handshake_hash_is_too_small); size_t digest_len; if (!SSL_TRANSCRIPT_get_hash(&hs->transcript, - ssl->s3->new_session->original_handshake_hash, + hs->new_session->original_handshake_hash, &digest_len)) { return -1; } OPENSSL_COMPILE_ASSERT(EVP_MAX_MD_SIZE <= 0xff, max_md_size_is_too_large); - ssl->s3->new_session->original_handshake_hash_len = (uint8_t)digest_len; + hs->new_session->original_handshake_hash_len = (uint8_t)digest_len; return 1; } diff --git a/src/ssl/test/bssl_shim.cc b/src/ssl/test/bssl_shim.cc index 381f4c2f..dd61ffb4 100644 --- a/src/ssl/test/bssl_shim.cc +++ b/src/ssl/test/bssl_shim.cc @@ -1584,13 +1584,11 @@ static bool DoExchange(bssl::UniquePtr<SSL_SESSION> *out_session, !SSL_set_srtp_profiles(ssl.get(), config->srtp_profiles.c_str())) { return false; } - if (config->enable_ocsp_stapling && - !SSL_enable_ocsp_stapling(ssl.get())) { - return false; + if (config->enable_ocsp_stapling) { + SSL_enable_ocsp_stapling(ssl.get()); } - if (config->enable_signed_cert_timestamps && - !SSL_enable_signed_cert_timestamps(ssl.get())) { - return false; + if (config->enable_signed_cert_timestamps) { + SSL_enable_signed_cert_timestamps(ssl.get()); } if (config->min_version != 0 && !SSL_set_min_proto_version(ssl.get(), (uint16_t)config->min_version)) { diff --git a/src/ssl/test/runner/runner.go b/src/ssl/test/runner/runner.go index d6e984a0..d7bad5bf 100644 --- a/src/ssl/test/runner/runner.go +++ b/src/ssl/test/runner/runner.go @@ -6395,7 +6395,7 @@ func addRenegotiationTests() { // this case. https://crbug.com/boringssl/130 }) - // Stray HelloRequests during the handshake are ignored in TLS 1.2. + // We reject stray HelloRequests during the handshake in TLS 1.2. testCases = append(testCases, testCase{ name: "StrayHelloRequest", config: Config{ @@ -6404,6 +6404,8 @@ func addRenegotiationTests() { SendHelloRequestBeforeEveryHandshakeMessage: true, }, }, + shouldFail: true, + expectedError: ":UNEXPECTED_MESSAGE:", }) testCases = append(testCases, testCase{ name: "StrayHelloRequest-Packed", @@ -6414,6 +6416,8 @@ func addRenegotiationTests() { SendHelloRequestBeforeEveryHandshakeMessage: true, }, }, + shouldFail: true, + expectedError: ":UNEXPECTED_MESSAGE:", }) // Test renegotiation works if HelloRequest and server Finished come in diff --git a/src/ssl/tls13_both.c b/src/ssl/tls13_both.c index 19dd555b..91cae9ad 100644 --- a/src/ssl/tls13_both.c +++ b/src/ssl/tls13_both.c @@ -211,7 +211,7 @@ int tls13_process_certificate(SSL_HANDSHAKE *hs, int allow_anonymous) { if (retain_sha256) { /* Retain the hash of the leaf certificate if requested. 
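When |retain_only_sha256_of_client_certs| is set, the TLS 1.3 path above stores a 32-byte digest of the leaf instead of retaining the chain. A sketch of that hash step over a CRYPTO_BUFFER:

#include <openssl/pool.h>
#include <openssl/sha.h>

/* Sketch: compute the digest stored in |peer_sha256| from a leaf certificate
 * held in a CRYPTO_BUFFER, as the code above does. */
static void hash_leaf(const CRYPTO_BUFFER *leaf,
                      uint8_t out[SHA256_DIGEST_LENGTH]) {
  SHA256(CRYPTO_BUFFER_data(leaf), CRYPTO_BUFFER_len(leaf), out);
}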
*/ SHA256(CBS_data(&certificate), CBS_len(&certificate), - ssl->s3->new_session->peer_sha256); + hs->new_session->peer_sha256); } } @@ -262,8 +262,8 @@ int tls13_process_certificate(SSL_HANDSHAKE *hs, int allow_anonymous) { } if (sk_CRYPTO_BUFFER_num(certs) == 1 && - !CBS_stow(&ocsp_response, &ssl->s3->new_session->ocsp_response, - &ssl->s3->new_session->ocsp_response_length)) { + !CBS_stow(&ocsp_response, &hs->new_session->ocsp_response, + &hs->new_session->ocsp_response_length)) { ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR); goto err; } @@ -283,10 +283,9 @@ int tls13_process_certificate(SSL_HANDSHAKE *hs, int allow_anonymous) { } if (sk_CRYPTO_BUFFER_num(certs) == 1 && - !CBS_stow(&sct, - &ssl->s3->new_session->tlsext_signed_cert_timestamp_list, - &ssl->s3->new_session - ->tlsext_signed_cert_timestamp_list_length)) { + !CBS_stow( + &sct, &hs->new_session->tlsext_signed_cert_timestamp_list, + &hs->new_session->tlsext_signed_cert_timestamp_list_length)) { ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR); goto err; } @@ -303,17 +302,17 @@ int tls13_process_certificate(SSL_HANDSHAKE *hs, int allow_anonymous) { hs->peer_pubkey = pkey; pkey = NULL; - sk_CRYPTO_BUFFER_pop_free(ssl->s3->new_session->certs, CRYPTO_BUFFER_free); - ssl->s3->new_session->certs = certs; + sk_CRYPTO_BUFFER_pop_free(hs->new_session->certs, CRYPTO_BUFFER_free); + hs->new_session->certs = certs; certs = NULL; - if (!ssl->ctx->x509_method->session_cache_objects(ssl->s3->new_session)) { + if (!ssl->ctx->x509_method->session_cache_objects(hs->new_session)) { OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR); ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_DECODE_ERROR); goto err; } - if (sk_CRYPTO_BUFFER_num(ssl->s3->new_session->certs) == 0) { + if (sk_CRYPTO_BUFFER_num(hs->new_session->certs) == 0) { if (!allow_anonymous) { OPENSSL_PUT_ERROR(SSL, SSL_R_PEER_DID_NOT_RETURN_A_CERTIFICATE); ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_CERTIFICATE_REQUIRED); @@ -322,17 +321,17 @@ int tls13_process_certificate(SSL_HANDSHAKE *hs, int allow_anonymous) { /* OpenSSL returns X509_V_OK when no certificates are requested. This is * classed by them as a bug, but it's assumed by at least NGINX. */ - ssl->s3->new_session->verify_result = X509_V_OK; + hs->new_session->verify_result = X509_V_OK; /* No certificate, so nothing more to do. 
*/ ret = 1; goto err; } - ssl->s3->new_session->peer_sha256_valid = retain_sha256; + hs->new_session->peer_sha256_valid = retain_sha256; - if (!ssl_verify_cert_chain(ssl, &ssl->s3->new_session->verify_result, - ssl->s3->new_session->x509_chain)) { + if (!ssl_verify_cert_chain(ssl, &hs->new_session->verify_result, + hs->new_session->x509_chain)) { goto err; } @@ -370,7 +369,7 @@ int tls13_process_certificate_verify(SSL_HANDSHAKE *hs) { ssl3_send_alert(ssl, SSL3_AL_FATAL, al); goto err; } - ssl->s3->new_session->peer_signature_algorithm = signature_algorithm; + hs->new_session->peer_signature_algorithm = signature_algorithm; if (!tls13_get_cert_verify_signature_input( hs, &msg, &msg_len, @@ -452,13 +451,14 @@ int tls13_add_certificate(SSL_HANDSHAKE *hs) { goto err; } - if (hs->scts_requested && ssl->signed_cert_timestamp_list != NULL) { + if (hs->scts_requested && ssl->cert->signed_cert_timestamp_list != NULL) { CBB contents; if (!CBB_add_u16(&extensions, TLSEXT_TYPE_certificate_timestamp) || !CBB_add_u16_length_prefixed(&extensions, &contents) || - !CBB_add_bytes(&contents, - CRYPTO_BUFFER_data(ssl->signed_cert_timestamp_list), - CRYPTO_BUFFER_len(ssl->signed_cert_timestamp_list)) || + !CBB_add_bytes( + &contents, + CRYPTO_BUFFER_data(ssl->cert->signed_cert_timestamp_list), + CRYPTO_BUFFER_len(ssl->cert->signed_cert_timestamp_list)) || !CBB_flush(&extensions)) { OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR); goto err; @@ -466,14 +466,15 @@ int tls13_add_certificate(SSL_HANDSHAKE *hs) { } if (hs->ocsp_stapling_requested && - ssl->ocsp_response != NULL) { + ssl->cert->ocsp_response != NULL) { CBB contents, ocsp_response; if (!CBB_add_u16(&extensions, TLSEXT_TYPE_status_request) || !CBB_add_u16_length_prefixed(&extensions, &contents) || !CBB_add_u8(&contents, TLSEXT_STATUSTYPE_ocsp) || !CBB_add_u24_length_prefixed(&contents, &ocsp_response) || - !CBB_add_bytes(&ocsp_response, CRYPTO_BUFFER_data(ssl->ocsp_response), - CRYPTO_BUFFER_len(ssl->ocsp_response)) || + !CBB_add_bytes(&ocsp_response, + CRYPTO_BUFFER_data(ssl->cert->ocsp_response), + CRYPTO_BUFFER_len(ssl->cert->ocsp_response)) || !CBB_flush(&extensions)) { OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR); goto err; diff --git a/src/ssl/tls13_client.c b/src/ssl/tls13_client.c index 50f7e5a6..8e994e58 100644 --- a/src/ssl/tls13_client.c +++ b/src/ssl/tls13_client.c @@ -251,24 +251,34 @@ static enum ssl_hs_wait_t do_process_server_hello(SSL_HANDSHAKE *hs) { ssl->s3->session_reused = 1; /* Only authentication information carries over in TLS 1.3. */ - ssl->s3->new_session = - SSL_SESSION_dup(ssl->session, SSL_SESSION_DUP_AUTH_ONLY); - if (ssl->s3->new_session == NULL) { + hs->new_session = SSL_SESSION_dup(ssl->session, SSL_SESSION_DUP_AUTH_ONLY); + if (hs->new_session == NULL) { ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR); return ssl_hs_error; } ssl_set_session(ssl, NULL); /* Resumption incorporates fresh key material, so refresh the timeout. */ - ssl_session_renew_timeout(ssl, ssl->s3->new_session, + ssl_session_renew_timeout(ssl, hs->new_session, ssl->initial_ctx->session_psk_dhe_timeout); } else if (!ssl_get_new_session(hs, 0)) { ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR); return ssl_hs_error; } - ssl->s3->new_session->cipher = cipher; - ssl->s3->tmp.new_cipher = cipher; + hs->new_session->cipher = cipher; + hs->new_cipher = cipher; + + /* Store the initial negotiated ALPN in the session. 
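The TLS 1.3 hunks above embed the OCSP response and SCT list as extensions of the first CertificateEntry rather than as separate messages. A generic CBB sketch of appending one u16-typed, u16-length-prefixed extension, the pattern this code uses:

#include <openssl/bytestring.h>

/* Sketch: append one extension (type || u16 length || body) to an open
 * extensions block. */
static int add_extension(CBB *extensions, uint16_t type, const uint8_t *body,
                         size_t body_len) {
  CBB contents;
  return CBB_add_u16(extensions, type) &&
         CBB_add_u16_length_prefixed(extensions, &contents) &&
         CBB_add_bytes(&contents, body, body_len) &&
         CBB_flush(extensions);
}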
*/ + if (ssl->s3->alpn_selected != NULL) { + hs->new_session->early_alpn = + BUF_memdup(ssl->s3->alpn_selected, ssl->s3->alpn_selected_len); + if (hs->new_session->early_alpn == NULL) { + ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR); + return ssl_hs_error; + } + hs->new_session->early_alpn_len = ssl->s3->alpn_selected_len; + } /* The PRF hash is now known. Set up the key schedule. */ if (!tls13_init_key_schedule(hs)) { @@ -277,8 +287,8 @@ static enum ssl_hs_wait_t do_process_server_hello(SSL_HANDSHAKE *hs) { /* Incorporate the PSK into the running secret. */ if (ssl->s3->session_reused) { - if (!tls13_advance_key_schedule(hs, ssl->s3->new_session->master_key, - ssl->s3->new_session->master_key_length)) { + if (!tls13_advance_key_schedule(hs, hs->new_session->master_key, + hs->new_session->master_key_length)) { return ssl_hs_error; } } else if (!tls13_advance_key_schedule(hs, kZeroes, hs->hash_len)) { diff --git a/src/ssl/tls13_enc.c b/src/ssl/tls13_enc.c index 4d140e3c..412705da 100644 --- a/src/ssl/tls13_enc.c +++ b/src/ssl/tls13_enc.c @@ -30,7 +30,7 @@ int tls13_init_key_schedule(SSL_HANDSHAKE *hs) { if (!SSL_TRANSCRIPT_init_hash(&hs->transcript, ssl3_protocol_version(hs->ssl), - hs->ssl->s3->tmp.new_cipher->algorithm_prf)) { + hs->new_cipher->algorithm_prf)) { return 0; } @@ -237,17 +237,15 @@ int tls13_rotate_traffic_key(SSL *ssl, enum evp_aead_direction_t direction) { static const char kTLS13LabelResumption[] = "resumption master secret"; int tls13_derive_resumption_secret(SSL_HANDSHAKE *hs) { - SSL *const ssl = hs->ssl; - if (ssl->s3->hs->hash_len > SSL_MAX_MASTER_KEY_LENGTH) { + if (hs->hash_len > SSL_MAX_MASTER_KEY_LENGTH) { OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR); return 0; } - ssl->s3->new_session->master_key_length = hs->hash_len; - return derive_secret(hs, ssl->s3->new_session->master_key, - ssl->s3->new_session->master_key_length, - (const uint8_t *)kTLS13LabelResumption, - strlen(kTLS13LabelResumption)); + hs->new_session->master_key_length = hs->hash_len; + return derive_secret( + hs, hs->new_session->master_key, hs->new_session->master_key_length, + (const uint8_t *)kTLS13LabelResumption, strlen(kTLS13LabelResumption)); } static const char kTLS13LabelFinished[] = "finished"; diff --git a/src/ssl/tls13_server.c b/src/ssl/tls13_server.c index 0278b500..402c2343 100644 --- a/src/ssl/tls13_server.c +++ b/src/ssl/tls13_server.c @@ -150,8 +150,8 @@ static enum ssl_hs_wait_t do_select_parameters(SSL_HANDSHAKE *hs) { } /* Negotiate the cipher suite. */ - ssl->s3->tmp.new_cipher = choose_tls13_cipher(ssl, &client_hello); - if (ssl->s3->tmp.new_cipher == NULL) { + hs->new_cipher = choose_tls13_cipher(ssl, &client_hello); + if (hs->new_cipher == NULL) { OPENSSL_PUT_ERROR(SSL, SSL_R_NO_SHARED_CIPHER); ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE); return ssl_hs_error; @@ -189,7 +189,7 @@ static enum ssl_hs_wait_t do_select_parameters(SSL_HANDSHAKE *hs) { } if (session != NULL && - !ssl_session_is_resumable(ssl, session)) { + !ssl_session_is_resumable(hs, session)) { SSL_SESSION_free(session); session = NULL; } @@ -202,13 +202,13 @@ static enum ssl_hs_wait_t do_select_parameters(SSL_HANDSHAKE *hs) { return ssl_hs_error; } - ssl->s3->new_session->cipher = ssl->s3->tmp.new_cipher; + hs->new_session->cipher = hs->new_cipher; /* On new sessions, stash the SNI value in the session. 
*/ if (hs->hostname != NULL) { - OPENSSL_free(ssl->s3->new_session->tlsext_hostname); - ssl->s3->new_session->tlsext_hostname = BUF_strdup(hs->hostname); - if (ssl->s3->new_session->tlsext_hostname == NULL) { + OPENSSL_free(hs->new_session->tlsext_hostname); + hs->new_session->tlsext_hostname = BUF_strdup(hs->hostname); + if (hs->new_session->tlsext_hostname == NULL) { ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR); return ssl_hs_error; } @@ -222,8 +222,8 @@ static enum ssl_hs_wait_t do_select_parameters(SSL_HANDSHAKE *hs) { } /* Only authentication information carries over in TLS 1.3. */ - ssl->s3->new_session = SSL_SESSION_dup(session, SSL_SESSION_DUP_AUTH_ONLY); - if (ssl->s3->new_session == NULL) { + hs->new_session = SSL_SESSION_dup(session, SSL_SESSION_DUP_AUTH_ONLY); + if (hs->new_session == NULL) { ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR); return ssl_hs_error; } @@ -231,7 +231,7 @@ static enum ssl_hs_wait_t do_select_parameters(SSL_HANDSHAKE *hs) { SSL_SESSION_free(session); /* Resumption incorporates fresh key material, so refresh the timeout. */ - ssl_session_renew_timeout(ssl, ssl->s3->new_session, + ssl_session_renew_timeout(ssl, hs->new_session, ssl->initial_ctx->session_psk_dhe_timeout); } @@ -251,10 +251,21 @@ static enum ssl_hs_wait_t do_select_parameters(SSL_HANDSHAKE *hs) { return ssl_hs_error; } + /* Store the initial negotiated ALPN in the session. */ + if (ssl->s3->alpn_selected != NULL) { + hs->new_session->early_alpn = + BUF_memdup(ssl->s3->alpn_selected, ssl->s3->alpn_selected_len); + if (hs->new_session->early_alpn == NULL) { + ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR); + return ssl_hs_error; + } + hs->new_session->early_alpn_len = ssl->s3->alpn_selected_len; + } + /* Incorporate the PSK into the running secret. */ if (ssl->s3->session_reused) { - if (!tls13_advance_key_schedule(hs, ssl->s3->new_session->master_key, - ssl->s3->new_session->master_key_length)) { + if (!tls13_advance_key_schedule(hs, hs->new_session->master_key, + hs->new_session->master_key_length)) { return ssl_hs_error; } } else if (!tls13_advance_key_schedule(hs, kZeroes, hs->hash_len)) { @@ -340,7 +351,7 @@ static enum ssl_hs_wait_t do_send_server_hello(SSL_HANDSHAKE *hs) { !CBB_add_u16(&body, ssl->version) || !RAND_bytes(ssl->s3->server_random, sizeof(ssl->s3->server_random)) || !CBB_add_bytes(&body, ssl->s3->server_random, SSL3_RANDOM_SIZE) || - !CBB_add_u16(&body, ssl_cipher_get_value(ssl->s3->tmp.new_cipher)) || + !CBB_add_u16(&body, ssl_cipher_get_value(hs->new_cipher)) || !CBB_add_u16_length_prefixed(&body, &extensions) || !ssl_ext_pre_shared_key_add_serverhello(hs, &extensions) || !ssl_ext_key_share_add_serverhello(hs, &extensions)) { @@ -472,7 +483,7 @@ static enum ssl_hs_wait_t do_process_client_certificate(SSL_HANDSHAKE *hs) { if (!hs->cert_request) { /* OpenSSL returns X509_V_OK when no certificates are requested. This is * classed by them as a bug, but it's assumed by at least NGINX. */ - ssl->s3->new_session->verify_result = X509_V_OK; + hs->new_session->verify_result = X509_V_OK; /* Skip this state. */ hs->tls13_state = state_process_channel_id; @@ -495,7 +506,7 @@ static enum ssl_hs_wait_t do_process_client_certificate(SSL_HANDSHAKE *hs) { static enum ssl_hs_wait_t do_process_client_certificate_verify( SSL_HANDSHAKE *hs) { SSL *const ssl = hs->ssl; - if (sk_CRYPTO_BUFFER_num(ssl->s3->new_session->certs) == 0) { + if (sk_CRYPTO_BUFFER_num(hs->new_session->certs) == 0) { /* Skip this state. 
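tls13_advance_key_schedule, called above with either the resumption master key or zeroes, mixes each input into the running secret. Assuming the draft-18 construction (an HKDF-Extract with the current secret as salt), a hedged sketch with BoringSSL's HKDF API; this mirrors, but is not, the internal implementation.

#include <openssl/digest.h>
#include <openssl/hkdf.h>

/* Hedged sketch: advance a draft-18-style key schedule. The running |secret|
 * acts as the HKDF salt; |in| (PSK, (EC)DHE secret, or zeroes) is the keying
 * material. The secret is updated in place. */
static int advance_schedule(uint8_t *secret, size_t secret_len,
                            const EVP_MD *digest, const uint8_t *in,
                            size_t in_len) {
  size_t len;
  return HKDF_extract(secret, &len, digest, in, in_len, secret, secret_len) &&
         len == secret_len;
}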
*/ hs->tls13_state = state_process_channel_id; return ssl_hs_ok; @@ -543,7 +554,7 @@ static enum ssl_hs_wait_t do_process_client_finished(SSL_HANDSHAKE *hs) { /* Rebase the session timestamp so that it is measured from ticket * issuance. */ - ssl_session_rebase_time(ssl, ssl->s3->new_session); + ssl_session_rebase_time(ssl, hs->new_session); hs->tls13_state = state_send_new_session_ticket; return ssl_hs_ok; } @@ -561,7 +572,7 @@ static enum ssl_hs_wait_t do_send_new_session_ticket(SSL_HANDSHAKE *hs) { return ssl_hs_ok; } - SSL_SESSION *session = ssl->s3->new_session; + SSL_SESSION *session = hs->new_session; CBB cbb; CBB_zero(&cbb); diff --git a/src/tool/transport_common.cc b/src/tool/transport_common.cc index cd3e0d69..5f1a366a 100644 --- a/src/tool/transport_common.cc +++ b/src/tool/transport_common.cc @@ -285,6 +285,11 @@ void PrintConnectionInfo(const SSL *ssl) { size_t ocsp_staple_len; SSL_get0_ocsp_response(ssl, &ocsp_staple, &ocsp_staple_len); fprintf(stderr, " OCSP staple: %s\n", ocsp_staple_len > 0 ? "yes" : "no"); + + const uint8_t *sct_list; + size_t sct_list_len; + SSL_get0_signed_cert_timestamp_list(ssl, &sct_list, &sct_list_len); + fprintf(stderr, " SCT list: %s\n", sct_list_len > 0 ? "yes" : "no"); } // Print the server cert subject and issuer names. diff --git a/src/util/all_tests.json b/src/util/all_tests.json index 76637b2c..fc49c698 100644 --- a/src/util/all_tests.json +++ b/src/util/all_tests.json @@ -5,7 +5,6 @@ ["crypto/bio/bio_test"], ["crypto/bn/bn_test", "crypto/bn/bn_tests.txt"], ["crypto/bytestring/bytestring_test"], - ["crypto/chacha/chacha_test"], ["crypto/cipher/aead_test", "aes-128-gcm", "crypto/cipher/test/aes_128_gcm_tests.txt"], ["crypto/cipher/aead_test", "aes-256-gcm", "crypto/cipher/test/aes_256_gcm_tests.txt"], ["crypto/cipher/aead_test", "aes-128-gcm-siv", "crypto/cipher/test/aes_128_gcm_siv_tests.txt"], @@ -33,7 +32,6 @@ ["crypto/curve25519/x25519_test"], ["crypto/curve25519/spake25519_test"], ["crypto/digest/digest_test"], - ["crypto/ec/ec_test"], ["crypto/ec/example_mul"], ["crypto/ec/p256-x86_64_test", "crypto/ec/p256-x86_64_tests.txt"], ["crypto/ecdh/ecdh_test", "crypto/ecdh/ecdh_tests.txt"], @@ -53,7 +51,6 @@ ["crypto/poly1305/poly1305_test", "crypto/poly1305/poly1305_tests.txt"], ["crypto/pool/pool_test"], ["crypto/refcount_test"], - ["crypto/rsa/rsa_test"], ["crypto/thread_test"], ["crypto/x509/pkcs7_test"], ["crypto/x509/x509_test"], diff --git a/src/util/doc.config b/src/util/doc.config index ddd56db2..f7e8baa1 100644 --- a/src/util/doc.config +++ b/src/util/doc.config @@ -16,8 +16,7 @@ "include/openssl/obj.h", "include/openssl/pool.h", "include/openssl/rand.h", - "include/openssl/stack.h", - "include/openssl/time_support.h" + "include/openssl/stack.h" ] },{ "Name": "Low-level crypto primitives", diff --git a/src/util/generate_build_files.py b/src/util/generate_build_files.py index 8be7c906..a3435f2b 100644 --- a/src/util/generate_build_files.py +++ b/src/util/generate_build_files.py @@ -50,20 +50,6 @@ NON_PERL_FILES = { ], } -# For now, GTest-based tests are specified manually. Once everything has updated -# to support GTest, these will be determined automatically by looking for files -# ending with _test.cc. 
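The transport_common.cc hunk above adds an "SCT list" line to PrintConnectionInfo, mirroring the existing OCSP-staple report. As a hedged sketch of using the same getter outside the bssl tool — the surrounding connection setup is assumed, and the list is only populated when the client opted in (e.g. via SSL_enable_signed_cert_timestamps()) before the handshake:

#include <stdio.h>
#include <openssl/ssl.h>

/* Reports whether the peer supplied a signed-certificate-timestamp list.
 * |ssl| is assumed to have completed its handshake. The returned buffer
 * belongs to |ssl| and must not be freed by the caller. */
static void print_sct_presence(const SSL *ssl) {
  const uint8_t *sct_list;
  size_t sct_list_len;
  SSL_get0_signed_cert_timestamp_list(ssl, &sct_list, &sct_list_len);
  (void)sct_list;  /* raw serialized SignedCertificateTimestampList */
  fprintf(stderr, "  SCT list: %s (%zu bytes)\n",
          sct_list_len > 0 ? "yes" : "no", sct_list_len);
}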
-CRYPTO_TEST_SOURCES = [ - 'src/crypto/dh/dh_test.cc', - 'src/crypto/dsa/dsa_test.cc', -] -DECREPIT_TEST_SOURCES = [ - 'src/decrepit/decrepit_test.cc', -] -SSL_TEST_SOURCES = [ - 'src/ssl/ssl_test.cc', -] - PREFIX = None @@ -464,13 +450,6 @@ def OnlyTests(dent, is_dir): non-test sources.""" if is_dir: return dent != 'test' - # For now, GTest-based tests are specified manually. - if dent in [os.path.basename(p) for p in CRYPTO_TEST_SOURCES]: - return False - if dent in [os.path.basename(p) for p in DECREPIT_TEST_SOURCES]: - return False - if dent in [os.path.basename(p) for p in SSL_TEST_SOURCES]: - return False return '_test.' in dent or dent.startswith('example_') @@ -624,6 +603,11 @@ def WriteAsmFiles(perlasms): return asmfiles +def IsGTest(path): + with open(path) as f: + return "#include <gtest/gtest.h>" in f.read() + + def main(platforms): crypto_c_files = FindCFiles(os.path.join('src', 'crypto'), NoTests) ssl_source_files = FindCFiles(os.path.join('src', 'ssl'), NoTests) @@ -643,8 +627,17 @@ def main(platforms): FindHeaderFiles(os.path.join('src', 'crypto', 'test'), AllFiles) + FindHeaderFiles(os.path.join('src', 'ssl', 'test'), AllFiles)) - test_c_files = FindCFiles(os.path.join('src', 'crypto'), OnlyTests) - test_c_files += FindCFiles(os.path.join('src', 'ssl'), OnlyTests) + test_c_files = [] + crypto_test_files = ['src/crypto/test/gtest_main.cc'] + # TODO(davidben): Remove this loop once all tests are converted. + for path in FindCFiles(os.path.join('src', 'crypto'), OnlyTests): + if IsGTest(path): + crypto_test_files.append(path) + else: + test_c_files.append(path) + + ssl_test_files = FindCFiles(os.path.join('src', 'ssl'), OnlyTests) + ssl_test_files.append('src/crypto/test/gtest_main.cc') fuzz_c_files = FindCFiles(os.path.join('src', 'fuzz'), NoTests) @@ -689,15 +682,14 @@ def main(platforms): 'crypto': crypto_c_files, 'crypto_headers': crypto_h_files, 'crypto_internal_headers': crypto_internal_h_files, - 'crypto_test': sorted(CRYPTO_TEST_SOURCES + - ['src/crypto/test/gtest_main.cc']), + 'crypto_test': sorted(crypto_test_files), 'fuzz': fuzz_c_files, 'ssl': ssl_source_files, 'ssl_c': [s for s in ssl_source_files if s.endswith('.c')], 'ssl_cc': [s for s in ssl_source_files if s.endswith('.cc')], 'ssl_headers': ssl_h_files, 'ssl_internal_headers': ssl_internal_h_files, - 'ssl_test': sorted(SSL_TEST_SOURCES + ['src/crypto/test/gtest_main.cc']), + 'ssl_test': sorted(ssl_test_files), 'tool': tool_c_files, 'tool_headers': tool_h_files, 'test': test_c_files, diff --git a/win-x86/crypto/bn/x86-mont.asm b/win-x86/crypto/bn/x86-mont.asm index de7b9499..b1a4d594 100644 --- a/win-x86/crypto/bn/x86-mont.asm +++ b/win-x86/crypto/bn/x86-mont.asm @@ -29,36 +29,51 @@ L$_bn_mul_mont_begin: jl NEAR L$000just_leave lea esi,[20+esp] lea edx,[24+esp] - mov ebp,esp add edi,2 neg edi - lea esp,[edi*4+esp-32] + lea ebp,[edi*4+esp-32] neg edi - mov eax,esp + mov eax,ebp sub eax,edx and eax,2047 - sub esp,eax - xor edx,esp + sub ebp,eax + xor edx,ebp and edx,2048 xor edx,2048 - sub esp,edx - and esp,-64 + sub ebp,edx + and ebp,-64 + mov eax,esp + sub eax,ebp + and eax,-4096 + mov edx,esp + lea esp,[eax*1+ebp] + mov eax,DWORD [esp] + cmp esp,ebp + ja NEAR L$001page_walk + jmp NEAR L$002page_walk_done +align 16 +L$001page_walk: + lea esp,[esp-4096] + mov eax,DWORD [esp] + cmp esp,ebp + ja NEAR L$001page_walk +L$002page_walk_done: mov eax,DWORD [esi] mov ebx,DWORD [4+esi] mov ecx,DWORD [8+esi] - mov edx,DWORD [12+esi] + mov ebp,DWORD [12+esi] mov esi,DWORD [16+esi] mov esi,DWORD [esi] mov DWORD [4+esp],eax 
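The page-walk loop added just above (L$001page_walk, with analogous $L$mul_page_walk, $L$mul4x_page_walk, and $L$sqr8x_page_walk loops in the x86_64 Montgomery files later in this diff) steps the stack pointer down one 4096-byte page at a time, loading a word at each step, so that a large Montgomery scratch allocation cannot jump past a guard page in a single decrement. A minimal C sketch of the intent — an illustration, not code from the tree — assuming 4096-byte pages and a page-multiple distance, as the assembly arranges:

/* Probe every page between the current stack pointer |sp| and the final,
 * lower |target|, so each guard page is faulted in order rather than
 * skipped. Assumes sp - target is a multiple of 4096. */
static void page_walk(volatile unsigned char *sp, unsigned char *target) {
  while (sp > target) {
    sp -= 4096;
    (void)*sp;  /* touch one byte on the newly entered page */
  }
}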
mov DWORD [8+esp],ebx mov DWORD [12+esp],ecx - mov DWORD [16+esp],edx + mov DWORD [16+esp],ebp mov DWORD [20+esp],esi lea ebx,[edi-3] - mov DWORD [24+esp],ebp + mov DWORD [24+esp],edx lea eax,[_OPENSSL_ia32cap_P] bt DWORD [eax],26 - jnc NEAR L$001non_sse2 + jnc NEAR L$003non_sse2 mov eax,-1 movd mm7,eax mov esi,DWORD [8+esp] @@ -82,7 +97,7 @@ L$_bn_mul_mont_begin: psrlq mm3,32 inc ecx align 16 -L$0021st: +L$0041st: pmuludq mm0,mm4 pmuludq mm1,mm5 paddq mm2,mm0 @@ -97,7 +112,7 @@ L$0021st: psrlq mm3,32 lea ecx,[1+ecx] cmp ecx,ebx - jl NEAR L$0021st + jl NEAR L$0041st pmuludq mm0,mm4 pmuludq mm1,mm5 paddq mm2,mm0 @@ -111,7 +126,7 @@ L$0021st: paddq mm3,mm2 movq [32+ebx*4+esp],mm3 inc edx -L$003outer: +L$005outer: xor ecx,ecx movd mm4,DWORD [edx*4+edi] movd mm5,DWORD [esi] @@ -133,7 +148,7 @@ L$003outer: paddq mm2,mm6 inc ecx dec ebx -L$004inner: +L$006inner: pmuludq mm0,mm4 pmuludq mm1,mm5 paddq mm2,mm0 @@ -150,7 +165,7 @@ L$004inner: paddq mm2,mm6 dec ebx lea ecx,[1+ecx] - jnz NEAR L$004inner + jnz NEAR L$006inner mov ebx,ecx pmuludq mm0,mm4 pmuludq mm1,mm5 @@ -168,11 +183,11 @@ L$004inner: movq [32+ebx*4+esp],mm3 lea edx,[1+edx] cmp edx,ebx - jle NEAR L$003outer + jle NEAR L$005outer emms - jmp NEAR L$005common_tail + jmp NEAR L$007common_tail align 16 -L$001non_sse2: +L$003non_sse2: mov esi,DWORD [8+esp] lea ebp,[1+ebx] mov edi,DWORD [12+esp] @@ -183,12 +198,12 @@ L$001non_sse2: lea eax,[4+ebx*4+edi] or ebp,edx mov edi,DWORD [edi] - jz NEAR L$006bn_sqr_mont + jz NEAR L$008bn_sqr_mont mov DWORD [28+esp],eax mov eax,DWORD [esi] xor edx,edx align 16 -L$007mull: +L$009mull: mov ebp,edx mul edi add ebp,eax @@ -197,7 +212,7 @@ L$007mull: mov eax,DWORD [ecx*4+esi] cmp ecx,ebx mov DWORD [28+ecx*4+esp],ebp - jl NEAR L$007mull + jl NEAR L$009mull mov ebp,edx mul edi mov edi,DWORD [20+esp] @@ -215,9 +230,9 @@ L$007mull: mov eax,DWORD [4+esi] adc edx,0 inc ecx - jmp NEAR L$0082ndmadd + jmp NEAR L$0102ndmadd align 16 -L$0091stmadd: +L$0111stmadd: mov ebp,edx mul edi add ebp,DWORD [32+ecx*4+esp] @@ -228,7 +243,7 @@ L$0091stmadd: adc edx,0 cmp ecx,ebx mov DWORD [28+ecx*4+esp],ebp - jl NEAR L$0091stmadd + jl NEAR L$0111stmadd mov ebp,edx mul edi add eax,DWORD [32+ebx*4+esp] @@ -251,7 +266,7 @@ L$0091stmadd: adc edx,0 mov ecx,1 align 16 -L$0082ndmadd: +L$0102ndmadd: mov ebp,edx mul edi add ebp,DWORD [32+ecx*4+esp] @@ -262,7 +277,7 @@ L$0082ndmadd: adc edx,0 cmp ecx,ebx mov DWORD [24+ecx*4+esp],ebp - jl NEAR L$0082ndmadd + jl NEAR L$0102ndmadd mov ebp,edx mul edi add ebp,DWORD [32+ebx*4+esp] @@ -278,16 +293,16 @@ L$0082ndmadd: mov DWORD [32+ebx*4+esp],edx cmp ecx,DWORD [28+esp] mov DWORD [36+ebx*4+esp],eax - je NEAR L$005common_tail + je NEAR L$007common_tail mov edi,DWORD [ecx] mov esi,DWORD [8+esp] mov DWORD [12+esp],ecx xor ecx,ecx xor edx,edx mov eax,DWORD [esi] - jmp NEAR L$0091stmadd + jmp NEAR L$0111stmadd align 16 -L$006bn_sqr_mont: +L$008bn_sqr_mont: mov DWORD [esp],ebx mov DWORD [12+esp],ecx mov eax,edi @@ -298,7 +313,7 @@ L$006bn_sqr_mont: and ebx,1 inc ecx align 16 -L$010sqr: +L$012sqr: mov eax,DWORD [ecx*4+esi] mov ebp,edx mul edi @@ -310,7 +325,7 @@ L$010sqr: cmp ecx,DWORD [esp] mov ebx,eax mov DWORD [28+ecx*4+esp],ebp - jl NEAR L$010sqr + jl NEAR L$012sqr mov eax,DWORD [ecx*4+esi] mov ebp,edx mul edi @@ -334,7 +349,7 @@ L$010sqr: mov eax,DWORD [4+esi] mov ecx,1 align 16 -L$0113rdmadd: +L$0133rdmadd: mov ebp,edx mul edi add ebp,DWORD [32+ecx*4+esp] @@ -353,7 +368,7 @@ L$0113rdmadd: adc edx,0 cmp ecx,ebx mov DWORD [24+ecx*4+esp],ebp - jl NEAR L$0113rdmadd + jl NEAR L$0133rdmadd mov ebp,edx mul 
edi add ebp,DWORD [32+ebx*4+esp] @@ -369,7 +384,7 @@ L$0113rdmadd: mov DWORD [32+ebx*4+esp],edx cmp ecx,ebx mov DWORD [36+ebx*4+esp],eax - je NEAR L$005common_tail + je NEAR L$007common_tail mov edi,DWORD [4+ecx*4+esi] lea ecx,[1+ecx] mov eax,edi @@ -381,12 +396,12 @@ L$0113rdmadd: xor ebp,ebp cmp ecx,ebx lea ecx,[1+ecx] - je NEAR L$012sqrlast + je NEAR L$014sqrlast mov ebx,edx shr edx,1 and ebx,1 align 16 -L$013sqradd: +L$015sqradd: mov eax,DWORD [ecx*4+esi] mov ebp,edx mul edi @@ -402,13 +417,13 @@ L$013sqradd: cmp ecx,DWORD [esp] mov DWORD [28+ecx*4+esp],ebp mov ebx,eax - jle NEAR L$013sqradd + jle NEAR L$015sqradd mov ebp,edx add edx,edx shr ebp,31 add edx,ebx adc ebp,0 -L$012sqrlast: +L$014sqrlast: mov edi,DWORD [20+esp] mov esi,DWORD [16+esp] imul edi,DWORD [32+esp] @@ -423,9 +438,9 @@ L$012sqrlast: adc edx,0 mov ecx,1 mov eax,DWORD [4+esi] - jmp NEAR L$0113rdmadd + jmp NEAR L$0133rdmadd align 16 -L$005common_tail: +L$007common_tail: mov ebp,DWORD [16+esp] mov edi,DWORD [4+esp] lea esi,[32+esp] @@ -433,25 +448,26 @@ L$005common_tail: mov ecx,ebx xor edx,edx align 16 -L$014sub: +L$016sub: sbb eax,DWORD [edx*4+ebp] mov DWORD [edx*4+edi],eax dec ecx mov eax,DWORD [4+edx*4+esi] lea edx,[1+edx] - jge NEAR L$014sub + jge NEAR L$016sub sbb eax,0 + and esi,eax + not eax + mov ebp,edi + and ebp,eax + or esi,ebp align 16 -L$015copy: - mov edx,DWORD [ebx*4+esi] - mov ebp,DWORD [ebx*4+edi] - xor edx,ebp - and edx,eax - xor edx,ebp - mov DWORD [ebx*4+esi],ecx - mov DWORD [ebx*4+edi],edx +L$017copy: + mov eax,DWORD [ebx*4+esi] + mov DWORD [ebx*4+edi],eax + mov DWORD [32+ebx*4+esp],ecx dec ebx - jge NEAR L$015copy + jge NEAR L$017copy mov esp,DWORD [24+esp] mov eax,1 L$000just_leave: diff --git a/win-x86_64/crypto/aes/aes-x86_64.asm b/win-x86_64/crypto/aes/aes-x86_64.asm index 53394f0e..3db1846e 100644 --- a/win-x86_64/crypto/aes/aes-x86_64.asm +++ b/win-x86_64/crypto/aes/aes-x86_64.asm @@ -344,6 +344,7 @@ $L$SEH_begin_asm_AES_encrypt: mov rdx,r8 + mov rax,rsp push rbx push rbp push r12 @@ -352,7 +353,6 @@ $L$SEH_begin_asm_AES_encrypt: push r15 - mov r10,rsp lea rcx,[((-63))+rdx] and rsp,-64 sub rcx,rsp @@ -362,7 +362,7 @@ $L$SEH_begin_asm_AES_encrypt: sub rsp,32 mov QWORD[16+rsp],rsi - mov QWORD[24+rsp],r10 + mov QWORD[24+rsp],rax $L$enc_prologue: mov r15,rdx @@ -394,13 +394,13 @@ $L$enc_prologue: mov DWORD[8+r9],ecx mov DWORD[12+r9],edx - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$enc_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -800,6 +800,7 @@ $L$SEH_begin_asm_AES_decrypt: mov rdx,r8 + mov rax,rsp push rbx push rbp push r12 @@ -808,7 +809,6 @@ $L$SEH_begin_asm_AES_decrypt: push r15 - mov r10,rsp lea rcx,[((-63))+rdx] and rsp,-64 sub rcx,rsp @@ -818,7 +818,7 @@ $L$SEH_begin_asm_AES_decrypt: sub rsp,32 mov QWORD[16+rsp],rsi - mov QWORD[24+rsp],r10 + mov QWORD[24+rsp],rax $L$dec_prologue: mov r15,rdx @@ -852,13 +852,13 @@ $L$dec_prologue: mov DWORD[8+r9],ecx mov DWORD[12+r9],edx - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov 
rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$dec_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -1367,10 +1367,9 @@ $L$cbc_prologue: mov r9d,r9d lea r14,[$L$AES_Te] + lea r10,[$L$AES_Td] cmp r9,0 - jne NEAR $L$cbc_picked_te - lea r14,[$L$AES_Td] -$L$cbc_picked_te: + cmove r14,r10 mov r10d,DWORD[OPENSSL_ia32cap_P] cmp rdx,512 @@ -2626,7 +2625,6 @@ block_se_handler: jae NEAR $L$in_block_prologue mov rax,QWORD[24+rax] - lea rax,[48+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] diff --git a/win-x86_64/crypto/aes/aesni-x86_64.asm b/win-x86_64/crypto/aes/aesni-x86_64.asm index cf313d1a..d5d454d9 100644 --- a/win-x86_64/crypto/aes/aesni-x86_64.asm +++ b/win-x86_64/crypto/aes/aesni-x86_64.asm @@ -1129,22 +1129,21 @@ DB 102,15,56,221,209 ALIGN 16 $L$ctr32_bulk: - lea rax,[rsp] + lea r11,[rsp] push rbp sub rsp,288 and rsp,-16 - movaps XMMWORD[(-168)+rax],xmm6 - movaps XMMWORD[(-152)+rax],xmm7 - movaps XMMWORD[(-136)+rax],xmm8 - movaps XMMWORD[(-120)+rax],xmm9 - movaps XMMWORD[(-104)+rax],xmm10 - movaps XMMWORD[(-88)+rax],xmm11 - movaps XMMWORD[(-72)+rax],xmm12 - movaps XMMWORD[(-56)+rax],xmm13 - movaps XMMWORD[(-40)+rax],xmm14 - movaps XMMWORD[(-24)+rax],xmm15 + movaps XMMWORD[(-168)+r11],xmm6 + movaps XMMWORD[(-152)+r11],xmm7 + movaps XMMWORD[(-136)+r11],xmm8 + movaps XMMWORD[(-120)+r11],xmm9 + movaps XMMWORD[(-104)+r11],xmm10 + movaps XMMWORD[(-88)+r11],xmm11 + movaps XMMWORD[(-72)+r11],xmm12 + movaps XMMWORD[(-56)+r11],xmm13 + movaps XMMWORD[(-40)+r11],xmm14 + movaps XMMWORD[(-24)+r11],xmm15 $L$ctr32_body: - lea rbp,[((-8))+rax] @@ -1153,7 +1152,7 @@ $L$ctr32_body: movdqu xmm0,XMMWORD[rcx] mov r8d,DWORD[12+r8] pxor xmm2,xmm0 - mov r11d,DWORD[12+rcx] + mov ebp,DWORD[12+rcx] movdqa XMMWORD[rsp],xmm2 bswap r8d movdqa xmm3,xmm2 @@ -1169,8 +1168,8 @@ $L$ctr32_body: lea rdx,[2+r8] bswap eax bswap edx - xor eax,r11d - xor edx,r11d + xor eax,ebp + xor edx,ebp DB 102,15,58,34,216,3 lea rax,[3+r8] movdqa XMMWORD[16+rsp],xmm3 @@ -1179,25 +1178,25 @@ DB 102,15,58,34,226,3 mov rdx,r10 lea r10,[4+r8] movdqa XMMWORD[32+rsp],xmm4 - xor eax,r11d + xor eax,ebp bswap r10d DB 102,15,58,34,232,3 - xor r10d,r11d + xor r10d,ebp movdqa XMMWORD[48+rsp],xmm5 lea r9,[5+r8] mov DWORD[((64+12))+rsp],r10d bswap r9d lea r10,[6+r8] mov eax,DWORD[240+rcx] - xor r9d,r11d + xor r9d,ebp bswap r10d mov DWORD[((80+12))+rsp],r9d - xor r10d,r11d + xor r10d,ebp lea r9,[7+r8] mov DWORD[((96+12))+rsp],r10d bswap r9d mov r10d,DWORD[((OPENSSL_ia32cap_P+4))] - xor r9d,r11d + xor r9d,ebp and r10d,71303168 mov DWORD[((112+12))+rsp],r9d @@ -1221,7 +1220,7 @@ ALIGN 16 $L$ctr32_6x: shl eax,4 mov r10d,48 - bswap r11d + bswap ebp lea rcx,[32+rax*1+rcx] sub r10,rax jmp NEAR $L$ctr32_loop6 @@ -1232,32 +1231,32 @@ $L$ctr32_loop6: movups xmm0,XMMWORD[((-48))+r10*1+rcx] DB 102,15,56,220,209 mov eax,r8d - xor eax,r11d + xor eax,ebp DB 102,15,56,220,217 DB 0x0f,0x38,0xf1,0x44,0x24,12 lea eax,[1+r8] DB 102,15,56,220,225 - xor eax,r11d + xor eax,ebp DB 0x0f,0x38,0xf1,0x44,0x24,28 DB 102,15,56,220,233 lea eax,[2+r8] - xor eax,r11d + xor eax,ebp DB 102,15,56,220,241 DB 0x0f,0x38,0xf1,0x44,0x24,44 lea eax,[3+r8] DB 102,15,56,220,249 movups xmm1,XMMWORD[((-32))+r10*1+rcx] - xor eax,r11d + xor eax,ebp DB 102,15,56,220,208 DB 0x0f,0x38,0xf1,0x44,0x24,60 lea eax,[4+r8] DB 102,15,56,220,216 - xor eax,r11d + xor eax,ebp DB 0x0f,0x38,0xf1,0x44,0x24,76 DB 102,15,56,220,224 lea eax,[5+r8] - xor eax,r11d + xor eax,ebp DB 102,15,56,220,232 DB 0x0f,0x38,0xf1,0x44,0x24,92 mov rax,r10 @@ -1318,7 +1317,7 @@ DB 
102,15,56,220,217 bswap r9d movups xmm0,XMMWORD[((32-128))+rcx] DB 102,15,56,220,225 - xor r9d,r11d + xor r9d,ebp nop DB 102,15,56,220,233 mov DWORD[((0+12))+rsp],r9d @@ -1331,7 +1330,7 @@ DB 102,68,15,56,220,201 bswap r9d DB 102,15,56,220,208 DB 102,15,56,220,216 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,224 DB 102,15,56,220,232 @@ -1345,7 +1344,7 @@ DB 102,68,15,56,220,200 bswap r9d DB 102,15,56,220,209 DB 102,15,56,220,217 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,225 DB 102,15,56,220,233 @@ -1359,7 +1358,7 @@ DB 102,68,15,56,220,201 bswap r9d DB 102,15,56,220,208 DB 102,15,56,220,216 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,224 DB 102,15,56,220,232 @@ -1373,7 +1372,7 @@ DB 102,68,15,56,220,200 bswap r9d DB 102,15,56,220,209 DB 102,15,56,220,217 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,225 DB 102,15,56,220,233 @@ -1387,7 +1386,7 @@ DB 102,68,15,56,220,201 bswap r9d DB 102,15,56,220,208 DB 102,15,56,220,216 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,224 DB 102,15,56,220,232 @@ -1401,7 +1400,7 @@ DB 102,68,15,56,220,200 bswap r9d DB 102,15,56,220,209 DB 102,15,56,220,217 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,225 DB 102,15,56,220,233 @@ -1416,7 +1415,7 @@ DB 102,68,15,56,220,201 DB 102,15,56,220,208 DB 102,15,56,220,216 DB 102,15,56,220,224 - xor r9d,r11d + xor r9d,ebp movdqu xmm10,XMMWORD[rdi] DB 102,15,56,220,232 mov DWORD[((112+12))+rsp],r9d @@ -1651,32 +1650,32 @@ DB 102,15,56,221,225 $L$ctr32_done: xorps xmm0,xmm0 - xor r11d,r11d + xor ebp,ebp pxor xmm1,xmm1 pxor xmm2,xmm2 pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 - movaps xmm6,XMMWORD[((-160))+rbp] - movaps XMMWORD[(-160)+rbp],xmm0 - movaps xmm7,XMMWORD[((-144))+rbp] - movaps XMMWORD[(-144)+rbp],xmm0 - movaps xmm8,XMMWORD[((-128))+rbp] - movaps XMMWORD[(-128)+rbp],xmm0 - movaps xmm9,XMMWORD[((-112))+rbp] - movaps XMMWORD[(-112)+rbp],xmm0 - movaps xmm10,XMMWORD[((-96))+rbp] - movaps XMMWORD[(-96)+rbp],xmm0 - movaps xmm11,XMMWORD[((-80))+rbp] - movaps XMMWORD[(-80)+rbp],xmm0 - movaps xmm12,XMMWORD[((-64))+rbp] - movaps XMMWORD[(-64)+rbp],xmm0 - movaps xmm13,XMMWORD[((-48))+rbp] - movaps XMMWORD[(-48)+rbp],xmm0 - movaps xmm14,XMMWORD[((-32))+rbp] - movaps XMMWORD[(-32)+rbp],xmm0 - movaps xmm15,XMMWORD[((-16))+rbp] - movaps XMMWORD[(-16)+rbp],xmm0 + movaps xmm6,XMMWORD[((-168))+r11] + movaps XMMWORD[(-168)+r11],xmm0 + movaps xmm7,XMMWORD[((-152))+r11] + movaps XMMWORD[(-152)+r11],xmm0 + movaps xmm8,XMMWORD[((-136))+r11] + movaps XMMWORD[(-136)+r11],xmm0 + movaps xmm9,XMMWORD[((-120))+r11] + movaps XMMWORD[(-120)+r11],xmm0 + movaps xmm10,XMMWORD[((-104))+r11] + movaps XMMWORD[(-104)+r11],xmm0 + movaps xmm11,XMMWORD[((-88))+r11] + movaps XMMWORD[(-88)+r11],xmm0 + movaps xmm12,XMMWORD[((-72))+r11] + movaps XMMWORD[(-72)+r11],xmm0 + movaps xmm13,XMMWORD[((-56))+r11] + movaps XMMWORD[(-56)+r11],xmm0 + movaps xmm14,XMMWORD[((-40))+r11] + movaps XMMWORD[(-40)+r11],xmm0 + movaps xmm15,XMMWORD[((-24))+r11] + movaps XMMWORD[(-24)+r11],xmm0 movaps XMMWORD[rsp],xmm0 movaps XMMWORD[16+rsp],xmm0 movaps XMMWORD[32+rsp],xmm0 @@ -1685,8 +1684,8 @@ $L$ctr32_done: movaps XMMWORD[80+rsp],xmm0 movaps XMMWORD[96+rsp],xmm0 movaps XMMWORD[112+rsp],xmm0 - lea rsp,[rbp] - pop rbp + mov rbp,QWORD[((-8))+r11] + lea rsp,[r11] $L$ctr32_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -1708,22 +1707,21 @@ $L$SEH_begin_aesni_xts_encrypt: mov r9,QWORD[48+rsp] - lea rax,[rsp] + lea r11,[rsp] push rbp sub rsp,272 and rsp,-16 - movaps 
XMMWORD[(-168)+rax],xmm6 - movaps XMMWORD[(-152)+rax],xmm7 - movaps XMMWORD[(-136)+rax],xmm8 - movaps XMMWORD[(-120)+rax],xmm9 - movaps XMMWORD[(-104)+rax],xmm10 - movaps XMMWORD[(-88)+rax],xmm11 - movaps XMMWORD[(-72)+rax],xmm12 - movaps XMMWORD[(-56)+rax],xmm13 - movaps XMMWORD[(-40)+rax],xmm14 - movaps XMMWORD[(-24)+rax],xmm15 + movaps XMMWORD[(-168)+r11],xmm6 + movaps XMMWORD[(-152)+r11],xmm7 + movaps XMMWORD[(-136)+r11],xmm8 + movaps XMMWORD[(-120)+r11],xmm9 + movaps XMMWORD[(-104)+r11],xmm10 + movaps XMMWORD[(-88)+r11],xmm11 + movaps XMMWORD[(-72)+r11],xmm12 + movaps XMMWORD[(-56)+r11],xmm13 + movaps XMMWORD[(-40)+r11],xmm14 + movaps XMMWORD[(-24)+r11],xmm15 $L$xts_enc_body: - lea rbp,[((-8))+rax] movups xmm2,XMMWORD[r9] mov eax,DWORD[240+r8] mov r10d,DWORD[240+rcx] @@ -1739,7 +1737,7 @@ DB 102,15,56,220,209 jnz NEAR $L$oop_enc1_8 DB 102,15,56,221,209 movups xmm0,XMMWORD[rcx] - mov r11,rcx + mov rbp,rcx mov eax,r10d shl r10d,4 mov r9,rdx @@ -1795,9 +1793,9 @@ DB 102,15,56,221,209 jc NEAR $L$xts_enc_short mov eax,16+96 - lea rcx,[32+r10*1+r11] + lea rcx,[32+r10*1+rbp] sub rax,r10 - movups xmm1,XMMWORD[16+r11] + movups xmm1,XMMWORD[16+rbp] mov r10,rax lea r8,[$L$xts_magic] jmp NEAR $L$xts_enc_grandloop @@ -1822,7 +1820,7 @@ DB 102,15,56,220,225 movdqa xmm9,XMMWORD[96+rsp] pxor xmm6,xmm14 DB 102,15,56,220,233 - movups xmm0,XMMWORD[32+r11] + movups xmm0,XMMWORD[32+rbp] lea rdi,[96+rdi] pxor xmm7,xmm8 @@ -1831,7 +1829,7 @@ DB 102,15,56,220,241 pxor xmm11,xmm9 movdqa XMMWORD[rsp],xmm10 DB 102,15,56,220,249 - movups xmm1,XMMWORD[48+r11] + movups xmm1,XMMWORD[48+rbp] pxor xmm12,xmm9 DB 102,15,56,220,208 @@ -1846,7 +1844,7 @@ DB 102,15,56,220,232 movdqa XMMWORD[64+rsp],xmm14 DB 102,15,56,220,240 DB 102,15,56,220,248 - movups xmm0,XMMWORD[64+r11] + movups xmm0,XMMWORD[64+rbp] movdqa XMMWORD[80+rsp],xmm8 pshufd xmm9,xmm15,0x5f jmp NEAR $L$xts_enc_loop6 @@ -1878,7 +1876,7 @@ DB 102,15,56,220,209 psrad xmm14,31 DB 102,15,56,220,217 pand xmm14,xmm8 - movups xmm10,XMMWORD[r11] + movups xmm10,XMMWORD[rbp] DB 102,15,56,220,225 DB 102,15,56,220,233 DB 102,15,56,220,241 @@ -1946,10 +1944,10 @@ DB 102,15,56,220,217 DB 102,15,56,220,225 DB 102,15,56,220,233 pxor xmm15,xmm0 - movups xmm0,XMMWORD[r11] + movups xmm0,XMMWORD[rbp] DB 102,15,56,220,241 DB 102,15,56,220,249 - movups xmm1,XMMWORD[16+r11] + movups xmm1,XMMWORD[16+rbp] pxor xmm14,xmm15 DB 102,15,56,221,84,36,0 @@ -1976,7 +1974,7 @@ DB 102,15,56,221,124,36,80 mov eax,16+96 sub eax,r10d - mov rcx,r11 + mov rcx,rbp shr eax,4 $L$xts_enc_short: @@ -2132,7 +2130,7 @@ $L$xts_enc_steal: jnz NEAR $L$xts_enc_steal sub rsi,r9 - mov rcx,r11 + mov rcx,rbp mov eax,r10d movups xmm2,XMMWORD[((-16))+rsi] @@ -2158,26 +2156,26 @@ $L$xts_enc_ret: pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 - movaps xmm6,XMMWORD[((-160))+rbp] - movaps XMMWORD[(-160)+rbp],xmm0 - movaps xmm7,XMMWORD[((-144))+rbp] - movaps XMMWORD[(-144)+rbp],xmm0 - movaps xmm8,XMMWORD[((-128))+rbp] - movaps XMMWORD[(-128)+rbp],xmm0 - movaps xmm9,XMMWORD[((-112))+rbp] - movaps XMMWORD[(-112)+rbp],xmm0 - movaps xmm10,XMMWORD[((-96))+rbp] - movaps XMMWORD[(-96)+rbp],xmm0 - movaps xmm11,XMMWORD[((-80))+rbp] - movaps XMMWORD[(-80)+rbp],xmm0 - movaps xmm12,XMMWORD[((-64))+rbp] - movaps XMMWORD[(-64)+rbp],xmm0 - movaps xmm13,XMMWORD[((-48))+rbp] - movaps XMMWORD[(-48)+rbp],xmm0 - movaps xmm14,XMMWORD[((-32))+rbp] - movaps XMMWORD[(-32)+rbp],xmm0 - movaps xmm15,XMMWORD[((-16))+rbp] - movaps XMMWORD[(-16)+rbp],xmm0 + movaps xmm6,XMMWORD[((-168))+r11] + movaps XMMWORD[(-168)+r11],xmm0 + movaps 
xmm7,XMMWORD[((-152))+r11] + movaps XMMWORD[(-152)+r11],xmm0 + movaps xmm8,XMMWORD[((-136))+r11] + movaps XMMWORD[(-136)+r11],xmm0 + movaps xmm9,XMMWORD[((-120))+r11] + movaps XMMWORD[(-120)+r11],xmm0 + movaps xmm10,XMMWORD[((-104))+r11] + movaps XMMWORD[(-104)+r11],xmm0 + movaps xmm11,XMMWORD[((-88))+r11] + movaps XMMWORD[(-88)+r11],xmm0 + movaps xmm12,XMMWORD[((-72))+r11] + movaps XMMWORD[(-72)+r11],xmm0 + movaps xmm13,XMMWORD[((-56))+r11] + movaps XMMWORD[(-56)+r11],xmm0 + movaps xmm14,XMMWORD[((-40))+r11] + movaps XMMWORD[(-40)+r11],xmm0 + movaps xmm15,XMMWORD[((-24))+r11] + movaps XMMWORD[(-24)+r11],xmm0 movaps XMMWORD[rsp],xmm0 movaps XMMWORD[16+rsp],xmm0 movaps XMMWORD[32+rsp],xmm0 @@ -2185,8 +2183,8 @@ $L$xts_enc_ret: movaps XMMWORD[64+rsp],xmm0 movaps XMMWORD[80+rsp],xmm0 movaps XMMWORD[96+rsp],xmm0 - lea rsp,[rbp] - pop rbp + mov rbp,QWORD[((-8))+r11] + lea rsp,[r11] $L$xts_enc_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -2208,22 +2206,21 @@ $L$SEH_begin_aesni_xts_decrypt: mov r9,QWORD[48+rsp] - lea rax,[rsp] + lea r11,[rsp] push rbp sub rsp,272 and rsp,-16 - movaps XMMWORD[(-168)+rax],xmm6 - movaps XMMWORD[(-152)+rax],xmm7 - movaps XMMWORD[(-136)+rax],xmm8 - movaps XMMWORD[(-120)+rax],xmm9 - movaps XMMWORD[(-104)+rax],xmm10 - movaps XMMWORD[(-88)+rax],xmm11 - movaps XMMWORD[(-72)+rax],xmm12 - movaps XMMWORD[(-56)+rax],xmm13 - movaps XMMWORD[(-40)+rax],xmm14 - movaps XMMWORD[(-24)+rax],xmm15 + movaps XMMWORD[(-168)+r11],xmm6 + movaps XMMWORD[(-152)+r11],xmm7 + movaps XMMWORD[(-136)+r11],xmm8 + movaps XMMWORD[(-120)+r11],xmm9 + movaps XMMWORD[(-104)+r11],xmm10 + movaps XMMWORD[(-88)+r11],xmm11 + movaps XMMWORD[(-72)+r11],xmm12 + movaps XMMWORD[(-56)+r11],xmm13 + movaps XMMWORD[(-40)+r11],xmm14 + movaps XMMWORD[(-24)+r11],xmm15 $L$xts_dec_body: - lea rbp,[((-8))+rax] movups xmm2,XMMWORD[r9] mov eax,DWORD[240+r8] mov r10d,DWORD[240+rcx] @@ -2245,7 +2242,7 @@ DB 102,15,56,221,209 sub rdx,rax movups xmm0,XMMWORD[rcx] - mov r11,rcx + mov rbp,rcx mov eax,r10d shl r10d,4 mov r9,rdx @@ -2301,9 +2298,9 @@ DB 102,15,56,221,209 jc NEAR $L$xts_dec_short mov eax,16+96 - lea rcx,[32+r10*1+r11] + lea rcx,[32+r10*1+rbp] sub rax,r10 - movups xmm1,XMMWORD[16+r11] + movups xmm1,XMMWORD[16+rbp] mov r10,rax lea r8,[$L$xts_magic] jmp NEAR $L$xts_dec_grandloop @@ -2328,7 +2325,7 @@ DB 102,15,56,222,225 movdqa xmm9,XMMWORD[96+rsp] pxor xmm6,xmm14 DB 102,15,56,222,233 - movups xmm0,XMMWORD[32+r11] + movups xmm0,XMMWORD[32+rbp] lea rdi,[96+rdi] pxor xmm7,xmm8 @@ -2337,7 +2334,7 @@ DB 102,15,56,222,241 pxor xmm11,xmm9 movdqa XMMWORD[rsp],xmm10 DB 102,15,56,222,249 - movups xmm1,XMMWORD[48+r11] + movups xmm1,XMMWORD[48+rbp] pxor xmm12,xmm9 DB 102,15,56,222,208 @@ -2352,7 +2349,7 @@ DB 102,15,56,222,232 movdqa XMMWORD[64+rsp],xmm14 DB 102,15,56,222,240 DB 102,15,56,222,248 - movups xmm0,XMMWORD[64+r11] + movups xmm0,XMMWORD[64+rbp] movdqa XMMWORD[80+rsp],xmm8 pshufd xmm9,xmm15,0x5f jmp NEAR $L$xts_dec_loop6 @@ -2384,7 +2381,7 @@ DB 102,15,56,222,209 psrad xmm14,31 DB 102,15,56,222,217 pand xmm14,xmm8 - movups xmm10,XMMWORD[r11] + movups xmm10,XMMWORD[rbp] DB 102,15,56,222,225 DB 102,15,56,222,233 DB 102,15,56,222,241 @@ -2452,10 +2449,10 @@ DB 102,15,56,222,217 DB 102,15,56,222,225 DB 102,15,56,222,233 pxor xmm15,xmm0 - movups xmm0,XMMWORD[r11] + movups xmm0,XMMWORD[rbp] DB 102,15,56,222,241 DB 102,15,56,222,249 - movups xmm1,XMMWORD[16+r11] + movups xmm1,XMMWORD[16+rbp] pxor xmm14,xmm15 DB 102,15,56,223,84,36,0 @@ -2482,7 +2479,7 @@ DB 102,15,56,223,124,36,80 mov eax,16+96 sub 
eax,r10d - mov rcx,r11 + mov rcx,rbp shr eax,4 $L$xts_dec_short: @@ -2639,7 +2636,7 @@ $L$xts_dec_done: jz NEAR $L$xts_dec_ret $L$xts_dec_done2: mov rdx,r9 - mov rcx,r11 + mov rcx,rbp mov eax,r10d movups xmm2,XMMWORD[rdi] @@ -2669,7 +2666,7 @@ $L$xts_dec_steal: jnz NEAR $L$xts_dec_steal sub rsi,r9 - mov rcx,r11 + mov rcx,rbp mov eax,r10d movups xmm2,XMMWORD[rsi] @@ -2695,26 +2692,26 @@ $L$xts_dec_ret: pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 - movaps xmm6,XMMWORD[((-160))+rbp] - movaps XMMWORD[(-160)+rbp],xmm0 - movaps xmm7,XMMWORD[((-144))+rbp] - movaps XMMWORD[(-144)+rbp],xmm0 - movaps xmm8,XMMWORD[((-128))+rbp] - movaps XMMWORD[(-128)+rbp],xmm0 - movaps xmm9,XMMWORD[((-112))+rbp] - movaps XMMWORD[(-112)+rbp],xmm0 - movaps xmm10,XMMWORD[((-96))+rbp] - movaps XMMWORD[(-96)+rbp],xmm0 - movaps xmm11,XMMWORD[((-80))+rbp] - movaps XMMWORD[(-80)+rbp],xmm0 - movaps xmm12,XMMWORD[((-64))+rbp] - movaps XMMWORD[(-64)+rbp],xmm0 - movaps xmm13,XMMWORD[((-48))+rbp] - movaps XMMWORD[(-48)+rbp],xmm0 - movaps xmm14,XMMWORD[((-32))+rbp] - movaps XMMWORD[(-32)+rbp],xmm0 - movaps xmm15,XMMWORD[((-16))+rbp] - movaps XMMWORD[(-16)+rbp],xmm0 + movaps xmm6,XMMWORD[((-168))+r11] + movaps XMMWORD[(-168)+r11],xmm0 + movaps xmm7,XMMWORD[((-152))+r11] + movaps XMMWORD[(-152)+r11],xmm0 + movaps xmm8,XMMWORD[((-136))+r11] + movaps XMMWORD[(-136)+r11],xmm0 + movaps xmm9,XMMWORD[((-120))+r11] + movaps XMMWORD[(-120)+r11],xmm0 + movaps xmm10,XMMWORD[((-104))+r11] + movaps XMMWORD[(-104)+r11],xmm0 + movaps xmm11,XMMWORD[((-88))+r11] + movaps XMMWORD[(-88)+r11],xmm0 + movaps xmm12,XMMWORD[((-72))+r11] + movaps XMMWORD[(-72)+r11],xmm0 + movaps xmm13,XMMWORD[((-56))+r11] + movaps XMMWORD[(-56)+r11],xmm0 + movaps xmm14,XMMWORD[((-40))+r11] + movaps XMMWORD[(-40)+r11],xmm0 + movaps xmm15,XMMWORD[((-24))+r11] + movaps XMMWORD[(-24)+r11],xmm0 movaps XMMWORD[rsp],xmm0 movaps XMMWORD[16+rsp],xmm0 movaps XMMWORD[32+rsp],xmm0 @@ -2722,13 +2719,901 @@ $L$xts_dec_ret: movaps XMMWORD[64+rsp],xmm0 movaps XMMWORD[80+rsp],xmm0 movaps XMMWORD[96+rsp],xmm0 - lea rsp,[rbp] - pop rbp + mov rbp,QWORD[((-8))+r11] + lea rsp,[r11] $L$xts_dec_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_aesni_xts_decrypt: +global aesni_ocb_encrypt + +ALIGN 32 +aesni_ocb_encrypt: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aesni_ocb_encrypt: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + lea rax,[rsp] + push rbx + push rbp + push r12 + push r13 + push r14 + lea rsp,[((-160))+rsp] + movaps XMMWORD[rsp],xmm6 + movaps XMMWORD[16+rsp],xmm7 + movaps XMMWORD[32+rsp],xmm8 + movaps XMMWORD[48+rsp],xmm9 + movaps XMMWORD[64+rsp],xmm10 + movaps XMMWORD[80+rsp],xmm11 + movaps XMMWORD[96+rsp],xmm12 + movaps XMMWORD[112+rsp],xmm13 + movaps XMMWORD[128+rsp],xmm14 + movaps XMMWORD[144+rsp],xmm15 +$L$ocb_enc_body: + mov rbx,QWORD[56+rax] + mov rbp,QWORD[((56+8))+rax] + + mov r10d,DWORD[240+rcx] + mov r11,rcx + shl r10d,4 + movups xmm9,XMMWORD[rcx] + movups xmm1,XMMWORD[16+r10*1+rcx] + + movdqu xmm15,XMMWORD[r9] + pxor xmm9,xmm1 + pxor xmm15,xmm1 + + mov eax,16+32 + lea rcx,[32+r10*1+r11] + movups xmm1,XMMWORD[16+r11] + sub rax,r10 + mov r10,rax + + movdqu xmm10,XMMWORD[rbx] + movdqu xmm8,XMMWORD[rbp] + + test r8,1 + jnz NEAR $L$ocb_enc_odd + + bsf r12,r8 + add r8,1 + shl r12,4 + movdqu xmm7,XMMWORD[r12*1+rbx] + movdqu xmm2,XMMWORD[rdi] + lea rdi,[16+rdi] + + call __ocb_encrypt1 + + movdqa xmm15,xmm7 + movups 
XMMWORD[rsi],xmm2 + lea rsi,[16+rsi] + sub rdx,1 + jz NEAR $L$ocb_enc_done + +$L$ocb_enc_odd: + lea r12,[1+r8] + lea r13,[3+r8] + lea r14,[5+r8] + lea r8,[6+r8] + bsf r12,r12 + bsf r13,r13 + bsf r14,r14 + shl r12,4 + shl r13,4 + shl r14,4 + + sub rdx,6 + jc NEAR $L$ocb_enc_short + jmp NEAR $L$ocb_enc_grandloop + +ALIGN 32 +$L$ocb_enc_grandloop: + movdqu xmm2,XMMWORD[rdi] + movdqu xmm3,XMMWORD[16+rdi] + movdqu xmm4,XMMWORD[32+rdi] + movdqu xmm5,XMMWORD[48+rdi] + movdqu xmm6,XMMWORD[64+rdi] + movdqu xmm7,XMMWORD[80+rdi] + lea rdi,[96+rdi] + + call __ocb_encrypt6 + + movups XMMWORD[rsi],xmm2 + movups XMMWORD[16+rsi],xmm3 + movups XMMWORD[32+rsi],xmm4 + movups XMMWORD[48+rsi],xmm5 + movups XMMWORD[64+rsi],xmm6 + movups XMMWORD[80+rsi],xmm7 + lea rsi,[96+rsi] + sub rdx,6 + jnc NEAR $L$ocb_enc_grandloop + +$L$ocb_enc_short: + add rdx,6 + jz NEAR $L$ocb_enc_done + + movdqu xmm2,XMMWORD[rdi] + cmp rdx,2 + jb NEAR $L$ocb_enc_one + movdqu xmm3,XMMWORD[16+rdi] + je NEAR $L$ocb_enc_two + + movdqu xmm4,XMMWORD[32+rdi] + cmp rdx,4 + jb NEAR $L$ocb_enc_three + movdqu xmm5,XMMWORD[48+rdi] + je NEAR $L$ocb_enc_four + + movdqu xmm6,XMMWORD[64+rdi] + pxor xmm7,xmm7 + + call __ocb_encrypt6 + + movdqa xmm15,xmm14 + movups XMMWORD[rsi],xmm2 + movups XMMWORD[16+rsi],xmm3 + movups XMMWORD[32+rsi],xmm4 + movups XMMWORD[48+rsi],xmm5 + movups XMMWORD[64+rsi],xmm6 + + jmp NEAR $L$ocb_enc_done + +ALIGN 16 +$L$ocb_enc_one: + movdqa xmm7,xmm10 + + call __ocb_encrypt1 + + movdqa xmm15,xmm7 + movups XMMWORD[rsi],xmm2 + jmp NEAR $L$ocb_enc_done + +ALIGN 16 +$L$ocb_enc_two: + pxor xmm4,xmm4 + pxor xmm5,xmm5 + + call __ocb_encrypt4 + + movdqa xmm15,xmm11 + movups XMMWORD[rsi],xmm2 + movups XMMWORD[16+rsi],xmm3 + + jmp NEAR $L$ocb_enc_done + +ALIGN 16 +$L$ocb_enc_three: + pxor xmm5,xmm5 + + call __ocb_encrypt4 + + movdqa xmm15,xmm12 + movups XMMWORD[rsi],xmm2 + movups XMMWORD[16+rsi],xmm3 + movups XMMWORD[32+rsi],xmm4 + + jmp NEAR $L$ocb_enc_done + +ALIGN 16 +$L$ocb_enc_four: + call __ocb_encrypt4 + + movdqa xmm15,xmm13 + movups XMMWORD[rsi],xmm2 + movups XMMWORD[16+rsi],xmm3 + movups XMMWORD[32+rsi],xmm4 + movups XMMWORD[48+rsi],xmm5 + +$L$ocb_enc_done: + pxor xmm15,xmm0 + movdqu XMMWORD[rbp],xmm8 + movdqu XMMWORD[r9],xmm15 + + xorps xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movaps xmm6,XMMWORD[rsp] + movaps XMMWORD[rsp],xmm0 + movaps xmm7,XMMWORD[16+rsp] + movaps XMMWORD[16+rsp],xmm0 + movaps xmm8,XMMWORD[32+rsp] + movaps XMMWORD[32+rsp],xmm0 + movaps xmm9,XMMWORD[48+rsp] + movaps XMMWORD[48+rsp],xmm0 + movaps xmm10,XMMWORD[64+rsp] + movaps XMMWORD[64+rsp],xmm0 + movaps xmm11,XMMWORD[80+rsp] + movaps XMMWORD[80+rsp],xmm0 + movaps xmm12,XMMWORD[96+rsp] + movaps XMMWORD[96+rsp],xmm0 + movaps xmm13,XMMWORD[112+rsp] + movaps XMMWORD[112+rsp],xmm0 + movaps xmm14,XMMWORD[128+rsp] + movaps XMMWORD[128+rsp],xmm0 + movaps xmm15,XMMWORD[144+rsp] + movaps XMMWORD[144+rsp],xmm0 + lea rax,[((160+40))+rsp] +$L$ocb_enc_pop: + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] +$L$ocb_enc_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_aesni_ocb_encrypt: + + +ALIGN 32 +__ocb_encrypt6: + pxor xmm15,xmm9 + movdqu xmm11,XMMWORD[r12*1+rbx] + movdqa xmm12,xmm10 + movdqu xmm13,XMMWORD[r13*1+rbx] + movdqa xmm14,xmm10 + pxor xmm10,xmm15 + movdqu xmm15,XMMWORD[r14*1+rbx] + pxor xmm11,xmm10 + pxor xmm8,xmm2 + pxor xmm2,xmm10 + pxor 
xmm12,xmm11 + pxor xmm8,xmm3 + pxor xmm3,xmm11 + pxor xmm13,xmm12 + pxor xmm8,xmm4 + pxor xmm4,xmm12 + pxor xmm14,xmm13 + pxor xmm8,xmm5 + pxor xmm5,xmm13 + pxor xmm15,xmm14 + pxor xmm8,xmm6 + pxor xmm6,xmm14 + pxor xmm8,xmm7 + pxor xmm7,xmm15 + movups xmm0,XMMWORD[32+r11] + + lea r12,[1+r8] + lea r13,[3+r8] + lea r14,[5+r8] + add r8,6 + pxor xmm10,xmm9 + bsf r12,r12 + bsf r13,r13 + bsf r14,r14 + +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 + pxor xmm11,xmm9 + pxor xmm12,xmm9 +DB 102,15,56,220,241 + pxor xmm13,xmm9 + pxor xmm14,xmm9 +DB 102,15,56,220,249 + movups xmm1,XMMWORD[48+r11] + pxor xmm15,xmm9 + +DB 102,15,56,220,208 +DB 102,15,56,220,216 +DB 102,15,56,220,224 +DB 102,15,56,220,232 +DB 102,15,56,220,240 +DB 102,15,56,220,248 + movups xmm0,XMMWORD[64+r11] + shl r12,4 + shl r13,4 + jmp NEAR $L$ocb_enc_loop6 + +ALIGN 32 +$L$ocb_enc_loop6: +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 +DB 102,15,56,220,241 +DB 102,15,56,220,249 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + +DB 102,15,56,220,208 +DB 102,15,56,220,216 +DB 102,15,56,220,224 +DB 102,15,56,220,232 +DB 102,15,56,220,240 +DB 102,15,56,220,248 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$ocb_enc_loop6 + +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 +DB 102,15,56,220,241 +DB 102,15,56,220,249 + movups xmm1,XMMWORD[16+r11] + shl r14,4 + +DB 102,65,15,56,221,210 + movdqu xmm10,XMMWORD[rbx] + mov rax,r10 +DB 102,65,15,56,221,219 +DB 102,65,15,56,221,228 +DB 102,65,15,56,221,237 +DB 102,65,15,56,221,246 +DB 102,65,15,56,221,255 + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +__ocb_encrypt4: + pxor xmm15,xmm9 + movdqu xmm11,XMMWORD[r12*1+rbx] + movdqa xmm12,xmm10 + movdqu xmm13,XMMWORD[r13*1+rbx] + pxor xmm10,xmm15 + pxor xmm11,xmm10 + pxor xmm8,xmm2 + pxor xmm2,xmm10 + pxor xmm12,xmm11 + pxor xmm8,xmm3 + pxor xmm3,xmm11 + pxor xmm13,xmm12 + pxor xmm8,xmm4 + pxor xmm4,xmm12 + pxor xmm8,xmm5 + pxor xmm5,xmm13 + movups xmm0,XMMWORD[32+r11] + + pxor xmm10,xmm9 + pxor xmm11,xmm9 + pxor xmm12,xmm9 + pxor xmm13,xmm9 + +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 + movups xmm1,XMMWORD[48+r11] + +DB 102,15,56,220,208 +DB 102,15,56,220,216 +DB 102,15,56,220,224 +DB 102,15,56,220,232 + movups xmm0,XMMWORD[64+r11] + jmp NEAR $L$ocb_enc_loop4 + +ALIGN 32 +$L$ocb_enc_loop4: +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + +DB 102,15,56,220,208 +DB 102,15,56,220,216 +DB 102,15,56,220,224 +DB 102,15,56,220,232 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$ocb_enc_loop4 + +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 + movups xmm1,XMMWORD[16+r11] + mov rax,r10 + +DB 102,65,15,56,221,210 +DB 102,65,15,56,221,219 +DB 102,65,15,56,221,228 +DB 102,65,15,56,221,237 + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +__ocb_encrypt1: + pxor xmm7,xmm15 + pxor xmm7,xmm9 + pxor xmm8,xmm2 + pxor xmm2,xmm7 + movups xmm0,XMMWORD[32+r11] + +DB 102,15,56,220,209 + movups xmm1,XMMWORD[48+r11] + pxor xmm7,xmm9 + +DB 102,15,56,220,208 + movups xmm0,XMMWORD[64+r11] + jmp NEAR $L$ocb_enc_loop1 + +ALIGN 32 +$L$ocb_enc_loop1: +DB 102,15,56,220,209 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + +DB 102,15,56,220,208 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$ocb_enc_loop1 + +DB 102,15,56,220,209 + movups xmm1,XMMWORD[16+r11] + mov rax,r10 + +DB 102,15,56,221,215 + 
DB 0F3h,0C3h ;repret + + +global aesni_ocb_decrypt + +ALIGN 32 +aesni_ocb_decrypt: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aesni_ocb_decrypt: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + lea rax,[rsp] + push rbx + push rbp + push r12 + push r13 + push r14 + lea rsp,[((-160))+rsp] + movaps XMMWORD[rsp],xmm6 + movaps XMMWORD[16+rsp],xmm7 + movaps XMMWORD[32+rsp],xmm8 + movaps XMMWORD[48+rsp],xmm9 + movaps XMMWORD[64+rsp],xmm10 + movaps XMMWORD[80+rsp],xmm11 + movaps XMMWORD[96+rsp],xmm12 + movaps XMMWORD[112+rsp],xmm13 + movaps XMMWORD[128+rsp],xmm14 + movaps XMMWORD[144+rsp],xmm15 +$L$ocb_dec_body: + mov rbx,QWORD[56+rax] + mov rbp,QWORD[((56+8))+rax] + + mov r10d,DWORD[240+rcx] + mov r11,rcx + shl r10d,4 + movups xmm9,XMMWORD[rcx] + movups xmm1,XMMWORD[16+r10*1+rcx] + + movdqu xmm15,XMMWORD[r9] + pxor xmm9,xmm1 + pxor xmm15,xmm1 + + mov eax,16+32 + lea rcx,[32+r10*1+r11] + movups xmm1,XMMWORD[16+r11] + sub rax,r10 + mov r10,rax + + movdqu xmm10,XMMWORD[rbx] + movdqu xmm8,XMMWORD[rbp] + + test r8,1 + jnz NEAR $L$ocb_dec_odd + + bsf r12,r8 + add r8,1 + shl r12,4 + movdqu xmm7,XMMWORD[r12*1+rbx] + movdqu xmm2,XMMWORD[rdi] + lea rdi,[16+rdi] + + call __ocb_decrypt1 + + movdqa xmm15,xmm7 + movups XMMWORD[rsi],xmm2 + xorps xmm8,xmm2 + lea rsi,[16+rsi] + sub rdx,1 + jz NEAR $L$ocb_dec_done + +$L$ocb_dec_odd: + lea r12,[1+r8] + lea r13,[3+r8] + lea r14,[5+r8] + lea r8,[6+r8] + bsf r12,r12 + bsf r13,r13 + bsf r14,r14 + shl r12,4 + shl r13,4 + shl r14,4 + + sub rdx,6 + jc NEAR $L$ocb_dec_short + jmp NEAR $L$ocb_dec_grandloop + +ALIGN 32 +$L$ocb_dec_grandloop: + movdqu xmm2,XMMWORD[rdi] + movdqu xmm3,XMMWORD[16+rdi] + movdqu xmm4,XMMWORD[32+rdi] + movdqu xmm5,XMMWORD[48+rdi] + movdqu xmm6,XMMWORD[64+rdi] + movdqu xmm7,XMMWORD[80+rdi] + lea rdi,[96+rdi] + + call __ocb_decrypt6 + + movups XMMWORD[rsi],xmm2 + pxor xmm8,xmm2 + movups XMMWORD[16+rsi],xmm3 + pxor xmm8,xmm3 + movups XMMWORD[32+rsi],xmm4 + pxor xmm8,xmm4 + movups XMMWORD[48+rsi],xmm5 + pxor xmm8,xmm5 + movups XMMWORD[64+rsi],xmm6 + pxor xmm8,xmm6 + movups XMMWORD[80+rsi],xmm7 + pxor xmm8,xmm7 + lea rsi,[96+rsi] + sub rdx,6 + jnc NEAR $L$ocb_dec_grandloop + +$L$ocb_dec_short: + add rdx,6 + jz NEAR $L$ocb_dec_done + + movdqu xmm2,XMMWORD[rdi] + cmp rdx,2 + jb NEAR $L$ocb_dec_one + movdqu xmm3,XMMWORD[16+rdi] + je NEAR $L$ocb_dec_two + + movdqu xmm4,XMMWORD[32+rdi] + cmp rdx,4 + jb NEAR $L$ocb_dec_three + movdqu xmm5,XMMWORD[48+rdi] + je NEAR $L$ocb_dec_four + + movdqu xmm6,XMMWORD[64+rdi] + pxor xmm7,xmm7 + + call __ocb_decrypt6 + + movdqa xmm15,xmm14 + movups XMMWORD[rsi],xmm2 + pxor xmm8,xmm2 + movups XMMWORD[16+rsi],xmm3 + pxor xmm8,xmm3 + movups XMMWORD[32+rsi],xmm4 + pxor xmm8,xmm4 + movups XMMWORD[48+rsi],xmm5 + pxor xmm8,xmm5 + movups XMMWORD[64+rsi],xmm6 + pxor xmm8,xmm6 + + jmp NEAR $L$ocb_dec_done + +ALIGN 16 +$L$ocb_dec_one: + movdqa xmm7,xmm10 + + call __ocb_decrypt1 + + movdqa xmm15,xmm7 + movups XMMWORD[rsi],xmm2 + xorps xmm8,xmm2 + jmp NEAR $L$ocb_dec_done + +ALIGN 16 +$L$ocb_dec_two: + pxor xmm4,xmm4 + pxor xmm5,xmm5 + + call __ocb_decrypt4 + + movdqa xmm15,xmm11 + movups XMMWORD[rsi],xmm2 + xorps xmm8,xmm2 + movups XMMWORD[16+rsi],xmm3 + xorps xmm8,xmm3 + + jmp NEAR $L$ocb_dec_done + +ALIGN 16 +$L$ocb_dec_three: + pxor xmm5,xmm5 + + call __ocb_decrypt4 + + movdqa xmm15,xmm12 + movups XMMWORD[rsi],xmm2 + xorps xmm8,xmm2 + movups XMMWORD[16+rsi],xmm3 + xorps xmm8,xmm3 + movups XMMWORD[32+rsi],xmm4 + xorps xmm8,xmm4 + + 
jmp NEAR $L$ocb_dec_done + +ALIGN 16 +$L$ocb_dec_four: + call __ocb_decrypt4 + + movdqa xmm15,xmm13 + movups XMMWORD[rsi],xmm2 + pxor xmm8,xmm2 + movups XMMWORD[16+rsi],xmm3 + pxor xmm8,xmm3 + movups XMMWORD[32+rsi],xmm4 + pxor xmm8,xmm4 + movups XMMWORD[48+rsi],xmm5 + pxor xmm8,xmm5 + +$L$ocb_dec_done: + pxor xmm15,xmm0 + movdqu XMMWORD[rbp],xmm8 + movdqu XMMWORD[r9],xmm15 + + xorps xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movaps xmm6,XMMWORD[rsp] + movaps XMMWORD[rsp],xmm0 + movaps xmm7,XMMWORD[16+rsp] + movaps XMMWORD[16+rsp],xmm0 + movaps xmm8,XMMWORD[32+rsp] + movaps XMMWORD[32+rsp],xmm0 + movaps xmm9,XMMWORD[48+rsp] + movaps XMMWORD[48+rsp],xmm0 + movaps xmm10,XMMWORD[64+rsp] + movaps XMMWORD[64+rsp],xmm0 + movaps xmm11,XMMWORD[80+rsp] + movaps XMMWORD[80+rsp],xmm0 + movaps xmm12,XMMWORD[96+rsp] + movaps XMMWORD[96+rsp],xmm0 + movaps xmm13,XMMWORD[112+rsp] + movaps XMMWORD[112+rsp],xmm0 + movaps xmm14,XMMWORD[128+rsp] + movaps XMMWORD[128+rsp],xmm0 + movaps xmm15,XMMWORD[144+rsp] + movaps XMMWORD[144+rsp],xmm0 + lea rax,[((160+40))+rsp] +$L$ocb_dec_pop: + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] +$L$ocb_dec_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_aesni_ocb_decrypt: + + +ALIGN 32 +__ocb_decrypt6: + pxor xmm15,xmm9 + movdqu xmm11,XMMWORD[r12*1+rbx] + movdqa xmm12,xmm10 + movdqu xmm13,XMMWORD[r13*1+rbx] + movdqa xmm14,xmm10 + pxor xmm10,xmm15 + movdqu xmm15,XMMWORD[r14*1+rbx] + pxor xmm11,xmm10 + pxor xmm2,xmm10 + pxor xmm12,xmm11 + pxor xmm3,xmm11 + pxor xmm13,xmm12 + pxor xmm4,xmm12 + pxor xmm14,xmm13 + pxor xmm5,xmm13 + pxor xmm15,xmm14 + pxor xmm6,xmm14 + pxor xmm7,xmm15 + movups xmm0,XMMWORD[32+r11] + + lea r12,[1+r8] + lea r13,[3+r8] + lea r14,[5+r8] + add r8,6 + pxor xmm10,xmm9 + bsf r12,r12 + bsf r13,r13 + bsf r14,r14 + +DB 102,15,56,222,209 +DB 102,15,56,222,217 +DB 102,15,56,222,225 +DB 102,15,56,222,233 + pxor xmm11,xmm9 + pxor xmm12,xmm9 +DB 102,15,56,222,241 + pxor xmm13,xmm9 + pxor xmm14,xmm9 +DB 102,15,56,222,249 + movups xmm1,XMMWORD[48+r11] + pxor xmm15,xmm9 + +DB 102,15,56,222,208 +DB 102,15,56,222,216 +DB 102,15,56,222,224 +DB 102,15,56,222,232 +DB 102,15,56,222,240 +DB 102,15,56,222,248 + movups xmm0,XMMWORD[64+r11] + shl r12,4 + shl r13,4 + jmp NEAR $L$ocb_dec_loop6 + +ALIGN 32 +$L$ocb_dec_loop6: +DB 102,15,56,222,209 +DB 102,15,56,222,217 +DB 102,15,56,222,225 +DB 102,15,56,222,233 +DB 102,15,56,222,241 +DB 102,15,56,222,249 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + +DB 102,15,56,222,208 +DB 102,15,56,222,216 +DB 102,15,56,222,224 +DB 102,15,56,222,232 +DB 102,15,56,222,240 +DB 102,15,56,222,248 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$ocb_dec_loop6 + +DB 102,15,56,222,209 +DB 102,15,56,222,217 +DB 102,15,56,222,225 +DB 102,15,56,222,233 +DB 102,15,56,222,241 +DB 102,15,56,222,249 + movups xmm1,XMMWORD[16+r11] + shl r14,4 + +DB 102,65,15,56,223,210 + movdqu xmm10,XMMWORD[rbx] + mov rax,r10 +DB 102,65,15,56,223,219 +DB 102,65,15,56,223,228 +DB 102,65,15,56,223,237 +DB 102,65,15,56,223,246 +DB 102,65,15,56,223,255 + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +__ocb_decrypt4: + pxor xmm15,xmm9 + movdqu xmm11,XMMWORD[r12*1+rbx] + movdqa xmm12,xmm10 + movdqu xmm13,XMMWORD[r13*1+rbx] + pxor xmm10,xmm15 + pxor xmm11,xmm10 + pxor xmm2,xmm10 + pxor xmm12,xmm11 + pxor xmm3,xmm11 + pxor xmm13,xmm12 + pxor xmm4,xmm12 + pxor 
xmm5,xmm13 + movups xmm0,XMMWORD[32+r11] + + pxor xmm10,xmm9 + pxor xmm11,xmm9 + pxor xmm12,xmm9 + pxor xmm13,xmm9 + +DB 102,15,56,222,209 +DB 102,15,56,222,217 +DB 102,15,56,222,225 +DB 102,15,56,222,233 + movups xmm1,XMMWORD[48+r11] + +DB 102,15,56,222,208 +DB 102,15,56,222,216 +DB 102,15,56,222,224 +DB 102,15,56,222,232 + movups xmm0,XMMWORD[64+r11] + jmp NEAR $L$ocb_dec_loop4 + +ALIGN 32 +$L$ocb_dec_loop4: +DB 102,15,56,222,209 +DB 102,15,56,222,217 +DB 102,15,56,222,225 +DB 102,15,56,222,233 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + +DB 102,15,56,222,208 +DB 102,15,56,222,216 +DB 102,15,56,222,224 +DB 102,15,56,222,232 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$ocb_dec_loop4 + +DB 102,15,56,222,209 +DB 102,15,56,222,217 +DB 102,15,56,222,225 +DB 102,15,56,222,233 + movups xmm1,XMMWORD[16+r11] + mov rax,r10 + +DB 102,65,15,56,223,210 +DB 102,65,15,56,223,219 +DB 102,65,15,56,223,228 +DB 102,65,15,56,223,237 + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +__ocb_decrypt1: + pxor xmm7,xmm15 + pxor xmm7,xmm9 + pxor xmm2,xmm7 + movups xmm0,XMMWORD[32+r11] + +DB 102,15,56,222,209 + movups xmm1,XMMWORD[48+r11] + pxor xmm7,xmm9 + +DB 102,15,56,222,208 + movups xmm0,XMMWORD[64+r11] + jmp NEAR $L$ocb_dec_loop1 + +ALIGN 32 +$L$ocb_dec_loop1: +DB 102,15,56,222,209 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + +DB 102,15,56,222,208 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$ocb_dec_loop1 + +DB 102,15,56,222,209 + movups xmm1,XMMWORD[16+r11] + mov rax,r10 + +DB 102,15,56,223,215 + DB 0F3h,0C3h ;repret + global aesni_cbc_encrypt ALIGN 16 @@ -2837,7 +3722,7 @@ DB 102,15,56,223,209 jmp NEAR $L$cbc_ret ALIGN 16 $L$cbc_decrypt_bulk: - lea rax,[rsp] + lea r11,[rsp] push rbp sub rsp,176 and rsp,-16 @@ -2852,7 +3737,7 @@ $L$cbc_decrypt_bulk: movaps XMMWORD[144+rsp],xmm14 movaps XMMWORD[160+rsp],xmm15 $L$cbc_decrypt_body: - lea rbp,[((-8))+rax] + mov rbp,rcx movups xmm10,XMMWORD[r8] mov eax,r10d cmp rdx,0x50 @@ -2892,7 +3777,7 @@ $L$cbc_dec_loop8_enter: pxor xmm3,xmm0 movups xmm1,XMMWORD[((16-112))+rcx] pxor xmm4,xmm0 - xor r11,r11 + mov rbp,-1 cmp rdx,0x70 pxor xmm5,xmm0 pxor xmm6,xmm0 @@ -2908,10 +3793,10 @@ DB 102,15,56,222,233 DB 102,15,56,222,241 DB 102,15,56,222,249 DB 102,68,15,56,222,193 - setnc r11b - shl r11,7 + adc rbp,0 + and rbp,128 DB 102,68,15,56,222,201 - add r11,rdi + add rbp,rdi movups xmm1,XMMWORD[((48-112))+rcx] DB 102,15,56,222,208 DB 102,15,56,222,216 @@ -3049,18 +3934,18 @@ DB 102,65,15,56,223,219 movdqu xmm0,XMMWORD[112+rdi] DB 102,65,15,56,223,228 lea rdi,[128+rdi] - movdqu xmm11,XMMWORD[r11] + movdqu xmm11,XMMWORD[rbp] DB 102,65,15,56,223,237 DB 102,65,15,56,223,246 - movdqu xmm12,XMMWORD[16+r11] - movdqu xmm13,XMMWORD[32+r11] + movdqu xmm12,XMMWORD[16+rbp] + movdqu xmm13,XMMWORD[32+rbp] DB 102,65,15,56,223,255 DB 102,68,15,56,223,193 - movdqu xmm14,XMMWORD[48+r11] - movdqu xmm15,XMMWORD[64+r11] + movdqu xmm14,XMMWORD[48+rbp] + movdqu xmm15,XMMWORD[64+rbp] DB 102,69,15,56,223,202 movdqa xmm10,xmm0 - movdqu xmm1,XMMWORD[80+r11] + movdqu xmm1,XMMWORD[80+rbp] movups xmm0,XMMWORD[((-112))+rcx] movups XMMWORD[rsi],xmm2 @@ -3179,7 +4064,7 @@ $L$cbc_dec_loop6_enter: pxor xmm5,xmm13 movdqu XMMWORD[32+rsi],xmm4 pxor xmm6,xmm14 - mov rcx,r11 + mov rcx,rbp movdqu XMMWORD[48+rsi],xmm5 pxor xmm7,xmm15 mov eax,r10d @@ -3348,8 +4233,8 @@ $L$cbc_dec_ret: movaps XMMWORD[144+rsp],xmm0 movaps xmm15,XMMWORD[160+rsp] movaps XMMWORD[160+rsp],xmm0 - lea rsp,[rbp] - pop rbp + mov rbp,QWORD[((-8))+r11] + lea rsp,[r11] $L$cbc_ret: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov 
rsi,QWORD[16+rsp] @@ -3865,13 +4750,75 @@ ctr_xts_se_handler: cmp rbx,r10 jae NEAR $L$common_seh_tail - mov rax,QWORD[160+r8] - lea rsi,[((-160))+rax] + mov rax,QWORD[208+r8] + + lea rsi,[((-168))+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + + mov rbp,QWORD[((-8))+rax] + mov QWORD[160+r8],rbp + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +ocb_se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov r10d,DWORD[8+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$ocb_no_xmm + + mov rax,QWORD[152+r8] + + lea rsi,[rax] lea rdi,[512+r8] mov ecx,20 DD 0xa548f3fc + lea rax,[((160+40))+rax] - jmp NEAR $L$common_rbp_tail +$L$ocb_no_xmm: + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + + jmp NEAR $L$common_seh_tail ALIGN 16 @@ -3894,9 +4841,13 @@ cbc_se_handler: cmp rbx,r10 jb NEAR $L$common_seh_tail + mov rax,QWORD[120+r8] + lea r10,[$L$cbc_decrypt_body] cmp rbx,r10 - jb NEAR $L$restore_cbc_rax + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] lea r10,[$L$cbc_ret] cmp rbx,r10 @@ -3907,15 +4858,10 @@ cbc_se_handler: mov ecx,20 DD 0xa548f3fc -$L$common_rbp_tail: - mov rax,QWORD[160+r8] - mov rbp,QWORD[rax] - lea rax,[8+rax] - mov QWORD[160+r8],rbp - jmp NEAR $L$common_seh_tail + mov rax,QWORD[208+r8] -$L$restore_cbc_rax: - mov rax,QWORD[120+r8] + mov rbp,QWORD[((-8))+rax] + mov QWORD[160+r8],rbp $L$common_seh_tail: mov rdi,QWORD[8+rax] @@ -3982,6 +4928,14 @@ ALIGN 4 DD $L$SEH_begin_aesni_xts_decrypt wrt ..imagebase DD $L$SEH_end_aesni_xts_decrypt wrt ..imagebase DD $L$SEH_info_xts_dec wrt ..imagebase + + DD $L$SEH_begin_aesni_ocb_encrypt wrt ..imagebase + DD $L$SEH_end_aesni_ocb_encrypt wrt ..imagebase + DD $L$SEH_info_ocb_enc wrt ..imagebase + + DD $L$SEH_begin_aesni_ocb_decrypt wrt ..imagebase + DD $L$SEH_end_aesni_ocb_decrypt wrt ..imagebase + DD $L$SEH_info_ocb_dec wrt ..imagebase DD $L$SEH_begin_aesni_cbc_encrypt wrt ..imagebase DD $L$SEH_end_aesni_cbc_encrypt wrt ..imagebase DD $L$SEH_info_cbc wrt ..imagebase @@ -4019,6 +4973,18 @@ $L$SEH_info_xts_dec: DB 9,0,0,0 DD ctr_xts_se_handler wrt ..imagebase DD $L$xts_dec_body wrt ..imagebase,$L$xts_dec_epilogue wrt ..imagebase +$L$SEH_info_ocb_enc: +DB 9,0,0,0 + DD ocb_se_handler wrt ..imagebase + DD $L$ocb_enc_body wrt ..imagebase,$L$ocb_enc_epilogue wrt ..imagebase + DD $L$ocb_enc_pop wrt ..imagebase + DD 0 +$L$SEH_info_ocb_dec: +DB 9,0,0,0 + DD ocb_se_handler wrt ..imagebase + DD $L$ocb_dec_body wrt ..imagebase,$L$ocb_dec_epilogue wrt ..imagebase + DD $L$ocb_dec_pop wrt ..imagebase + DD 0 $L$SEH_info_cbc: DB 9,0,0,0 DD cbc_se_handler wrt ..imagebase diff --git a/win-x86_64/crypto/aes/bsaes-x86_64.asm b/win-x86_64/crypto/aes/bsaes-x86_64.asm index 6d75248d..9c6d1293 100644 --- a/win-x86_64/crypto/aes/bsaes-x86_64.asm +++ b/win-x86_64/crypto/aes/bsaes-x86_64.asm @@ -1319,7 +1319,7 @@ $L$cbc_dec_bzero: cmp rbp,rax ja NEAR $L$cbc_dec_bzero - lea rsp,[rbp] + lea rax,[120+rbp] movaps xmm6,XMMWORD[64+rbp] movaps xmm7,XMMWORD[80+rbp] movaps xmm8,XMMWORD[96+rbp] @@ -1330,15 
+1330,15 @@ $L$cbc_dec_bzero: movaps xmm13,XMMWORD[176+rbp] movaps xmm14,XMMWORD[192+rbp] movaps xmm15,XMMWORD[208+rbp] - lea rsp,[160+rbp] - mov r15,QWORD[72+rsp] - mov r14,QWORD[80+rsp] - mov r13,QWORD[88+rsp] - mov r12,QWORD[96+rsp] - mov rbx,QWORD[104+rsp] - mov rax,QWORD[112+rsp] - lea rsp,[120+rsp] - mov rbp,rax + lea rax,[160+rax] +$L$cbc_dec_tail: + mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbx,QWORD[((-16))+rax] + mov rbp,QWORD[((-8))+rax] + lea rsp,[rax] $L$cbc_dec_epilogue: DB 0F3h,0C3h ;repret @@ -1543,7 +1543,7 @@ $L$ctr_enc_bzero: cmp rbp,rax ja NEAR $L$ctr_enc_bzero - lea rsp,[rbp] + lea rax,[120+rbp] movaps xmm6,XMMWORD[64+rbp] movaps xmm7,XMMWORD[80+rbp] movaps xmm8,XMMWORD[96+rbp] @@ -1554,15 +1554,15 @@ $L$ctr_enc_bzero: movaps xmm13,XMMWORD[176+rbp] movaps xmm14,XMMWORD[192+rbp] movaps xmm15,XMMWORD[208+rbp] - lea rsp,[160+rbp] - mov r15,QWORD[72+rsp] - mov r14,QWORD[80+rsp] - mov r13,QWORD[88+rsp] - mov r12,QWORD[96+rsp] - mov rbx,QWORD[104+rsp] - mov rax,QWORD[112+rsp] - lea rsp,[120+rsp] - mov rbp,rax + lea rax,[160+rax] +$L$ctr_enc_tail: + mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbx,QWORD[((-16))+rax] + mov rbp,QWORD[((-8))+rax] + lea rsp,[rax] $L$ctr_enc_epilogue: DB 0F3h,0C3h ;repret @@ -2019,7 +2019,7 @@ $L$xts_enc_bzero: cmp rbp,rax ja NEAR $L$xts_enc_bzero - lea rsp,[rbp] + lea rax,[120+rbp] movaps xmm6,XMMWORD[64+rbp] movaps xmm7,XMMWORD[80+rbp] movaps xmm8,XMMWORD[96+rbp] @@ -2030,15 +2030,15 @@ $L$xts_enc_bzero: movaps xmm13,XMMWORD[176+rbp] movaps xmm14,XMMWORD[192+rbp] movaps xmm15,XMMWORD[208+rbp] - lea rsp,[160+rbp] - mov r15,QWORD[72+rsp] - mov r14,QWORD[80+rsp] - mov r13,QWORD[88+rsp] - mov r12,QWORD[96+rsp] - mov rbx,QWORD[104+rsp] - mov rax,QWORD[112+rsp] - lea rsp,[120+rsp] - mov rbp,rax + lea rax,[160+rax] +$L$xts_enc_tail: + mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbx,QWORD[((-16))+rax] + mov rbp,QWORD[((-8))+rax] + lea rsp,[rax] $L$xts_enc_epilogue: DB 0F3h,0C3h ;repret @@ -2522,7 +2522,7 @@ $L$xts_dec_bzero: cmp rbp,rax ja NEAR $L$xts_dec_bzero - lea rsp,[rbp] + lea rax,[120+rbp] movaps xmm6,XMMWORD[64+rbp] movaps xmm7,XMMWORD[80+rbp] movaps xmm8,XMMWORD[96+rbp] @@ -2533,15 +2533,15 @@ $L$xts_dec_bzero: movaps xmm13,XMMWORD[176+rbp] movaps xmm14,XMMWORD[192+rbp] movaps xmm15,XMMWORD[208+rbp] - lea rsp,[160+rbp] - mov r15,QWORD[72+rsp] - mov r14,QWORD[80+rsp] - mov r13,QWORD[88+rsp] - mov r12,QWORD[96+rsp] - mov rbx,QWORD[104+rsp] - mov rax,QWORD[112+rsp] - lea rsp,[120+rsp] - mov rbp,rax + lea rax,[160+rax] +$L$xts_dec_tail: + mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbx,QWORD[((-16))+rax] + mov rbp,QWORD[((-8))+rax] + lea rsp,[rax] $L$xts_dec_epilogue: DB 0F3h,0C3h ;repret @@ -2628,30 +2628,33 @@ se_handler: mov r10d,DWORD[r11] lea r10,[r10*1+rsi] cmp rbx,r10 - jb NEAR $L$in_prologue - - mov rax,QWORD[152+r8] + jbe NEAR $L$in_prologue mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$in_prologue + mov r10d,DWORD[8+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$in_tail + mov rax,QWORD[160+r8] lea rsi,[64+rax] lea rdi,[512+r8] mov ecx,20 DD 0xa548f3fc - lea rax,[160+rax] - - mov rbp,QWORD[112+rax] - mov rbx,QWORD[104+rax] - mov r12,QWORD[96+rax] - mov r13,QWORD[88+rax] - mov r14,QWORD[80+rax] - mov 
r15,QWORD[72+rax] - lea rax,[120+rax] + lea rax,[((160+120))+rax] + +$L$in_tail: + mov rbp,QWORD[((-48))+rax] + mov rbx,QWORD[((-40))+rax] + mov r12,QWORD[((-32))+rax] + mov r13,QWORD[((-24))+rax] + mov r14,QWORD[((-16))+rax] + mov r15,QWORD[((-8))+rax] mov QWORD[144+r8],rbx mov QWORD[160+r8],rbp mov QWORD[216+r8],r12 @@ -2719,15 +2722,23 @@ $L$cbc_dec_info: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$cbc_dec_body wrt ..imagebase,$L$cbc_dec_epilogue wrt ..imagebase + DD $L$cbc_dec_tail wrt ..imagebase + DD 0 $L$ctr_enc_info: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$ctr_enc_body wrt ..imagebase,$L$ctr_enc_epilogue wrt ..imagebase + DD $L$ctr_enc_tail wrt ..imagebase + DD 0 $L$xts_enc_info: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$xts_enc_body wrt ..imagebase,$L$xts_enc_epilogue wrt ..imagebase + DD $L$xts_enc_tail wrt ..imagebase + DD 0 $L$xts_dec_info: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$xts_dec_body wrt ..imagebase,$L$xts_dec_epilogue wrt ..imagebase + DD $L$xts_dec_tail wrt ..imagebase + DD 0 diff --git a/win-x86_64/crypto/bn/x86_64-mont.asm b/win-x86_64/crypto/bn/x86_64-mont.asm index 4d8e1cb7..1a9da512 100644 --- a/win-x86_64/crypto/bn/x86_64-mont.asm +++ b/win-x86_64/crypto/bn/x86_64-mont.asm @@ -23,6 +23,10 @@ $L$SEH_begin_bn_mul_mont: mov r9,QWORD[48+rsp] + + mov r9d,r9d + mov rax,rsp + test r9d,3 jnz NEAR $L$mul_enter cmp r9d,8 @@ -36,20 +40,50 @@ $L$SEH_begin_bn_mul_mont: ALIGN 16 $L$mul_enter: push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - mov r9d,r9d - lea r10,[2+r9] + + neg r9 mov r11,rsp - neg r10 - lea rsp,[r10*8+rsp] - and rsp,-1024 + lea r10,[((-16))+r9*8+rsp] + neg r9 + and r10,-1024 + + + + + + + + + + sub r11,r10 + and r11,-4096 + lea rsp,[r11*1+r10] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk + jmp NEAR $L$mul_page_walk_done + +ALIGN 16 +$L$mul_page_walk: + lea rsp,[((-4096))+rsp] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk +$L$mul_page_walk_done: + + mov QWORD[8+r9*8+rsp],rax - mov QWORD[8+r9*8+rsp],r11 $L$mul_body: mov r12,rdx mov r8,QWORD[r8] @@ -201,33 +235,43 @@ $L$sub: sbb rax,QWORD[r14*8+rcx] sbb rax,0 xor r14,r14 + and rsi,rax + not rax + mov rcx,rdi + and rcx,rax mov r15,r9 + or rsi,rcx ALIGN 16 $L$copy: - mov rsi,QWORD[r14*8+rsp] - mov rcx,QWORD[r14*8+rdi] - xor rsi,rcx - and rsi,rax - xor rsi,rcx + mov rax,QWORD[r14*8+rsi] mov QWORD[r14*8+rsp],r14 - mov QWORD[r14*8+rdi],rsi + mov QWORD[r14*8+rdi],rax lea r14,[1+r14] sub r15,1 jnz NEAR $L$copy mov rsi,QWORD[8+r9*8+rsp] + mov rax,1 - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$mul_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_mul_mont: ALIGN 16 @@ -244,22 +288,47 @@ $L$SEH_begin_bn_mul4x_mont: mov r9,QWORD[48+rsp] + + mov r9d,r9d + mov rax,rsp + $L$mul4x_enter: push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - mov r9d,r9d - lea r10,[4+r9] + + neg r9 mov r11,rsp - neg r10 - lea rsp,[r10*8+rsp] - and rsp,-1024 + lea r10,[((-32))+r9*8+rsp] + neg r9 + and r10,-1024 + + sub r11,r10 + and r11,-4096 + lea rsp,[r11*1+r10] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul4x_page_walk + jmp NEAR $L$mul4x_page_walk_done + +$L$mul4x_page_walk: + lea 
rsp,[((-4096))+rsp] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul4x_page_walk +$L$mul4x_page_walk_done: + + mov QWORD[8+r9*8+rsp],rax - mov QWORD[8+r9*8+rsp],r11 $L$mul4x_body: mov QWORD[16+r9*8+rsp],rdi mov r12,rdx @@ -559,9 +628,11 @@ $L$inner4x: cmp r14,r9 jb NEAR $L$outer4x mov rdi,QWORD[16+r9*8+rsp] + lea r15,[((-4))+r9] mov rax,QWORD[rsp] + pxor xmm0,xmm0 mov rdx,QWORD[8+rsp] - shr r9,2 + shr r15,2 lea rsi,[rsp] xor r14,r14 @@ -569,7 +640,6 @@ $L$inner4x: mov rbx,QWORD[16+rsi] mov rbp,QWORD[24+rsi] sbb rdx,QWORD[8+rcx] - lea r15,[((-1))+r9] jmp NEAR $L$sub4x ALIGN 16 $L$sub4x: @@ -597,49 +667,57 @@ $L$sub4x: mov QWORD[16+r14*8+rdi],rbx sbb rax,0 -DB 66h, 48h, 0fh, 6eh, 0c0h - punpcklqdq xmm0,xmm0 mov QWORD[24+r14*8+rdi],rbp xor r14,r14 - - mov r15,r9 - pxor xmm5,xmm5 + and rsi,rax + not rax + mov rcx,rdi + and rcx,rax + lea r15,[((-4))+r9] + or rsi,rcx + shr r15,2 + + movdqu xmm1,XMMWORD[rsi] + movdqa XMMWORD[rsp],xmm0 + movdqu XMMWORD[rdi],xmm1 jmp NEAR $L$copy4x ALIGN 16 $L$copy4x: - movdqu xmm2,XMMWORD[r14*1+rsp] - movdqu xmm4,XMMWORD[16+r14*1+rsp] - movdqu xmm1,XMMWORD[r14*1+rdi] - movdqu xmm3,XMMWORD[16+r14*1+rdi] - pxor xmm2,xmm1 - pxor xmm4,xmm3 - pand xmm2,xmm0 - pand xmm4,xmm0 - pxor xmm2,xmm1 - pxor xmm4,xmm3 - movdqu XMMWORD[r14*1+rdi],xmm2 - movdqu XMMWORD[16+r14*1+rdi],xmm4 - movdqa XMMWORD[r14*1+rsp],xmm5 - movdqa XMMWORD[16+r14*1+rsp],xmm5 - + movdqu xmm2,XMMWORD[16+r14*1+rsi] + movdqu xmm1,XMMWORD[32+r14*1+rsi] + movdqa XMMWORD[16+r14*1+rsp],xmm0 + movdqu XMMWORD[16+r14*1+rdi],xmm2 + movdqa XMMWORD[32+r14*1+rsp],xmm0 + movdqu XMMWORD[32+r14*1+rdi],xmm1 lea r14,[32+r14] dec r15 jnz NEAR $L$copy4x - shl r9,2 + movdqu xmm2,XMMWORD[16+r14*1+rsi] + movdqa XMMWORD[16+r14*1+rsp],xmm0 + movdqu XMMWORD[16+r14*1+rdi],xmm2 mov rsi,QWORD[8+r9*8+rsp] + mov rax,1 - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$mul4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_mul4x_mont: EXTERN bn_sqr8x_internal @@ -658,15 +736,24 @@ $L$SEH_begin_bn_sqr8x_mont: mov r9,QWORD[48+rsp] -$L$sqr8x_enter: + mov rax,rsp + +$L$sqr8x_enter: push rbx + push rbp + push r12 + push r13 + push r14 + push r15 +$L$sqr8x_prologue: + mov r10d,r9d shl r9d,3 shl r10,3+2 @@ -678,30 +765,49 @@ $L$sqr8x_enter: lea r11,[((-64))+r9*2+rsp] + mov rbp,rsp mov r8,QWORD[r8] sub r11,rsi and r11,4095 cmp r10,r11 jb NEAR $L$sqr8x_sp_alt - sub rsp,r11 - lea rsp,[((-64))+r9*2+rsp] + sub rbp,r11 + lea rbp,[((-64))+r9*2+rbp] jmp NEAR $L$sqr8x_sp_done ALIGN 32 $L$sqr8x_sp_alt: lea r10,[((4096-64))+r9*2] - lea rsp,[((-64))+r9*2+rsp] + lea rbp,[((-64))+r9*2+rbp] sub r11,r10 mov r10,0 cmovc r11,r10 - sub rsp,r11 + sub rbp,r11 $L$sqr8x_sp_done: - and rsp,-64 + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$sqr8x_page_walk + jmp NEAR $L$sqr8x_page_walk_done + +ALIGN 16 +$L$sqr8x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$sqr8x_page_walk +$L$sqr8x_page_walk_done: + mov r10,r9 neg r9 mov QWORD[32+rsp],r8 mov QWORD[40+rsp],rax + $L$sqr8x_body: DB 102,72,15,110,209 @@ -748,6 +854,7 @@ DB 102,72,15,110,200 pxor xmm0,xmm0 pshufd xmm1,xmm1,0 mov 
rsi,QWORD[40+rsp] + jmp NEAR $L$sqr8x_cond_copy ALIGN 32 @@ -777,16 +884,24 @@ $L$sqr8x_cond_copy: mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$sqr8x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_sqr8x_mont: DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 DB 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 @@ -829,22 +944,8 @@ mul_handler: mov r10,QWORD[192+r8] mov rax,QWORD[8+r10*8+rax] - lea rax,[48+rax] - mov rbx,QWORD[((-8))+rax] - mov rbp,QWORD[((-16))+rax] - mov r12,QWORD[((-24))+rax] - mov r13,QWORD[((-32))+rax] - mov r14,QWORD[((-40))+rax] - mov r15,QWORD[((-48))+rax] - mov QWORD[144+r8],rbx - mov QWORD[160+r8],rbp - mov QWORD[216+r8],r12 - mov QWORD[224+r8],r13 - mov QWORD[232+r8],r14 - mov QWORD[240+r8],r15 - - jmp NEAR $L$common_seh_tail + jmp NEAR $L$common_pop_regs @@ -872,15 +973,21 @@ sqr_handler: cmp rbx,r10 jb NEAR $L$common_seh_tail + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_pop_regs + mov rax,QWORD[152+r8] - mov r10d,DWORD[4+r11] + mov r10d,DWORD[8+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail mov rax,QWORD[40+rax] +$L$common_pop_regs: mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] mov r12,QWORD[((-24))+rax] @@ -960,4 +1067,5 @@ DB 9,0,0,0 $L$SEH_info_bn_sqr8x_mont: DB 9,0,0,0 DD sqr_handler wrt ..imagebase - DD $L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase + DD $L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase +ALIGN 8 diff --git a/win-x86_64/crypto/bn/x86_64-mont5.asm b/win-x86_64/crypto/bn/x86_64-mont5.asm index 58f19ac2..b3306410 100644 --- a/win-x86_64/crypto/bn/x86_64-mont5.asm +++ b/win-x86_64/crypto/bn/x86_64-mont5.asm @@ -23,30 +23,64 @@ $L$SEH_begin_bn_mul_mont_gather5: mov r9,QWORD[48+rsp] + + mov r9d,r9d + mov rax,rsp + test r9d,7 jnz NEAR $L$mul_enter jmp NEAR $L$mul4x_enter ALIGN 16 $L$mul_enter: - mov r9d,r9d - mov rax,rsp movd xmm5,DWORD[56+rsp] - lea r10,[$L$inc] push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - lea r11,[2+r9] - neg r11 - lea rsp,[((-264))+r11*8+rsp] - and rsp,-1024 + neg r9 + mov r11,rsp + lea r10,[((-280))+r9*8+rsp] + neg r9 + and r10,-1024 + + + + + + + + + + sub r11,r10 + and r11,-4096 + lea rsp,[r11*1+r10] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk + jmp NEAR $L$mul_page_walk_done + +$L$mul_page_walk: + lea rsp,[((-4096))+rsp] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk +$L$mul_page_walk_done: + + lea r10,[$L$inc] mov QWORD[8+r9*8+rsp],rax + $L$mul_body: + lea r12,[128+rdx] movdqa xmm0,XMMWORD[r10] movdqa xmm1,XMMWORD[16+r10] @@ -385,34 +419,44 @@ $L$sub: sbb rax,QWORD[r14*8+rcx] sbb rax,0 xor r14,r14 + and rsi,rax + not rax + mov rcx,rdi + and rcx,rax mov r15,r9 + or rsi,rcx ALIGN 16 $L$copy: - mov rsi,QWORD[r14*8+rsp] - mov rcx,QWORD[r14*8+rdi] - xor rsi,rcx - and rsi,rax - xor rsi,rcx + mov rax,QWORD[r14*8+rsi] mov QWORD[r14*8+rsp],r14 - mov QWORD[r14*8+rdi],rsi + mov QWORD[r14*8+rdi],rax lea r14,[1+r14] sub r15,1 jnz NEAR $L$copy mov rsi,QWORD[8+r9*8+rsp] + mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$mul_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov 
rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_mul_mont_gather5: ALIGN 32 @@ -429,16 +473,25 @@ $L$SEH_begin_bn_mul4x_mont_gather5: mov r9,QWORD[48+rsp] -$L$mul4x_enter: + DB 0x67 mov rax,rsp + +$L$mul4x_enter: push rbx + push rbp + push r12 + push r13 + push r14 + push r15 +$L$mul4x_prologue: + DB 0x67 shl r9d,3 lea r10,[r9*2+r9] @@ -454,45 +507,72 @@ DB 0x67 lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp sub r11,rdi and r11,4095 cmp r10,r11 jb NEAR $L$mul4xsp_alt - sub rsp,r11 - lea rsp,[((-320))+r9*2+rsp] + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] jmp NEAR $L$mul4xsp_done ALIGN 32 $L$mul4xsp_alt: lea r10,[((4096-320))+r9*2] - lea rsp,[((-320))+r9*2+rsp] + lea rbp,[((-320))+r9*2+rbp] sub r11,r10 mov r10,0 cmovc r11,r10 - sub rsp,r11 + sub rbp,r11 $L$mul4xsp_done: - and rsp,-64 + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mul4x_page_walk + jmp NEAR $L$mul4x_page_walk_done + +$L$mul4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mul4x_page_walk +$L$mul4x_page_walk_done: + neg r9 mov QWORD[40+rsp],rax + $L$mul4x_body: call mul4x_internal mov rsi,QWORD[40+rsp] + mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$mul4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_mul4x_mont_gather5: @@ -1036,14 +1116,23 @@ $L$SEH_begin_bn_power5: mov r9,QWORD[48+rsp] + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 +$L$power5_prologue: + shl r9d,3 lea r10d,[r9*2+r9] neg r9 @@ -1057,24 +1146,41 @@ $L$SEH_begin_bn_power5: lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp sub r11,rdi and r11,4095 cmp r10,r11 jb NEAR $L$pwr_sp_alt - sub rsp,r11 - lea rsp,[((-320))+r9*2+rsp] + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] jmp NEAR $L$pwr_sp_done ALIGN 32 $L$pwr_sp_alt: lea r10,[((4096-320))+r9*2] - lea rsp,[((-320))+r9*2+rsp] + lea rbp,[((-320))+r9*2+rbp] sub r11,r10 mov r10,0 cmovc r11,r10 - sub rsp,r11 + sub rbp,r11 $L$pwr_sp_done: - and rsp,-64 + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwr_page_walk + jmp NEAR $L$pwr_page_walk_done + +$L$pwr_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwr_page_walk +$L$pwr_page_walk_done: + mov r10,r9 neg r9 @@ -1089,6 +1195,7 @@ $L$pwr_sp_done: mov QWORD[32+rsp],r8 mov QWORD[40+rsp],rax + $L$power5_body: DB 102,72,15,110,207 DB 102,72,15,110,209 @@ -1115,18 +1222,27 @@ DB 102,72,15,126,226 call mul4x_internal mov rsi,QWORD[40+rsp] + mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$power5_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_power5: global bn_sqr8x_internal @@ -1989,15 +2105,24 @@ $L$SEH_begin_bn_from_mont8x: mov r9,QWORD[48+rsp] + DB 0x67 mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 +$L$from_prologue: + shl r9d,3 lea r10,[r9*2+r9] neg r9 @@ -2011,24 +2136,41 @@ DB 0x67 lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp sub r11,rdi and r11,4095 cmp r10,r11 jb NEAR $L$from_sp_alt - sub rsp,r11 - lea rsp,[((-320))+r9*2+rsp] + sub rbp,r11 + lea 
rbp,[((-320))+r9*2+rbp] jmp NEAR $L$from_sp_done ALIGN 32 $L$from_sp_alt: lea r10,[((4096-320))+r9*2] - lea rsp,[((-320))+r9*2+rsp] + lea rbp,[((-320))+r9*2+rbp] sub r11,r10 mov r10,0 cmovc r11,r10 - sub rsp,r11 + sub rbp,r11 $L$from_sp_done: - and rsp,-64 + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$from_page_walk + jmp NEAR $L$from_page_walk_done + +$L$from_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$from_page_walk +$L$from_page_walk_done: + mov r10,r9 neg r9 @@ -2043,6 +2185,7 @@ $L$from_sp_done: mov QWORD[32+rsp],r8 mov QWORD[40+rsp],rax + $L$from_body: mov r11,r9 lea rax,[48+rsp] @@ -2078,11 +2221,12 @@ DB 102,73,15,110,218 pxor xmm0,xmm0 lea rax,[48+rsp] - mov rsi,QWORD[40+rsp] jmp NEAR $L$from_mont_zero ALIGN 32 $L$from_mont_zero: + mov rsi,QWORD[40+rsp] + movdqa XMMWORD[rax],xmm0 movdqa XMMWORD[16+rax],xmm0 movdqa XMMWORD[32+rax],xmm0 @@ -2093,16 +2237,24 @@ $L$from_mont_zero: mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$from_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_from_mont8x: global bn_scatter5 @@ -2321,9 +2473,14 @@ mul_handler: cmp rbx,r10 jb NEAR $L$common_seh_tail + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_pop_regs + mov rax,QWORD[152+r8] - mov r10d,DWORD[4+r11] + mov r10d,DWORD[8+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail @@ -2335,11 +2492,11 @@ mul_handler: mov r10,QWORD[192+r8] mov rax,QWORD[8+r10*8+rax] - jmp NEAR $L$body_proceed + jmp NEAR $L$common_pop_regs $L$body_40: mov rax,QWORD[40+rax] -$L$body_proceed: +$L$common_pop_regs: mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] mov r12,QWORD[((-24))+rax] @@ -2419,22 +2576,22 @@ ALIGN 8 $L$SEH_info_bn_mul_mont_gather5: DB 9,0,0,0 DD mul_handler wrt ..imagebase - DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase + DD $L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase ALIGN 8 $L$SEH_info_bn_mul4x_mont_gather5: DB 9,0,0,0 DD mul_handler wrt ..imagebase - DD $L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase + DD $L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase ALIGN 8 $L$SEH_info_bn_power5: DB 9,0,0,0 DD mul_handler wrt ..imagebase - DD $L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase + DD $L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase ALIGN 8 $L$SEH_info_bn_from_mont8x: DB 9,0,0,0 DD mul_handler wrt ..imagebase - DD $L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase + DD $L$from_prologue wrt ..imagebase,$L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase ALIGN 8 $L$SEH_info_bn_gather5: DB 0x01,0x0b,0x03,0x0a diff --git a/win-x86_64/crypto/chacha/chacha-x86_64.asm b/win-x86_64/crypto/chacha/chacha-x86_64.asm index afebd2e0..cb362468 100644 --- a/win-x86_64/crypto/chacha/chacha-x86_64.asm +++ b/win-x86_64/crypto/chacha/chacha-x86_64.asm @@ -27,6 +27,15 @@ DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe $L$sigma: DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107 DB 0 +ALIGN 64 +$L$zeroz: + DD 0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0 +$L$fourz: + DD 4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0 +$L$incz: + DD 
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +$L$sixteen: + DD 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32 DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115 @@ -59,6 +68,7 @@ $L$SEH_begin_ChaCha20_ctr32: push r14 push r15 sub rsp,64+24 +$L$ctr32_body: movdqu xmm1,XMMWORD[rcx] @@ -296,13 +306,14 @@ $L$oop_tail: jnz NEAR $L$oop_tail $L$done: - add rsp,64+24 - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx + lea rsi,[((64+24+48))+rsp] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$no_data: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -323,20 +334,15 @@ $L$SEH_begin_ChaCha20_ssse3: $L$ChaCha20_ssse3: + mov r9,rsp cmp rdx,128 ja NEAR $L$ChaCha20_4x $L$do_sse3_after_all: - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - - sub rsp,64+72 - movaps XMMWORD[(64+32)+rsp],xmm6 - movaps XMMWORD[(64+48)+rsp],xmm7 + sub rsp,64+40 + movaps XMMWORD[(-40)+r9],xmm6 + movaps XMMWORD[(-24)+r9],xmm7 +$L$ssse3_body: movdqa xmm0,XMMWORD[$L$sigma] movdqu xmm1,XMMWORD[rcx] movdqu xmm2,XMMWORD[16+rcx] @@ -348,7 +354,7 @@ $L$do_sse3_after_all: movdqa XMMWORD[16+rsp],xmm1 movdqa XMMWORD[32+rsp],xmm2 movdqa XMMWORD[48+rsp],xmm3 - mov ebp,10 + mov r8,10 jmp NEAR $L$oop_ssse3 ALIGN 32 @@ -358,7 +364,7 @@ $L$oop_outer_ssse3: movdqa xmm1,XMMWORD[16+rsp] movdqa xmm2,XMMWORD[32+rsp] paddd xmm3,XMMWORD[48+rsp] - mov ebp,10 + mov r8,10 movdqa XMMWORD[48+rsp],xmm3 jmp NEAR $L$oop_ssse3 @@ -407,7 +413,7 @@ DB 102,15,56,0,223 pshufd xmm2,xmm2,78 pshufd xmm1,xmm1,147 pshufd xmm3,xmm3,57 - dec ebp + dec r8 jnz NEAR $L$oop_ssse3 paddd xmm0,XMMWORD[rsp] paddd xmm1,XMMWORD[16+rsp] @@ -444,27 +450,22 @@ $L$tail_ssse3: movdqa XMMWORD[16+rsp],xmm1 movdqa XMMWORD[32+rsp],xmm2 movdqa XMMWORD[48+rsp],xmm3 - xor rbx,rbx + xor r8,r8 $L$oop_tail_ssse3: - movzx eax,BYTE[rbx*1+rsi] - movzx ecx,BYTE[rbx*1+rsp] - lea rbx,[1+rbx] + movzx eax,BYTE[r8*1+rsi] + movzx ecx,BYTE[r8*1+rsp] + lea r8,[1+r8] xor eax,ecx - mov BYTE[((-1))+rbx*1+rdi],al + mov BYTE[((-1))+r8*1+rdi],al dec rdx jnz NEAR $L$oop_tail_ssse3 $L$done_ssse3: - movaps xmm6,XMMWORD[((64+32))+rsp] - movaps xmm7,XMMWORD[((64+48))+rsp] - add rsp,64+72 - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx + movaps xmm6,XMMWORD[((-40))+r9] + movaps xmm7,XMMWORD[((-24))+r9] + lea rsp,[r9] +$L$ssse3_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret @@ -484,6 +485,7 @@ $L$SEH_begin_ChaCha20_4x: $L$ChaCha20_4x: + mov r9,rsp mov r11,r10 shr r10,32 test r10,32 @@ -496,18 +498,18 @@ $L$ChaCha20_4x: je NEAR $L$do_sse3_after_all $L$proceed4x: - lea r11,[((-120))+rsp] - sub rsp,0x148+160 - movaps XMMWORD[(-48)+r11],xmm6 - movaps XMMWORD[(-32)+r11],xmm7 - movaps XMMWORD[(-16)+r11],xmm8 - movaps XMMWORD[r11],xmm9 - movaps XMMWORD[16+r11],xmm10 - movaps XMMWORD[32+r11],xmm11 - movaps XMMWORD[48+r11],xmm12 - movaps XMMWORD[64+r11],xmm13 - movaps XMMWORD[80+r11],xmm14 - movaps XMMWORD[96+r11],xmm15 + sub rsp,0x140+168 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$4x_body: 
movdqa xmm11,XMMWORD[$L$sigma] movdqu xmm15,XMMWORD[rcx] movdqu xmm7,XMMWORD[16+rcx] @@ -1034,18 +1036,18 @@ $L$oop_tail4x: jnz NEAR $L$oop_tail4x $L$done4x: - lea r11,[((320+48))+rsp] - movaps xmm6,XMMWORD[((-48))+r11] - movaps xmm7,XMMWORD[((-32))+r11] - movaps xmm8,XMMWORD[((-16))+r11] - movaps xmm9,XMMWORD[r11] - movaps xmm10,XMMWORD[16+r11] - movaps xmm11,XMMWORD[32+r11] - movaps xmm12,XMMWORD[48+r11] - movaps xmm13,XMMWORD[64+r11] - movaps xmm14,XMMWORD[80+r11] - movaps xmm15,XMMWORD[96+r11] - add rsp,0x148+160 + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] +$L$4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret @@ -1065,22 +1067,21 @@ $L$SEH_begin_ChaCha20_8x: $L$ChaCha20_8x: - mov r10,rsp - sub rsp,0x280+176 + mov r9,rsp + sub rsp,0x280+168 and rsp,-32 - lea r11,[((656+48))+rsp] - movaps XMMWORD[(-48)+r11],xmm6 - movaps XMMWORD[(-32)+r11],xmm7 - movaps XMMWORD[(-16)+r11],xmm8 - movaps XMMWORD[r11],xmm9 - movaps XMMWORD[16+r11],xmm10 - movaps XMMWORD[32+r11],xmm11 - movaps XMMWORD[48+r11],xmm12 - movaps XMMWORD[64+r11],xmm13 - movaps XMMWORD[80+r11],xmm14 - movaps XMMWORD[96+r11],xmm15 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$8x_body: vzeroupper - mov QWORD[640+rsp],r10 @@ -1671,19 +1672,220 @@ $L$oop_tail8x: $L$done8x: vzeroall - lea r11,[((656+48))+rsp] - movaps xmm6,XMMWORD[((-48))+r11] - movaps xmm7,XMMWORD[((-32))+r11] - movaps xmm8,XMMWORD[((-16))+r11] - movaps xmm9,XMMWORD[r11] - movaps xmm10,XMMWORD[16+r11] - movaps xmm11,XMMWORD[32+r11] - movaps xmm12,XMMWORD[48+r11] - movaps xmm13,XMMWORD[64+r11] - movaps xmm14,XMMWORD[80+r11] - movaps xmm15,XMMWORD[96+r11] - mov rsp,QWORD[640+rsp] + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] +$L$8x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_ChaCha20_8x: +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + lea r10,[$L$ctr32_body] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + lea r10,[$L$no_data] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rax,[((64+24+48))+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + 
+$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + + + +ALIGN 16 +ssse3_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[192+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rsi,[((-40))+rax] + lea rdi,[512+r8] + mov ecx,4 + DD 0xa548f3fc + + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +full_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[192+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rsi,[((-168))+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + + jmp NEAR $L$common_seh_tail + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_ChaCha20_ctr32 wrt ..imagebase + DD $L$SEH_end_ChaCha20_ctr32 wrt ..imagebase + DD $L$SEH_info_ChaCha20_ctr32 wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase + DD $L$SEH_end_ChaCha20_ssse3 wrt ..imagebase + DD $L$SEH_info_ChaCha20_ssse3 wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_4x wrt ..imagebase + DD $L$SEH_end_ChaCha20_4x wrt ..imagebase + DD $L$SEH_info_ChaCha20_4x wrt ..imagebase + DD $L$SEH_begin_ChaCha20_8x wrt ..imagebase + DD $L$SEH_end_ChaCha20_8x wrt ..imagebase + DD $L$SEH_info_ChaCha20_8x wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ChaCha20_ctr32: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + +$L$SEH_info_ChaCha20_ssse3: +DB 9,0,0,0 + DD ssse3_handler wrt ..imagebase + DD $L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase + +$L$SEH_info_ChaCha20_4x: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase +$L$SEH_info_ChaCha20_8x: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase diff --git a/win-x86_64/crypto/modes/ghash-x86_64.asm b/win-x86_64/crypto/modes/ghash-x86_64.asm index e5204bf8..b01f98c9 100644 --- a/win-x86_64/crypto/modes/ghash-x86_64.asm +++ b/win-x86_64/crypto/modes/ghash-x86_64.asm @@ -21,6 +21,10 @@ $L$SEH_begin_gcm_gmult_4bit: push rbx push rbp push r12 + push r13 + push r14 + push r15 + sub rsp,280 $L$gmult_prologue: movzx r8,BYTE[15+rdi] @@ -97,8 +101,9 @@ $L$break1: mov QWORD[8+rdi],r8 mov QWORD[rdi],r9 - mov rbx,QWORD[16+rsp] - lea rsp,[24+rsp] + lea rsi,[((280+48))+rsp] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$gmult_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -669,14 +674,14 @@ 
$L$outer_loop: mov QWORD[8+rdi],r8 mov QWORD[rdi],r9 - lea rsi,[280+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + lea rsi,[((280+48))+rsp] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$ghash_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -1916,14 +1921,20 @@ se_handler: cmp rbx,r10 jae NEAR $L$in_prologue - lea rax,[24+rax] + lea rax,[((48+280))+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] mov QWORD[144+r8],rbx mov QWORD[160+r8],rbp mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 $L$in_prologue: mov rdi,QWORD[8+rax] diff --git a/win-x86_64/crypto/sha/sha1-x86_64.asm b/win-x86_64/crypto/sha/sha1-x86_64.asm index 168f78db..54845743 100644 --- a/win-x86_64/crypto/sha/sha1-x86_64.asm +++ b/win-x86_64/crypto/sha/sha1-x86_64.asm @@ -1263,21 +1263,20 @@ $L$SEH_begin_sha1_block_data_order_ssse3: _ssse3_shortcut: - mov rax,rsp + mov r11,rsp push rbx push rbp push r12 push r13 push r14 lea rsp,[((-160))+rsp] - movaps XMMWORD[(-40-96)+rax],xmm6 - movaps XMMWORD[(-40-80)+rax],xmm7 - movaps XMMWORD[(-40-64)+rax],xmm8 - movaps XMMWORD[(-40-48)+rax],xmm9 - movaps XMMWORD[(-40-32)+rax],xmm10 - movaps XMMWORD[(-40-16)+rax],xmm11 + movaps XMMWORD[(-40-96)+r11],xmm6 + movaps XMMWORD[(-40-80)+r11],xmm7 + movaps XMMWORD[(-40-64)+r11],xmm8 + movaps XMMWORD[(-40-48)+r11],xmm9 + movaps XMMWORD[(-40-32)+r11],xmm10 + movaps XMMWORD[(-40-16)+r11],xmm11 $L$prologue_ssse3: - mov r14,rax and rsp,-64 mov r8,rdi mov r9,rsi @@ -1285,7 +1284,7 @@ $L$prologue_ssse3: shl r10,6 add r10,r9 - lea r11,[((K_XX_XX+64))] + lea r14,[((K_XX_XX+64))] mov eax,DWORD[r8] mov ebx,DWORD[4+r8] @@ -1297,8 +1296,8 @@ $L$prologue_ssse3: xor edi,edx and esi,edi - movdqa xmm6,XMMWORD[64+r11] - movdqa xmm9,XMMWORD[((-64))+r11] + movdqa xmm6,XMMWORD[64+r14] + movdqa xmm9,XMMWORD[((-64))+r14] movdqu xmm0,XMMWORD[r9] movdqu xmm1,XMMWORD[16+r9] movdqu xmm2,XMMWORD[32+r9] @@ -1374,7 +1373,7 @@ $L$oop_ssse3: pslld xmm9,2 pxor xmm4,xmm10 xor edx,ebp - movdqa xmm10,XMMWORD[((-64))+r11] + movdqa xmm10,XMMWORD[((-64))+r14] rol ecx,5 add ebx,edi and esi,edx @@ -1435,7 +1434,7 @@ $L$oop_ssse3: pslld xmm10,2 pxor xmm5,xmm8 xor ebp,eax - movdqa xmm8,XMMWORD[((-32))+r11] + movdqa xmm8,XMMWORD[((-32))+r14] rol edx,5 add ecx,edi and esi,ebp @@ -1496,7 +1495,7 @@ $L$oop_ssse3: pslld xmm8,2 pxor xmm6,xmm9 xor eax,ebx - movdqa xmm9,XMMWORD[((-32))+r11] + movdqa xmm9,XMMWORD[((-32))+r14] rol ebp,5 add edx,edi and esi,eax @@ -1557,7 +1556,7 @@ $L$oop_ssse3: pslld xmm9,2 pxor xmm7,xmm10 xor ebx,ecx - movdqa xmm10,XMMWORD[((-32))+r11] + movdqa xmm10,XMMWORD[((-32))+r14] rol eax,5 add ebp,edi and esi,ebx @@ -1668,7 +1667,7 @@ $L$oop_ssse3: pxor xmm2,xmm3 add eax,esi xor edi,edx - movdqa xmm10,XMMWORD[r11] + movdqa xmm10,XMMWORD[r14] ror ecx,7 paddd xmm9,xmm1 add eax,ebx @@ -1903,7 +1902,7 @@ $L$oop_ssse3: pxor xmm7,xmm0 rol ebx,5 add eax,esi - movdqa xmm9,XMMWORD[32+r11] + movdqa xmm9,XMMWORD[32+r14] xor edi,ecx paddd xmm8,xmm6 xor ecx,edx @@ -2194,8 +2193,8 @@ $L$oop_ssse3: add ecx,edx cmp r9,r10 je NEAR $L$done_ssse3 - movdqa xmm6,XMMWORD[64+r11] - movdqa xmm9,XMMWORD[((-64))+r11] + movdqa xmm6,XMMWORD[64+r14] + 
movdqa xmm9,XMMWORD[((-64))+r14] movdqu xmm0,XMMWORD[r9] movdqu xmm1,XMMWORD[16+r9] movdqu xmm2,XMMWORD[32+r9] @@ -2432,19 +2431,18 @@ $L$done_ssse3: mov DWORD[8+r8],ecx mov DWORD[12+r8],edx mov DWORD[16+r8],ebp - movaps xmm6,XMMWORD[((-40-96))+r14] - movaps xmm7,XMMWORD[((-40-80))+r14] - movaps xmm8,XMMWORD[((-40-64))+r14] - movaps xmm9,XMMWORD[((-40-48))+r14] - movaps xmm10,XMMWORD[((-40-32))+r14] - movaps xmm11,XMMWORD[((-40-16))+r14] - lea rsi,[r14] - mov r14,QWORD[((-40))+rsi] - mov r13,QWORD[((-32))+rsi] - mov r12,QWORD[((-24))+rsi] - mov rbp,QWORD[((-16))+rsi] - mov rbx,QWORD[((-8))+rsi] - lea rsp,[rsi] + movaps xmm6,XMMWORD[((-40-96))+r11] + movaps xmm7,XMMWORD[((-40-80))+r11] + movaps xmm8,XMMWORD[((-40-64))+r11] + movaps xmm9,XMMWORD[((-40-48))+r11] + movaps xmm10,XMMWORD[((-40-32))+r11] + movaps xmm11,XMMWORD[((-40-16))+r11] + mov r14,QWORD[((-40))+r11] + mov r13,QWORD[((-32))+r11] + mov r12,QWORD[((-24))+r11] + mov rbp,QWORD[((-16))+r11] + mov rbx,QWORD[((-8))+r11] + lea rsp,[r11] $L$epilogue_ssse3: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -2463,7 +2461,7 @@ $L$SEH_begin_sha1_block_data_order_avx: _avx_shortcut: - mov rax,rsp + mov r11,rsp push rbx push rbp push r12 @@ -2471,14 +2469,13 @@ _avx_shortcut: push r14 lea rsp,[((-160))+rsp] vzeroupper - vmovaps XMMWORD[(-40-96)+rax],xmm6 - vmovaps XMMWORD[(-40-80)+rax],xmm7 - vmovaps XMMWORD[(-40-64)+rax],xmm8 - vmovaps XMMWORD[(-40-48)+rax],xmm9 - vmovaps XMMWORD[(-40-32)+rax],xmm10 - vmovaps XMMWORD[(-40-16)+rax],xmm11 + vmovaps XMMWORD[(-40-96)+r11],xmm6 + vmovaps XMMWORD[(-40-80)+r11],xmm7 + vmovaps XMMWORD[(-40-64)+r11],xmm8 + vmovaps XMMWORD[(-40-48)+r11],xmm9 + vmovaps XMMWORD[(-40-32)+r11],xmm10 + vmovaps XMMWORD[(-40-16)+r11],xmm11 $L$prologue_avx: - mov r14,rax and rsp,-64 mov r8,rdi mov r9,rsi @@ -2486,7 +2483,7 @@ $L$prologue_avx: shl r10,6 add r10,r9 - lea r11,[((K_XX_XX+64))] + lea r14,[((K_XX_XX+64))] mov eax,DWORD[r8] mov ebx,DWORD[4+r8] @@ -2498,8 +2495,8 @@ $L$prologue_avx: xor edi,edx and esi,edi - vmovdqa xmm6,XMMWORD[64+r11] - vmovdqa xmm11,XMMWORD[((-64))+r11] + vmovdqa xmm6,XMMWORD[64+r14] + vmovdqa xmm11,XMMWORD[((-64))+r14] vmovdqu xmm0,XMMWORD[r9] vmovdqu xmm1,XMMWORD[16+r9] vmovdqu xmm2,XMMWORD[32+r9] @@ -2624,7 +2621,7 @@ $L$oop_avx: vpxor xmm5,xmm5,xmm10 xor ebp,eax shld edx,edx,5 - vmovdqa xmm11,XMMWORD[((-32))+r11] + vmovdqa xmm11,XMMWORD[((-32))+r14] add ecx,edi and esi,ebp xor ebp,eax @@ -2837,7 +2834,7 @@ $L$oop_avx: add eax,esi xor edi,edx vpaddd xmm9,xmm11,xmm1 - vmovdqa xmm11,XMMWORD[r11] + vmovdqa xmm11,XMMWORD[r14] shrd ecx,ecx,7 add eax,ebx vpxor xmm2,xmm2,xmm8 @@ -3056,7 +3053,7 @@ $L$oop_avx: mov edi,ebx xor esi,edx vpaddd xmm9,xmm11,xmm6 - vmovdqa xmm11,XMMWORD[32+r11] + vmovdqa xmm11,XMMWORD[32+r14] shld ebx,ebx,5 add eax,esi vpxor xmm7,xmm7,xmm8 @@ -3335,8 +3332,8 @@ $L$oop_avx: add ecx,edx cmp r9,r10 je NEAR $L$done_avx - vmovdqa xmm6,XMMWORD[64+r11] - vmovdqa xmm11,XMMWORD[((-64))+r11] + vmovdqa xmm6,XMMWORD[64+r14] + vmovdqa xmm11,XMMWORD[((-64))+r14] vmovdqu xmm0,XMMWORD[r9] vmovdqu xmm1,XMMWORD[16+r9] vmovdqu xmm2,XMMWORD[32+r9] @@ -3572,19 +3569,18 @@ $L$done_avx: mov DWORD[8+r8],ecx mov DWORD[12+r8],edx mov DWORD[16+r8],ebp - movaps xmm6,XMMWORD[((-40-96))+r14] - movaps xmm7,XMMWORD[((-40-80))+r14] - movaps xmm8,XMMWORD[((-40-64))+r14] - movaps xmm9,XMMWORD[((-40-48))+r14] - movaps xmm10,XMMWORD[((-40-32))+r14] - movaps xmm11,XMMWORD[((-40-16))+r14] - lea rsi,[r14] - mov r14,QWORD[((-40))+rsi] - mov r13,QWORD[((-32))+rsi] - mov r12,QWORD[((-24))+rsi] - mov 
rbp,QWORD[((-16))+rsi] - mov rbx,QWORD[((-8))+rsi] - lea rsp,[rsi] + movaps xmm6,XMMWORD[((-40-96))+r11] + movaps xmm7,XMMWORD[((-40-80))+r11] + movaps xmm8,XMMWORD[((-40-64))+r11] + movaps xmm9,XMMWORD[((-40-48))+r11] + movaps xmm10,XMMWORD[((-40-32))+r11] + movaps xmm11,XMMWORD[((-40-16))+r11] + mov r14,QWORD[((-40))+r11] + mov r13,QWORD[((-32))+r11] + mov r12,QWORD[((-24))+r11] + mov rbp,QWORD[((-16))+r11] + mov rbx,QWORD[((-8))+r11] + lea rsp,[r11] $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -3677,15 +3673,13 @@ ssse3_handler: cmp rbx,r10 jb NEAR $L$common_seh_tail - mov rax,QWORD[152+r8] + mov rax,QWORD[208+r8] mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail - mov rax,QWORD[232+r8] - lea rsi,[((-40-96))+rax] lea rdi,[512+r8] mov ecx,12 diff --git a/win-x86_64/crypto/sha/sha256-x86_64.asm b/win-x86_64/crypto/sha/sha256-x86_64.asm index efaf9b55..6e3d1541 100644 --- a/win-x86_64/crypto/sha/sha256-x86_64.asm +++ b/win-x86_64/crypto/sha/sha256-x86_64.asm @@ -30,13 +30,13 @@ $L$SEH_begin_sha256_block_data_order: je NEAR $L$avx_shortcut test r10d,512 jnz NEAR $L$ssse3_shortcut + mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 - mov r11,rsp shl rdx,4 sub rsp,16*4+4*8 lea rdx,[rdx*4+rsi] @@ -44,7 +44,7 @@ $L$SEH_begin_sha256_block_data_order: mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx - mov QWORD[((64+24))+rsp],r11 + mov QWORD[((64+24))+rsp],rax $L$prologue: mov eax,DWORD[rdi] @@ -1709,13 +1709,13 @@ $L$rounds_16_xx: jb NEAR $L$loop mov rsi,QWORD[((64+24))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -1781,13 +1781,13 @@ $L$SEH_begin_sha256_block_data_order_ssse3: $L$ssse3_shortcut: + mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 - mov r11,rsp shl rdx,4 sub rsp,160 lea rdx,[rdx*4+rsi] @@ -1795,7 +1795,7 @@ $L$ssse3_shortcut: mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx - mov QWORD[((64+24))+rsp],r11 + mov QWORD[((64+24))+rsp],rax movaps XMMWORD[(64+32)+rsp],xmm6 movaps XMMWORD[(64+48)+rsp],xmm7 movaps XMMWORD[(64+64)+rsp],xmm8 @@ -2870,13 +2870,13 @@ DB 102,15,58,15,249,4 movaps xmm7,XMMWORD[((64+48))+rsp] movaps xmm8,XMMWORD[((64+64))+rsp] movaps xmm9,XMMWORD[((64+80))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$epilogue_ssse3: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -2895,13 +2895,13 @@ $L$SEH_begin_sha256_block_data_order_avx: $L$avx_shortcut: + mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 - mov r11,rsp shl rdx,4 sub rsp,160 lea rdx,[rdx*4+rsi] @@ -2909,7 +2909,7 @@ $L$avx_shortcut: mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx - mov QWORD[((64+24))+rsp],r11 + mov QWORD[((64+24))+rsp],rax movaps XMMWORD[(64+32)+rsp],xmm6 movaps 
XMMWORD[(64+48)+rsp],xmm7 movaps XMMWORD[(64+64)+rsp],xmm8 @@ -3946,13 +3946,13 @@ $L$avx_00_47: movaps xmm7,XMMWORD[((64+48))+rsp] movaps xmm8,XMMWORD[((64+64))+rsp] movaps xmm9,XMMWORD[((64+80))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -3992,7 +3992,6 @@ se_handler: jae NEAR $L$in_prologue mov rsi,rax mov rax,QWORD[((64+24))+rax] - lea rax,[48+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] diff --git a/win-x86_64/crypto/sha/sha512-x86_64.asm b/win-x86_64/crypto/sha/sha512-x86_64.asm index 71449cd2..d0d7a43f 100644 --- a/win-x86_64/crypto/sha/sha512-x86_64.asm +++ b/win-x86_64/crypto/sha/sha512-x86_64.asm @@ -30,13 +30,13 @@ $L$SEH_begin_sha512_block_data_order: or r10d,r9d cmp r10d,1342177792 je NEAR $L$avx_shortcut + mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 - mov r11,rsp shl rdx,4 sub rsp,16*8+4*8 lea rdx,[rdx*8+rsi] @@ -44,7 +44,7 @@ $L$SEH_begin_sha512_block_data_order: mov QWORD[((128+0))+rsp],rdi mov QWORD[((128+8))+rsp],rsi mov QWORD[((128+16))+rsp],rdx - mov QWORD[((128+24))+rsp],r11 + mov QWORD[((128+24))+rsp],rax $L$prologue: mov rax,QWORD[rdi] @@ -1709,13 +1709,13 @@ $L$rounds_16_xx: jb NEAR $L$loop mov rsi,QWORD[((128+24))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -1825,13 +1825,13 @@ $L$SEH_begin_sha512_block_data_order_xop: $L$xop_shortcut: + mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 - mov r11,rsp shl rdx,4 sub rsp,256 lea rdx,[rdx*8+rsi] @@ -1839,7 +1839,7 @@ $L$xop_shortcut: mov QWORD[((128+0))+rsp],rdi mov QWORD[((128+8))+rsp],rsi mov QWORD[((128+16))+rsp],rdx - mov QWORD[((128+24))+rsp],r11 + mov QWORD[((128+24))+rsp],rax movaps XMMWORD[(128+32)+rsp],xmm6 movaps XMMWORD[(128+48)+rsp],xmm7 movaps XMMWORD[(128+64)+rsp],xmm8 @@ -2906,13 +2906,13 @@ DB 143,72,120,195,203,42 movaps xmm9,XMMWORD[((128+80))+rsp] movaps xmm10,XMMWORD[((128+96))+rsp] movaps xmm11,XMMWORD[((128+112))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$epilogue_xop: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -2931,13 +2931,13 @@ $L$SEH_begin_sha512_block_data_order_avx: $L$avx_shortcut: + mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 - mov r11,rsp shl rdx,4 sub rsp,256 lea rdx,[rdx*8+rsi] @@ -2945,7 +2945,7 @@ $L$avx_shortcut: mov QWORD[((128+0))+rsp],rdi mov QWORD[((128+8))+rsp],rsi mov QWORD[((128+16))+rsp],rdx - mov QWORD[((128+24))+rsp],r11 + mov QWORD[((128+24))+rsp],rax movaps XMMWORD[(128+32)+rsp],xmm6 movaps 
XMMWORD[(128+48)+rsp],xmm7 movaps XMMWORD[(128+64)+rsp],xmm8 @@ -4076,13 +4076,13 @@ $L$avx_00_47: movaps xmm9,XMMWORD[((128+80))+rsp] movaps xmm10,XMMWORD[((128+96))+rsp] movaps xmm11,XMMWORD[((128+112))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -4122,7 +4122,6 @@ se_handler: jae NEAR $L$in_prologue mov rsi,rax mov rax,QWORD[((128+24))+rax] - lea rax,[48+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax]
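Two patterns recur throughout the Montgomery-multiplication diffs above. First, every bn_mul_mont variant now reaches its final stack pointer by stepping down one 4096-byte page at a time (the new $L$mul_page_walk, $L$mul4x_page_walk, $L$sqr8x_page_walk, $L$pwr_page_walk and $L$from_page_walk loops), issuing one load per page so the OS guard page is faulted in order rather than skipped by a single large rsp adjustment. A minimal C sketch of that probing pattern, assuming 4096-byte pages; probe_pages is a hypothetical name, not a function from this commit:

#include <stddef.h>

/* Touch one byte in every page from the highest address down to the
 * target, mirroring the assembly loop:
 *   lea rsp,[((-4096))+rsp] / mov r11,QWORD[rsp] / cmp rsp,r10 / ja ...
 * Probing below the established stack only makes sense at the assembly
 * level; this version merely illustrates the access pattern. */
static void probe_pages(volatile unsigned char *top, size_t bytes) {
    for (size_t off = 0; off < bytes; off += 4096)
        (void)*(top - off);   /* one read per 4096-byte page */
}

Second, the final conditional copy in $L$copy and $L$copy4x no longer xor-masks every word in place. The 0/-1 borrow mask left in rax by the preceding sbb now selects the source pointer once (and rsi,rax / not rax / mov rcx,rdi / and rcx,rax / or rsi,rcx), after which a plain copy runs and each stack word is scrubbed as it is read (mov QWORD[r14*8+rsp],r14). A sketch of that branch-free select, with hypothetical names:

#include <stdint.h>

/* mask is all-ones to keep the unreduced temporary t, all-zero to take
 * the subtracted result r, matching the mask left by "sbb rax,0". */
static const uint64_t *ct_select_ptr(const uint64_t *t, const uint64_t *r,
                                     uint64_t mask) {
    uintptr_t a = (uintptr_t)t & (uintptr_t)mask;
    uintptr_t b = (uintptr_t)r & (uintptr_t)~mask;
    return (const uint64_t *)(a | b);   /* data-independent choice */
}

The same frame discipline runs through the rest of the section: each function parks the caller's rsp in a spare register (rax, r9 or r11) for its whole lifetime, the epilogues restore callee-saved registers from fixed negative offsets off that value, and the SEH info records gain a third boundary ($L$*_prologue or $L$*_tail alongside body and epilogue) so the unwind handlers can distinguish the pre-push, body and post-pop states.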