author     Robert Sloan <varomodt@google.com>  2017-02-21 08:49:28 -0800
committer  Robert Sloan <varomodt@google.com>  2017-02-21 08:49:42 -0800
commit     a94fe0531b3c196ad078174259af2201b2e3a246 (patch)
tree       81f252f2c833966b0a5d3ec52e71c3f9dbeca499
parent     5d625781eb8ff5cc8111d2302efe900103bf0ade (diff)
download   boringssl-a94fe0531b3c196ad078174259af2201b2e3a246.tar.gz
external/boringssl: Sync to c4796c92e0aced2342ed5687201aea07189c3bc1.
This includes the following changes:
https://boringssl.googlesource.com/boringssl/+log/040bc4944be97f5d4b44da176f6e801fc804a176..c4796c92e0aced2342ed5687201aea07189c3bc1

Test: CtsLibcoreTestCases Presubmits
Change-Id: If6d911660fbd9c60896527addb277c8225c3d401
-rw-r--r--  BORINGSSL_REVISION | 2
-rw-r--r--  linux-arm/crypto/chacha/chacha-armv4.S | 2
-rw-r--r--  linux-x86/crypto/bn/x86-mont.S | 126
-rw-r--r--  linux-x86_64/crypto/aes/aes-x86_64.S | 41
-rw-r--r--  linux-x86_64/crypto/aes/aesni-x86_64.S | 961
-rw-r--r--  linux-x86_64/crypto/aes/bsaes-x86_64.S | 68
-rw-r--r--  linux-x86_64/crypto/bn/x86_64-mont.S | 237
-rw-r--r--  linux-x86_64/crypto/bn/x86_64-mont5.S | 212
-rw-r--r--  linux-x86_64/crypto/chacha/chacha-x86_64.S | 73
-rw-r--r--  linux-x86_64/crypto/modes/ghash-x86_64.S | 25
-rw-r--r--  linux-x86_64/crypto/sha/sha1-x86_64.S | 70
-rw-r--r--  linux-x86_64/crypto/sha/sha256-x86_64.S | 54
-rw-r--r--  linux-x86_64/crypto/sha/sha512-x86_64.S | 54
-rw-r--r--  mac-x86/crypto/bn/x86-mont.S | 126
-rw-r--r--  mac-x86_64/crypto/aes/aes-x86_64.S | 41
-rw-r--r--  mac-x86_64/crypto/aes/aesni-x86_64.S | 961
-rw-r--r--  mac-x86_64/crypto/aes/bsaes-x86_64.S | 68
-rw-r--r--  mac-x86_64/crypto/bn/x86_64-mont.S | 237
-rw-r--r--  mac-x86_64/crypto/bn/x86_64-mont5.S | 212
-rw-r--r--  mac-x86_64/crypto/chacha/chacha-x86_64.S | 73
-rw-r--r--  mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S | 104
-rw-r--r--  mac-x86_64/crypto/modes/ghash-x86_64.S | 25
-rw-r--r--  mac-x86_64/crypto/sha/sha1-x86_64.S | 70
-rw-r--r--  mac-x86_64/crypto/sha/sha256-x86_64.S | 54
-rw-r--r--  mac-x86_64/crypto/sha/sha512-x86_64.S | 54
-rw-r--r--  sources.bp | 10
-rw-r--r--  sources.mk | 2
-rw-r--r--  src/CMakeLists.txt | 4
-rw-r--r--  src/crypto/CMakeLists.txt | 4
-rwxr-xr-x  src/crypto/aes/asm/aes-586.pl | 12
-rwxr-xr-x [-rw-r--r--]  src/crypto/aes/asm/aes-x86_64.pl | 46
-rw-r--r--  src/crypto/aes/asm/aesni-x86.pl | 4
-rw-r--r--  src/crypto/aes/asm/aesni-x86_64.pl | 1286
-rw-r--r--  src/crypto/aes/asm/aesv8-armx.pl | 6
-rw-r--r--  src/crypto/aes/asm/bsaes-armv7.pl | 2
-rw-r--r--  src/crypto/aes/asm/bsaes-x86_64.pl | 167
-rw-r--r--  src/crypto/aes/asm/vpaes-x86.pl | 8
-rw-r--r--  src/crypto/aes/asm/vpaes-x86_64.pl | 21
-rw-r--r--  src/crypto/asn1/CMakeLists.txt | 1
-rw-r--r--  src/crypto/asn1/a_gentm.c | 1
-rw-r--r--  src/crypto/asn1/a_time.c | 1
-rw-r--r--  src/crypto/asn1/a_utctm.c | 1
-rw-r--r--  src/crypto/asn1/asn1_locl.h | 35
-rw-r--r--  src/crypto/asn1/tasn_dec.c | 2
-rw-r--r--  src/crypto/asn1/tasn_new.c | 12
-rw-r--r--  src/crypto/asn1/time_support.c (renamed from src/crypto/time_support.c) | 2
-rw-r--r--  src/crypto/bn/asm/armv4-mont.pl | 2
-rw-r--r--  src/crypto/bn/asm/bn-586.pl | 24
-rw-r--r--  src/crypto/bn/asm/co-586.pl | 12
-rwxr-xr-x  src/crypto/bn/asm/rsaz-avx2.pl | 50
-rwxr-xr-x [-rw-r--r--]  src/crypto/bn/asm/x86-mont.pl | 65
-rwxr-xr-x  src/crypto/bn/asm/x86_64-mont.pl | 316
-rwxr-xr-x  src/crypto/bn/asm/x86_64-mont5.pl | 348
-rw-r--r--  src/crypto/chacha/CMakeLists.txt | 12
-rwxr-xr-x  src/crypto/chacha/asm/chacha-armv4.pl | 6
-rwxr-xr-x  src/crypto/chacha/asm/chacha-armv8.pl | 4
-rwxr-xr-x  src/crypto/chacha/asm/chacha-x86.pl | 15
-rwxr-xr-x  src/crypto/chacha/asm/chacha-x86_64.pl | 1158
-rw-r--r--  src/crypto/chacha/chacha_test.cc | 42
-rw-r--r--  src/crypto/cipher/cipher.c | 6
-rw-r--r--  src/crypto/ec/CMakeLists.txt | 11
-rwxr-xr-x  src/crypto/ec/asm/p256-x86_64-asm.pl | 12
-rw-r--r--  src/crypto/ec/ec_test.cc | 450
-rw-r--r--  src/crypto/ecdsa/ecdsa.c | 20
-rw-r--r--  src/crypto/evp/evp_ctx.c | 48
-rw-r--r--  src/crypto/md5/asm/md5-586.pl | 2
-rw-r--r--  src/crypto/modes/asm/aesni-gcm-x86_64.pl | 9
-rw-r--r--  src/crypto/modes/asm/ghash-armv4.pl | 4
-rw-r--r--  src/crypto/modes/asm/ghash-x86.pl | 12
-rw-r--r--  src/crypto/modes/asm/ghash-x86_64.pl | 49
-rw-r--r--  src/crypto/perlasm/ppc-xlate.pl | 2
-rw-r--r--  src/crypto/perlasm/readme | 6
-rwxr-xr-x  src/crypto/perlasm/x86_64-xlate.pl | 261
-rw-r--r--  src/crypto/perlasm/x86nasm.pl | 2
-rw-r--r--  src/crypto/pkcs8/pkcs8.c | 63
-rw-r--r--  src/crypto/rsa/CMakeLists.txt | 11
-rw-r--r--  src/crypto/rsa/rsa_test.cc | 521
-rw-r--r--  src/crypto/sha/asm/sha1-586.pl | 4
-rwxr-xr-x [-rw-r--r--]  src/crypto/sha/asm/sha1-x86_64.pl | 132
-rw-r--r--  src/crypto/sha/asm/sha256-586.pl | 9
-rw-r--r--  src/crypto/sha/asm/sha512-586.pl | 4
-rw-r--r--  src/crypto/sha/asm/sha512-armv8.pl | 2
-rwxr-xr-x [-rw-r--r--]  src/crypto/sha/asm/sha512-x86_64.pl | 99
-rw-r--r--  src/crypto/test/test_util.h | 4
-rw-r--r--  src/crypto/x509/x_name.c | 18
-rw-r--r--  src/include/openssl/ecdsa.h | 16
-rw-r--r--  src/include/openssl/pkcs8.h | 63
-rw-r--r--  src/include/openssl/ssl.h | 35
-rw-r--r--  src/include/openssl/ssl3.h | 1
-rw-r--r--  src/include/openssl/time_support.h | 91
-rw-r--r--  src/ssl/handshake_client.c | 146
-rw-r--r--  src/ssl/handshake_server.c | 153
-rw-r--r--  src/ssl/internal.h | 57
-rw-r--r--  src/ssl/s3_both.c | 12
-rw-r--r--  src/ssl/s3_lib.c | 1
-rw-r--r--  src/ssl/ssl_asn1.c | 15
-rw-r--r--  src/ssl/ssl_cert.c | 62
-rw-r--r--  src/ssl/ssl_lib.c | 149
-rw-r--r--  src/ssl/ssl_session.c | 43
-rw-r--r--  src/ssl/ssl_stat.c | 15
-rw-r--r--  src/ssl/ssl_test.cc | 26
-rw-r--r--  src/ssl/t1_enc.c | 13
-rw-r--r--  src/ssl/t1_lib.c | 81
-rw-r--r--  src/ssl/test/bssl_shim.cc | 10
-rw-r--r--  src/ssl/test/runner/runner.go | 6
-rw-r--r--  src/ssl/tls13_both.c | 47
-rw-r--r--  src/ssl/tls13_client.c | 26
-rw-r--r--  src/ssl/tls13_enc.c | 14
-rw-r--r--  src/ssl/tls13_server.c | 45
-rw-r--r--  src/tool/transport_common.cc | 5
-rw-r--r--  src/util/all_tests.json | 3
-rw-r--r--  src/util/doc.config | 3
-rw-r--r--  src/util/generate_build_files.py | 44
-rw-r--r--  win-x86/crypto/bn/x86-mont.asm | 120
-rw-r--r--  win-x86_64/crypto/aes/aes-x86_64.asm | 42
-rw-r--r--  win-x86_64/crypto/aes/aesni-x86_64.asm | 1318
-rw-r--r--  win-x86_64/crypto/aes/bsaes-x86_64.asm | 115
-rw-r--r--  win-x86_64/crypto/bn/x86_64-mont.asm | 264
-rw-r--r--  win-x86_64/crypto/bn/x86_64-mont5.asm | 231
-rw-r--r--  win-x86_64/crypto/chacha/chacha-x86_64.asm | 370
-rw-r--r--  win-x86_64/crypto/modes/ghash-x86_64.asm | 33
-rw-r--r--  win-x86_64/crypto/sha/sha1-x86_64.asm | 122
-rw-r--r--  win-x86_64/crypto/sha/sha256-x86_64.asm | 55
-rw-r--r--  win-x86_64/crypto/sha/sha512-x86_64.asm | 55
124 files changed, 9625 insertions, 3674 deletions
diff --git a/BORINGSSL_REVISION b/BORINGSSL_REVISION
index f26983b6..af7f21c4 100644
--- a/BORINGSSL_REVISION
+++ b/BORINGSSL_REVISION
@@ -1 +1 @@
-040bc4944be97f5d4b44da176f6e801fc804a176
+c4796c92e0aced2342ed5687201aea07189c3bc1
diff --git a/linux-arm/crypto/chacha/chacha-armv4.S b/linux-arm/crypto/chacha/chacha-armv4.S
index 19a4d2c4..0784fc71 100644
--- a/linux-arm/crypto/chacha/chacha-armv4.S
+++ b/linux-arm/crypto/chacha/chacha-armv4.S
@@ -1457,7 +1457,7 @@ ChaCha20_neon:
ldrb r9,[r12],#1 @ read input
subs r11,r11,#1
eor r8,r8,r9
- strb r8,[r14],#1 @ store ouput
+ strb r8,[r14],#1 @ store output
bne .Loop_tail_neon
.Ldone_neon:
diff --git a/linux-x86/crypto/bn/x86-mont.S b/linux-x86/crypto/bn/x86-mont.S
index 1569b2cf..e291a888 100644
--- a/linux-x86/crypto/bn/x86-mont.S
+++ b/linux-x86/crypto/bn/x86-mont.S
@@ -17,39 +17,54 @@ bn_mul_mont:
jl .L000just_leave
leal 20(%esp),%esi
leal 24(%esp),%edx
- movl %esp,%ebp
addl $2,%edi
negl %edi
- leal -32(%esp,%edi,4),%esp
+ leal -32(%esp,%edi,4),%ebp
negl %edi
- movl %esp,%eax
+ movl %ebp,%eax
subl %edx,%eax
andl $2047,%eax
- subl %eax,%esp
- xorl %esp,%edx
+ subl %eax,%ebp
+ xorl %ebp,%edx
andl $2048,%edx
xorl $2048,%edx
- subl %edx,%esp
- andl $-64,%esp
+ subl %edx,%ebp
+ andl $-64,%ebp
+ movl %esp,%eax
+ subl %ebp,%eax
+ andl $-4096,%eax
+ movl %esp,%edx
+ leal (%ebp,%eax,1),%esp
+ movl (%esp),%eax
+ cmpl %ebp,%esp
+ ja .L001page_walk
+ jmp .L002page_walk_done
+.align 16
+.L001page_walk:
+ leal -4096(%esp),%esp
+ movl (%esp),%eax
+ cmpl %ebp,%esp
+ ja .L001page_walk
+.L002page_walk_done:
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
- movl 12(%esi),%edx
+ movl 12(%esi),%ebp
movl 16(%esi),%esi
movl (%esi),%esi
movl %eax,4(%esp)
movl %ebx,8(%esp)
movl %ecx,12(%esp)
- movl %edx,16(%esp)
+ movl %ebp,16(%esp)
movl %esi,20(%esp)
leal -3(%edi),%ebx
- movl %ebp,24(%esp)
- call .L001PIC_me_up
-.L001PIC_me_up:
+ movl %edx,24(%esp)
+ call .L003PIC_me_up
+.L003PIC_me_up:
popl %eax
- leal OPENSSL_ia32cap_P-.L001PIC_me_up(%eax),%eax
+ leal OPENSSL_ia32cap_P-.L003PIC_me_up(%eax),%eax
btl $26,(%eax)
- jnc .L002non_sse2
+ jnc .L004non_sse2
movl $-1,%eax
movd %eax,%mm7
movl 8(%esp),%esi
@@ -73,7 +88,7 @@ bn_mul_mont:
psrlq $32,%mm3
incl %ecx
.align 16
-.L0031st:
+.L0051st:
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -88,7 +103,7 @@ bn_mul_mont:
psrlq $32,%mm3
leal 1(%ecx),%ecx
cmpl %ebx,%ecx
- jl .L0031st
+ jl .L0051st
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -102,7 +117,7 @@ bn_mul_mont:
paddq %mm2,%mm3
movq %mm3,32(%esp,%ebx,4)
incl %edx
-.L004outer:
+.L006outer:
xorl %ecx,%ecx
movd (%edi,%edx,4),%mm4
movd (%esi),%mm5
@@ -124,7 +139,7 @@ bn_mul_mont:
paddq %mm6,%mm2
incl %ecx
decl %ebx
-.L005inner:
+.L007inner:
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -141,7 +156,7 @@ bn_mul_mont:
paddq %mm6,%mm2
decl %ebx
leal 1(%ecx),%ecx
- jnz .L005inner
+ jnz .L007inner
movl %ecx,%ebx
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
@@ -159,11 +174,11 @@ bn_mul_mont:
movq %mm3,32(%esp,%ebx,4)
leal 1(%edx),%edx
cmpl %ebx,%edx
- jle .L004outer
+ jle .L006outer
emms
- jmp .L006common_tail
+ jmp .L008common_tail
.align 16
-.L002non_sse2:
+.L004non_sse2:
movl 8(%esp),%esi
leal 1(%ebx),%ebp
movl 12(%esp),%edi
@@ -174,12 +189,12 @@ bn_mul_mont:
leal 4(%edi,%ebx,4),%eax
orl %edx,%ebp
movl (%edi),%edi
- jz .L007bn_sqr_mont
+ jz .L009bn_sqr_mont
movl %eax,28(%esp)
movl (%esi),%eax
xorl %edx,%edx
.align 16
-.L008mull:
+.L010mull:
movl %edx,%ebp
mull %edi
addl %eax,%ebp
@@ -188,7 +203,7 @@ bn_mul_mont:
movl (%esi,%ecx,4),%eax
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
- jl .L008mull
+ jl .L010mull
movl %edx,%ebp
mull %edi
movl 20(%esp),%edi
@@ -206,9 +221,9 @@ bn_mul_mont:
movl 4(%esi),%eax
adcl $0,%edx
incl %ecx
- jmp .L0092ndmadd
+ jmp .L0112ndmadd
.align 16
-.L0101stmadd:
+.L0121stmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -219,7 +234,7 @@ bn_mul_mont:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
- jl .L0101stmadd
+ jl .L0121stmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%eax
@@ -242,7 +257,7 @@ bn_mul_mont:
adcl $0,%edx
movl $1,%ecx
.align 16
-.L0092ndmadd:
+.L0112ndmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -253,7 +268,7 @@ bn_mul_mont:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
- jl .L0092ndmadd
+ jl .L0112ndmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
@@ -269,16 +284,16 @@ bn_mul_mont:
movl %edx,32(%esp,%ebx,4)
cmpl 28(%esp),%ecx
movl %eax,36(%esp,%ebx,4)
- je .L006common_tail
+ je .L008common_tail
movl (%ecx),%edi
movl 8(%esp),%esi
movl %ecx,12(%esp)
xorl %ecx,%ecx
xorl %edx,%edx
movl (%esi),%eax
- jmp .L0101stmadd
+ jmp .L0121stmadd
.align 16
-.L007bn_sqr_mont:
+.L009bn_sqr_mont:
movl %ebx,(%esp)
movl %ecx,12(%esp)
movl %edi,%eax
@@ -289,7 +304,7 @@ bn_mul_mont:
andl $1,%ebx
incl %ecx
.align 16
-.L011sqr:
+.L013sqr:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -301,7 +316,7 @@ bn_mul_mont:
cmpl (%esp),%ecx
movl %eax,%ebx
movl %ebp,28(%esp,%ecx,4)
- jl .L011sqr
+ jl .L013sqr
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -325,7 +340,7 @@ bn_mul_mont:
movl 4(%esi),%eax
movl $1,%ecx
.align 16
-.L0123rdmadd:
+.L0143rdmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -344,7 +359,7 @@ bn_mul_mont:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
- jl .L0123rdmadd
+ jl .L0143rdmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
@@ -360,7 +375,7 @@ bn_mul_mont:
movl %edx,32(%esp,%ebx,4)
cmpl %ebx,%ecx
movl %eax,36(%esp,%ebx,4)
- je .L006common_tail
+ je .L008common_tail
movl 4(%esi,%ecx,4),%edi
leal 1(%ecx),%ecx
movl %edi,%eax
@@ -372,12 +387,12 @@ bn_mul_mont:
xorl %ebp,%ebp
cmpl %ebx,%ecx
leal 1(%ecx),%ecx
- je .L013sqrlast
+ je .L015sqrlast
movl %edx,%ebx
shrl $1,%edx
andl $1,%ebx
.align 16
-.L014sqradd:
+.L016sqradd:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -393,13 +408,13 @@ bn_mul_mont:
cmpl (%esp),%ecx
movl %ebp,28(%esp,%ecx,4)
movl %eax,%ebx
- jle .L014sqradd
+ jle .L016sqradd
movl %edx,%ebp
addl %edx,%edx
shrl $31,%ebp
addl %ebx,%edx
adcl $0,%ebp
-.L013sqrlast:
+.L015sqrlast:
movl 20(%esp),%edi
movl 16(%esp),%esi
imull 32(%esp),%edi
@@ -414,9 +429,9 @@ bn_mul_mont:
adcl $0,%edx
movl $1,%ecx
movl 4(%esi),%eax
- jmp .L0123rdmadd
+ jmp .L0143rdmadd
.align 16
-.L006common_tail:
+.L008common_tail:
movl 16(%esp),%ebp
movl 4(%esp),%edi
leal 32(%esp),%esi
@@ -424,25 +439,26 @@ bn_mul_mont:
movl %ebx,%ecx
xorl %edx,%edx
.align 16
-.L015sub:
+.L017sub:
sbbl (%ebp,%edx,4),%eax
movl %eax,(%edi,%edx,4)
decl %ecx
movl 4(%esi,%edx,4),%eax
leal 1(%edx),%edx
- jge .L015sub
+ jge .L017sub
sbbl $0,%eax
+ andl %eax,%esi
+ notl %eax
+ movl %edi,%ebp
+ andl %eax,%ebp
+ orl %ebp,%esi
.align 16
-.L016copy:
- movl (%esi,%ebx,4),%edx
- movl (%edi,%ebx,4),%ebp
- xorl %ebp,%edx
- andl %eax,%edx
- xorl %ebp,%edx
- movl %ecx,(%esi,%ebx,4)
- movl %edx,(%edi,%ebx,4)
+.L018copy:
+ movl (%esi,%ebx,4),%eax
+ movl %eax,(%edi,%ebx,4)
+ movl %ecx,32(%esp,%ebx,4)
decl %ebx
- jge .L016copy
+ jge .L018copy
movl 24(%esp),%esp
movl $1,%eax
.L000just_leave:
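
The rewritten prologue above reserves the Montgomery scratch space with a page walk: after computing the new stack bottom, the .L001page_walk loop touches one word in every 4096-byte page between the old and new stack tops, so a large allocation never jumps past the OS guard page. A minimal C sketch of the same probing idea, assuming 4 KB pages; the function and variable names are illustrative, not BoringSSL's:

    #include <stdint.h>

    /* Probe a freshly reserved stack-like region one page at a time,
     * top down, mirroring the .L001page_walk loop: touch a word, step
     * down 4096 bytes, repeat until the new bottom is reached.  Each
     * touch gives the OS a chance to extend the stack through its
     * guard page instead of faulting past it.  (Pointer arithmetic
     * below a real stack object is shown only for illustration.) */
    static void page_walk(const volatile uint8_t *top,
                          const volatile uint8_t *bottom) {
        for (const volatile uint8_t *p = top; p > bottom; p -= 4096) {
            (void)*p;   /* movl (%esp),%eax */
        }
        (void)*bottom;  /* final probe at the new stack top */
    }
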
diff --git a/linux-x86_64/crypto/aes/aes-x86_64.S b/linux-x86_64/crypto/aes/aes-x86_64.S
index 361e84c7..ab1168ed 100644
--- a/linux-x86_64/crypto/aes/aes-x86_64.S
+++ b/linux-x86_64/crypto/aes/aes-x86_64.S
@@ -332,6 +332,7 @@ _x86_64_AES_encrypt_compact:
.type asm_AES_encrypt,@function
.hidden asm_AES_encrypt
asm_AES_encrypt:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
@@ -340,7 +341,6 @@ asm_AES_encrypt:
pushq %r15
- movq %rsp,%r10
leaq -63(%rdx),%rcx
andq $-64,%rsp
subq %rsp,%rcx
@@ -350,7 +350,7 @@ asm_AES_encrypt:
subq $32,%rsp
movq %rsi,16(%rsp)
- movq %r10,24(%rsp)
+ movq %rax,24(%rsp)
.Lenc_prologue:
movq %rdx,%r15
@@ -382,13 +382,13 @@ asm_AES_encrypt:
movl %ecx,8(%r9)
movl %edx,12(%r9)
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
.Lenc_epilogue:
.byte 0xf3,0xc3
.size asm_AES_encrypt,.-asm_AES_encrypt
@@ -778,6 +778,7 @@ _x86_64_AES_decrypt_compact:
.type asm_AES_decrypt,@function
.hidden asm_AES_decrypt
asm_AES_decrypt:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
@@ -786,7 +787,6 @@ asm_AES_decrypt:
pushq %r15
- movq %rsp,%r10
leaq -63(%rdx),%rcx
andq $-64,%rsp
subq %rsp,%rcx
@@ -796,7 +796,7 @@ asm_AES_decrypt:
subq $32,%rsp
movq %rsi,16(%rsp)
- movq %r10,24(%rsp)
+ movq %rax,24(%rsp)
.Ldec_prologue:
movq %rdx,%r15
@@ -830,13 +830,13 @@ asm_AES_decrypt:
movl %ecx,8(%r9)
movl %edx,12(%r9)
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
.Ldec_epilogue:
.byte 0xf3,0xc3
.size asm_AES_decrypt,.-asm_AES_decrypt
@@ -1313,10 +1313,9 @@ asm_AES_cbc_encrypt:
movl %r9d,%r9d
leaq .LAES_Te(%rip),%r14
+ leaq .LAES_Td(%rip),%r10
cmpq $0,%r9
- jne .Lcbc_picked_te
- leaq .LAES_Td(%rip),%r14
-.Lcbc_picked_te:
+ cmoveq %r10,%r14
movl OPENSSL_ia32cap_P(%rip),%r10d
cmpq $512,%rdx
diff --git a/linux-x86_64/crypto/aes/aesni-x86_64.S b/linux-x86_64/crypto/aes/aesni-x86_64.S
index 5709a2d0..a90e9350 100644
--- a/linux-x86_64/crypto/aes/aesni-x86_64.S
+++ b/linux-x86_64/crypto/aes/aesni-x86_64.S
@@ -1032,11 +1032,10 @@ aesni_ctr32_encrypt_blocks:
.align 16
.Lctr32_bulk:
- leaq (%rsp),%rax
+ leaq (%rsp),%r11
pushq %rbp
subq $128,%rsp
andq $-16,%rsp
- leaq -8(%rax),%rbp
@@ -1045,7 +1044,7 @@ aesni_ctr32_encrypt_blocks:
movdqu (%rcx),%xmm0
movl 12(%r8),%r8d
pxor %xmm0,%xmm2
- movl 12(%rcx),%r11d
+ movl 12(%rcx),%ebp
movdqa %xmm2,0(%rsp)
bswapl %r8d
movdqa %xmm2,%xmm3
@@ -1061,8 +1060,8 @@ aesni_ctr32_encrypt_blocks:
leaq 2(%r8),%rdx
bswapl %eax
bswapl %edx
- xorl %r11d,%eax
- xorl %r11d,%edx
+ xorl %ebp,%eax
+ xorl %ebp,%edx
.byte 102,15,58,34,216,3
leaq 3(%r8),%rax
movdqa %xmm3,16(%rsp)
@@ -1071,25 +1070,25 @@ aesni_ctr32_encrypt_blocks:
movq %r10,%rdx
leaq 4(%r8),%r10
movdqa %xmm4,32(%rsp)
- xorl %r11d,%eax
+ xorl %ebp,%eax
bswapl %r10d
.byte 102,15,58,34,232,3
- xorl %r11d,%r10d
+ xorl %ebp,%r10d
movdqa %xmm5,48(%rsp)
leaq 5(%r8),%r9
movl %r10d,64+12(%rsp)
bswapl %r9d
leaq 6(%r8),%r10
movl 240(%rcx),%eax
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
bswapl %r10d
movl %r9d,80+12(%rsp)
- xorl %r11d,%r10d
+ xorl %ebp,%r10d
leaq 7(%r8),%r9
movl %r10d,96+12(%rsp)
bswapl %r9d
movl OPENSSL_ia32cap_P+4(%rip),%r10d
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
andl $71303168,%r10d
movl %r9d,112+12(%rsp)
@@ -1113,7 +1112,7 @@ aesni_ctr32_encrypt_blocks:
.Lctr32_6x:
shll $4,%eax
movl $48,%r10d
- bswapl %r11d
+ bswapl %ebp
leaq 32(%rcx,%rax,1),%rcx
subq %rax,%r10
jmp .Lctr32_loop6
@@ -1124,32 +1123,32 @@ aesni_ctr32_encrypt_blocks:
movups -48(%rcx,%r10,1),%xmm0
.byte 102,15,56,220,209
movl %r8d,%eax
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 102,15,56,220,217
.byte 0x0f,0x38,0xf1,0x44,0x24,12
leal 1(%r8),%eax
.byte 102,15,56,220,225
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 0x0f,0x38,0xf1,0x44,0x24,28
.byte 102,15,56,220,233
leal 2(%r8),%eax
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 102,15,56,220,241
.byte 0x0f,0x38,0xf1,0x44,0x24,44
leal 3(%r8),%eax
.byte 102,15,56,220,249
movups -32(%rcx,%r10,1),%xmm1
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 102,15,56,220,208
.byte 0x0f,0x38,0xf1,0x44,0x24,60
leal 4(%r8),%eax
.byte 102,15,56,220,216
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 0x0f,0x38,0xf1,0x44,0x24,76
.byte 102,15,56,220,224
leal 5(%r8),%eax
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 102,15,56,220,232
.byte 0x0f,0x38,0xf1,0x44,0x24,92
movq %r10,%rax
@@ -1210,7 +1209,7 @@ aesni_ctr32_encrypt_blocks:
bswapl %r9d
movups 32-128(%rcx),%xmm0
.byte 102,15,56,220,225
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
nop
.byte 102,15,56,220,233
movl %r9d,0+12(%rsp)
@@ -1223,7 +1222,7 @@ aesni_ctr32_encrypt_blocks:
bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,224
.byte 102,15,56,220,232
@@ -1237,7 +1236,7 @@ aesni_ctr32_encrypt_blocks:
bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,225
.byte 102,15,56,220,233
@@ -1251,7 +1250,7 @@ aesni_ctr32_encrypt_blocks:
bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,224
.byte 102,15,56,220,232
@@ -1265,7 +1264,7 @@ aesni_ctr32_encrypt_blocks:
bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,225
.byte 102,15,56,220,233
@@ -1279,7 +1278,7 @@ aesni_ctr32_encrypt_blocks:
bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,224
.byte 102,15,56,220,232
@@ -1293,7 +1292,7 @@ aesni_ctr32_encrypt_blocks:
bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,225
.byte 102,15,56,220,233
@@ -1308,7 +1307,7 @@ aesni_ctr32_encrypt_blocks:
.byte 102,15,56,220,208
.byte 102,15,56,220,216
.byte 102,15,56,220,224
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
movdqu 0(%rdi),%xmm10
.byte 102,15,56,220,232
movl %r9d,112+12(%rsp)
@@ -1543,7 +1542,7 @@ aesni_ctr32_encrypt_blocks:
.Lctr32_done:
xorps %xmm0,%xmm0
- xorl %r11d,%r11d
+ xorl %ebp,%ebp
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
@@ -1567,8 +1566,8 @@ aesni_ctr32_encrypt_blocks:
pxor %xmm14,%xmm14
movaps %xmm0,112(%rsp)
pxor %xmm15,%xmm15
- leaq (%rbp),%rsp
- popq %rbp
+ movq -8(%r11),%rbp
+ leaq (%r11),%rsp
.Lctr32_epilogue:
.byte 0xf3,0xc3
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
@@ -1577,11 +1576,10 @@ aesni_ctr32_encrypt_blocks:
.type aesni_xts_encrypt,@function
.align 16
aesni_xts_encrypt:
- leaq (%rsp),%rax
+ leaq (%rsp),%r11
pushq %rbp
subq $112,%rsp
andq $-16,%rsp
- leaq -8(%rax),%rbp
movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
@@ -1597,7 +1595,7 @@ aesni_xts_encrypt:
jnz .Loop_enc1_8
.byte 102,15,56,221,209
movups (%rcx),%xmm0
- movq %rcx,%r11
+ movq %rcx,%rbp
movl %r10d,%eax
shll $4,%r10d
movq %rdx,%r9
@@ -1653,9 +1651,9 @@ aesni_xts_encrypt:
jc .Lxts_enc_short
movl $16+96,%eax
- leaq 32(%r11,%r10,1),%rcx
+ leaq 32(%rbp,%r10,1),%rcx
subq %r10,%rax
- movups 16(%r11),%xmm1
+ movups 16(%rbp),%xmm1
movq %rax,%r10
leaq .Lxts_magic(%rip),%r8
jmp .Lxts_enc_grandloop
@@ -1680,7 +1678,7 @@ aesni_xts_encrypt:
movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
.byte 102,15,56,220,233
- movups 32(%r11),%xmm0
+ movups 32(%rbp),%xmm0
leaq 96(%rdi),%rdi
pxor %xmm8,%xmm7
@@ -1689,7 +1687,7 @@ aesni_xts_encrypt:
pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
.byte 102,15,56,220,249
- movups 48(%r11),%xmm1
+ movups 48(%rbp),%xmm1
pxor %xmm9,%xmm12
.byte 102,15,56,220,208
@@ -1704,7 +1702,7 @@ aesni_xts_encrypt:
movdqa %xmm14,64(%rsp)
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups 64(%r11),%xmm0
+ movups 64(%rbp),%xmm0
movdqa %xmm8,80(%rsp)
pshufd $0x5f,%xmm15,%xmm9
jmp .Lxts_enc_loop6
@@ -1736,7 +1734,7 @@ aesni_xts_encrypt:
psrad $31,%xmm14
.byte 102,15,56,220,217
pand %xmm8,%xmm14
- movups (%r11),%xmm10
+ movups (%rbp),%xmm10
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
@@ -1804,10 +1802,10 @@ aesni_xts_encrypt:
.byte 102,15,56,220,225
.byte 102,15,56,220,233
pxor %xmm0,%xmm15
- movups (%r11),%xmm0
+ movups (%rbp),%xmm0
.byte 102,15,56,220,241
.byte 102,15,56,220,249
- movups 16(%r11),%xmm1
+ movups 16(%rbp),%xmm1
pxor %xmm15,%xmm14
.byte 102,15,56,221,84,36,0
@@ -1834,7 +1832,7 @@ aesni_xts_encrypt:
movl $16+96,%eax
subl %r10d,%eax
- movq %r11,%rcx
+ movq %rbp,%rcx
shrl $4,%eax
.Lxts_enc_short:
@@ -1990,7 +1988,7 @@ aesni_xts_encrypt:
jnz .Lxts_enc_steal
subq %r9,%rsi
- movq %r11,%rcx
+ movq %rbp,%rcx
movl %r10d,%eax
movups -16(%rsi),%xmm2
@@ -2033,8 +2031,8 @@ aesni_xts_encrypt:
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
- leaq (%rbp),%rsp
- popq %rbp
+ movq -8(%r11),%rbp
+ leaq (%r11),%rsp
.Lxts_enc_epilogue:
.byte 0xf3,0xc3
.size aesni_xts_encrypt,.-aesni_xts_encrypt
@@ -2043,11 +2041,10 @@ aesni_xts_encrypt:
.type aesni_xts_decrypt,@function
.align 16
aesni_xts_decrypt:
- leaq (%rsp),%rax
+ leaq (%rsp),%r11
pushq %rbp
subq $112,%rsp
andq $-16,%rsp
- leaq -8(%rax),%rbp
movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
@@ -2069,7 +2066,7 @@ aesni_xts_decrypt:
subq %rax,%rdx
movups (%rcx),%xmm0
- movq %rcx,%r11
+ movq %rcx,%rbp
movl %r10d,%eax
shll $4,%r10d
movq %rdx,%r9
@@ -2125,9 +2122,9 @@ aesni_xts_decrypt:
jc .Lxts_dec_short
movl $16+96,%eax
- leaq 32(%r11,%r10,1),%rcx
+ leaq 32(%rbp,%r10,1),%rcx
subq %r10,%rax
- movups 16(%r11),%xmm1
+ movups 16(%rbp),%xmm1
movq %rax,%r10
leaq .Lxts_magic(%rip),%r8
jmp .Lxts_dec_grandloop
@@ -2152,7 +2149,7 @@ aesni_xts_decrypt:
movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
.byte 102,15,56,222,233
- movups 32(%r11),%xmm0
+ movups 32(%rbp),%xmm0
leaq 96(%rdi),%rdi
pxor %xmm8,%xmm7
@@ -2161,7 +2158,7 @@ aesni_xts_decrypt:
pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
.byte 102,15,56,222,249
- movups 48(%r11),%xmm1
+ movups 48(%rbp),%xmm1
pxor %xmm9,%xmm12
.byte 102,15,56,222,208
@@ -2176,7 +2173,7 @@ aesni_xts_decrypt:
movdqa %xmm14,64(%rsp)
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups 64(%r11),%xmm0
+ movups 64(%rbp),%xmm0
movdqa %xmm8,80(%rsp)
pshufd $0x5f,%xmm15,%xmm9
jmp .Lxts_dec_loop6
@@ -2208,7 +2205,7 @@ aesni_xts_decrypt:
psrad $31,%xmm14
.byte 102,15,56,222,217
pand %xmm8,%xmm14
- movups (%r11),%xmm10
+ movups (%rbp),%xmm10
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
@@ -2276,10 +2273,10 @@ aesni_xts_decrypt:
.byte 102,15,56,222,225
.byte 102,15,56,222,233
pxor %xmm0,%xmm15
- movups (%r11),%xmm0
+ movups (%rbp),%xmm0
.byte 102,15,56,222,241
.byte 102,15,56,222,249
- movups 16(%r11),%xmm1
+ movups 16(%rbp),%xmm1
pxor %xmm15,%xmm14
.byte 102,15,56,223,84,36,0
@@ -2306,7 +2303,7 @@ aesni_xts_decrypt:
movl $16+96,%eax
subl %r10d,%eax
- movq %r11,%rcx
+ movq %rbp,%rcx
shrl $4,%eax
.Lxts_dec_short:
@@ -2463,7 +2460,7 @@ aesni_xts_decrypt:
jz .Lxts_dec_ret
.Lxts_dec_done2:
movq %r9,%rdx
- movq %r11,%rcx
+ movq %rbp,%rcx
movl %r10d,%eax
movups (%rdi),%xmm2
@@ -2493,7 +2490,7 @@ aesni_xts_decrypt:
jnz .Lxts_dec_steal
subq %r9,%rsi
- movq %r11,%rcx
+ movq %rbp,%rcx
movl %r10d,%eax
movups (%rsi),%xmm2
@@ -2536,11 +2533,827 @@ aesni_xts_decrypt:
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
- leaq (%rbp),%rsp
- popq %rbp
+ movq -8(%r11),%rbp
+ leaq (%r11),%rsp
.Lxts_dec_epilogue:
.byte 0xf3,0xc3
.size aesni_xts_decrypt,.-aesni_xts_decrypt
+.globl aesni_ocb_encrypt
+.hidden aesni_ocb_encrypt
+.type aesni_ocb_encrypt,@function
+.align 32
+aesni_ocb_encrypt:
+ leaq (%rsp),%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ movq 8(%rax),%rbx
+ movq 8+8(%rax),%rbp
+
+ movl 240(%rcx),%r10d
+ movq %rcx,%r11
+ shll $4,%r10d
+ movups (%rcx),%xmm9
+ movups 16(%rcx,%r10,1),%xmm1
+
+ movdqu (%r9),%xmm15
+ pxor %xmm1,%xmm9
+ pxor %xmm1,%xmm15
+
+ movl $16+32,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ movups 16(%r11),%xmm1
+ subq %r10,%rax
+ movq %rax,%r10
+
+ movdqu (%rbx),%xmm10
+ movdqu (%rbp),%xmm8
+
+ testq $1,%r8
+ jnz .Locb_enc_odd
+
+ bsfq %r8,%r12
+ addq $1,%r8
+ shlq $4,%r12
+ movdqu (%rbx,%r12,1),%xmm7
+ movdqu (%rdi),%xmm2
+ leaq 16(%rdi),%rdi
+
+ call __ocb_encrypt1
+
+ movdqa %xmm7,%xmm15
+ movups %xmm2,(%rsi)
+ leaq 16(%rsi),%rsi
+ subq $1,%rdx
+ jz .Locb_enc_done
+
+.Locb_enc_odd:
+ leaq 1(%r8),%r12
+ leaq 3(%r8),%r13
+ leaq 5(%r8),%r14
+ leaq 6(%r8),%r8
+ bsfq %r12,%r12
+ bsfq %r13,%r13
+ bsfq %r14,%r14
+ shlq $4,%r12
+ shlq $4,%r13
+ shlq $4,%r14
+
+ subq $6,%rdx
+ jc .Locb_enc_short
+ jmp .Locb_enc_grandloop
+
+.align 32
+.Locb_enc_grandloop:
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ leaq 96(%rdi),%rdi
+
+ call __ocb_encrypt6
+
+ movups %xmm2,0(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ subq $6,%rdx
+ jnc .Locb_enc_grandloop
+
+.Locb_enc_short:
+ addq $6,%rdx
+ jz .Locb_enc_done
+
+ movdqu 0(%rdi),%xmm2
+ cmpq $2,%rdx
+ jb .Locb_enc_one
+ movdqu 16(%rdi),%xmm3
+ je .Locb_enc_two
+
+ movdqu 32(%rdi),%xmm4
+ cmpq $4,%rdx
+ jb .Locb_enc_three
+ movdqu 48(%rdi),%xmm5
+ je .Locb_enc_four
+
+ movdqu 64(%rdi),%xmm6
+ pxor %xmm7,%xmm7
+
+ call __ocb_encrypt6
+
+ movdqa %xmm14,%xmm15
+ movups %xmm2,0(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_one:
+ movdqa %xmm10,%xmm7
+
+ call __ocb_encrypt1
+
+ movdqa %xmm7,%xmm15
+ movups %xmm2,0(%rsi)
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_two:
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+
+ call __ocb_encrypt4
+
+ movdqa %xmm11,%xmm15
+ movups %xmm2,0(%rsi)
+ movups %xmm3,16(%rsi)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_three:
+ pxor %xmm5,%xmm5
+
+ call __ocb_encrypt4
+
+ movdqa %xmm12,%xmm15
+ movups %xmm2,0(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_four:
+ call __ocb_encrypt4
+
+ movdqa %xmm13,%xmm15
+ movups %xmm2,0(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+
+.Locb_enc_done:
+ pxor %xmm0,%xmm15
+ movdqu %xmm8,(%rbp)
+ movdqu %xmm15,(%r9)
+
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+ leaq 40(%rsp),%rax
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+.Locb_enc_epilogue:
+ .byte 0xf3,0xc3
+.size aesni_ocb_encrypt,.-aesni_ocb_encrypt
+
+.type __ocb_encrypt6,@function
+.align 32
+__ocb_encrypt6:
+ pxor %xmm9,%xmm15
+ movdqu (%rbx,%r12,1),%xmm11
+ movdqa %xmm10,%xmm12
+ movdqu (%rbx,%r13,1),%xmm13
+ movdqa %xmm10,%xmm14
+ pxor %xmm15,%xmm10
+ movdqu (%rbx,%r14,1),%xmm15
+ pxor %xmm10,%xmm11
+ pxor %xmm2,%xmm8
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm12
+ pxor %xmm3,%xmm8
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm13
+ pxor %xmm4,%xmm8
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm14
+ pxor %xmm5,%xmm8
+ pxor %xmm13,%xmm5
+ pxor %xmm14,%xmm15
+ pxor %xmm6,%xmm8
+ pxor %xmm14,%xmm6
+ pxor %xmm7,%xmm8
+ pxor %xmm15,%xmm7
+ movups 32(%r11),%xmm0
+
+ leaq 1(%r8),%r12
+ leaq 3(%r8),%r13
+ leaq 5(%r8),%r14
+ addq $6,%r8
+ pxor %xmm9,%xmm10
+ bsfq %r12,%r12
+ bsfq %r13,%r13
+ bsfq %r14,%r14
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ pxor %xmm9,%xmm11
+ pxor %xmm9,%xmm12
+.byte 102,15,56,220,241
+ pxor %xmm9,%xmm13
+ pxor %xmm9,%xmm14
+.byte 102,15,56,220,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm15
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups 64(%r11),%xmm0
+ shlq $4,%r12
+ shlq $4,%r13
+ jmp .Locb_enc_loop6
+
+.align 32
+.Locb_enc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Locb_enc_loop6
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%r11),%xmm1
+ shlq $4,%r14
+
+.byte 102,65,15,56,221,210
+ movdqu (%rbx),%xmm10
+ movq %r10,%rax
+.byte 102,65,15,56,221,219
+.byte 102,65,15,56,221,228
+.byte 102,65,15,56,221,237
+.byte 102,65,15,56,221,246
+.byte 102,65,15,56,221,255
+ .byte 0xf3,0xc3
+.size __ocb_encrypt6,.-__ocb_encrypt6
+
+.type __ocb_encrypt4,@function
+.align 32
+__ocb_encrypt4:
+ pxor %xmm9,%xmm15
+ movdqu (%rbx,%r12,1),%xmm11
+ movdqa %xmm10,%xmm12
+ movdqu (%rbx,%r13,1),%xmm13
+ pxor %xmm15,%xmm10
+ pxor %xmm10,%xmm11
+ pxor %xmm2,%xmm8
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm12
+ pxor %xmm3,%xmm8
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm13
+ pxor %xmm4,%xmm8
+ pxor %xmm12,%xmm4
+ pxor %xmm5,%xmm8
+ pxor %xmm13,%xmm5
+ movups 32(%r11),%xmm0
+
+ pxor %xmm9,%xmm10
+ pxor %xmm9,%xmm11
+ pxor %xmm9,%xmm12
+ pxor %xmm9,%xmm13
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups 48(%r11),%xmm1
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups 64(%r11),%xmm0
+ jmp .Locb_enc_loop4
+
+.align 32
+.Locb_enc_loop4:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Locb_enc_loop4
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups 16(%r11),%xmm1
+ movq %r10,%rax
+
+.byte 102,65,15,56,221,210
+.byte 102,65,15,56,221,219
+.byte 102,65,15,56,221,228
+.byte 102,65,15,56,221,237
+ .byte 0xf3,0xc3
+.size __ocb_encrypt4,.-__ocb_encrypt4
+
+.type __ocb_encrypt1,@function
+.align 32
+__ocb_encrypt1:
+ pxor %xmm15,%xmm7
+ pxor %xmm9,%xmm7
+ pxor %xmm2,%xmm8
+ pxor %xmm7,%xmm2
+ movups 32(%r11),%xmm0
+
+.byte 102,15,56,220,209
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm7
+
+.byte 102,15,56,220,208
+ movups 64(%r11),%xmm0
+ jmp .Locb_enc_loop1
+
+.align 32
+.Locb_enc_loop1:
+.byte 102,15,56,220,209
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
+.byte 102,15,56,220,208
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Locb_enc_loop1
+
+.byte 102,15,56,220,209
+ movups 16(%r11),%xmm1
+ movq %r10,%rax
+
+.byte 102,15,56,221,215
+ .byte 0xf3,0xc3
+.size __ocb_encrypt1,.-__ocb_encrypt1
+
+.globl aesni_ocb_decrypt
+.hidden aesni_ocb_decrypt
+.type aesni_ocb_decrypt,@function
+.align 32
+aesni_ocb_decrypt:
+ leaq (%rsp),%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ movq 8(%rax),%rbx
+ movq 8+8(%rax),%rbp
+
+ movl 240(%rcx),%r10d
+ movq %rcx,%r11
+ shll $4,%r10d
+ movups (%rcx),%xmm9
+ movups 16(%rcx,%r10,1),%xmm1
+
+ movdqu (%r9),%xmm15
+ pxor %xmm1,%xmm9
+ pxor %xmm1,%xmm15
+
+ movl $16+32,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ movups 16(%r11),%xmm1
+ subq %r10,%rax
+ movq %rax,%r10
+
+ movdqu (%rbx),%xmm10
+ movdqu (%rbp),%xmm8
+
+ testq $1,%r8
+ jnz .Locb_dec_odd
+
+ bsfq %r8,%r12
+ addq $1,%r8
+ shlq $4,%r12
+ movdqu (%rbx,%r12,1),%xmm7
+ movdqu (%rdi),%xmm2
+ leaq 16(%rdi),%rdi
+
+ call __ocb_decrypt1
+
+ movdqa %xmm7,%xmm15
+ movups %xmm2,(%rsi)
+ xorps %xmm2,%xmm8
+ leaq 16(%rsi),%rsi
+ subq $1,%rdx
+ jz .Locb_dec_done
+
+.Locb_dec_odd:
+ leaq 1(%r8),%r12
+ leaq 3(%r8),%r13
+ leaq 5(%r8),%r14
+ leaq 6(%r8),%r8
+ bsfq %r12,%r12
+ bsfq %r13,%r13
+ bsfq %r14,%r14
+ shlq $4,%r12
+ shlq $4,%r13
+ shlq $4,%r14
+
+ subq $6,%rdx
+ jc .Locb_dec_short
+ jmp .Locb_dec_grandloop
+
+.align 32
+.Locb_dec_grandloop:
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ leaq 96(%rdi),%rdi
+
+ call __ocb_decrypt6
+
+ movups %xmm2,0(%rsi)
+ pxor %xmm2,%xmm8
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm8
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm8
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm8
+ movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm8
+ movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm8
+ leaq 96(%rsi),%rsi
+ subq $6,%rdx
+ jnc .Locb_dec_grandloop
+
+.Locb_dec_short:
+ addq $6,%rdx
+ jz .Locb_dec_done
+
+ movdqu 0(%rdi),%xmm2
+ cmpq $2,%rdx
+ jb .Locb_dec_one
+ movdqu 16(%rdi),%xmm3
+ je .Locb_dec_two
+
+ movdqu 32(%rdi),%xmm4
+ cmpq $4,%rdx
+ jb .Locb_dec_three
+ movdqu 48(%rdi),%xmm5
+ je .Locb_dec_four
+
+ movdqu 64(%rdi),%xmm6
+ pxor %xmm7,%xmm7
+
+ call __ocb_decrypt6
+
+ movdqa %xmm14,%xmm15
+ movups %xmm2,0(%rsi)
+ pxor %xmm2,%xmm8
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm8
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm8
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm8
+ movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm8
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_one:
+ movdqa %xmm10,%xmm7
+
+ call __ocb_decrypt1
+
+ movdqa %xmm7,%xmm15
+ movups %xmm2,0(%rsi)
+ xorps %xmm2,%xmm8
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_two:
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+
+ call __ocb_decrypt4
+
+ movdqa %xmm11,%xmm15
+ movups %xmm2,0(%rsi)
+ xorps %xmm2,%xmm8
+ movups %xmm3,16(%rsi)
+ xorps %xmm3,%xmm8
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_three:
+ pxor %xmm5,%xmm5
+
+ call __ocb_decrypt4
+
+ movdqa %xmm12,%xmm15
+ movups %xmm2,0(%rsi)
+ xorps %xmm2,%xmm8
+ movups %xmm3,16(%rsi)
+ xorps %xmm3,%xmm8
+ movups %xmm4,32(%rsi)
+ xorps %xmm4,%xmm8
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_four:
+ call __ocb_decrypt4
+
+ movdqa %xmm13,%xmm15
+ movups %xmm2,0(%rsi)
+ pxor %xmm2,%xmm8
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm8
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm8
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm8
+
+.Locb_dec_done:
+ pxor %xmm0,%xmm15
+ movdqu %xmm8,(%rbp)
+ movdqu %xmm15,(%r9)
+
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+ leaq 40(%rsp),%rax
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+.Locb_dec_epilogue:
+ .byte 0xf3,0xc3
+.size aesni_ocb_decrypt,.-aesni_ocb_decrypt
+
+.type __ocb_decrypt6,@function
+.align 32
+__ocb_decrypt6:
+ pxor %xmm9,%xmm15
+ movdqu (%rbx,%r12,1),%xmm11
+ movdqa %xmm10,%xmm12
+ movdqu (%rbx,%r13,1),%xmm13
+ movdqa %xmm10,%xmm14
+ pxor %xmm15,%xmm10
+ movdqu (%rbx,%r14,1),%xmm15
+ pxor %xmm10,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm12
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm13
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm14
+ pxor %xmm13,%xmm5
+ pxor %xmm14,%xmm15
+ pxor %xmm14,%xmm6
+ pxor %xmm15,%xmm7
+ movups 32(%r11),%xmm0
+
+ leaq 1(%r8),%r12
+ leaq 3(%r8),%r13
+ leaq 5(%r8),%r14
+ addq $6,%r8
+ pxor %xmm9,%xmm10
+ bsfq %r12,%r12
+ bsfq %r13,%r13
+ bsfq %r14,%r14
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm9,%xmm11
+ pxor %xmm9,%xmm12
+.byte 102,15,56,222,241
+ pxor %xmm9,%xmm13
+ pxor %xmm9,%xmm14
+.byte 102,15,56,222,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm15
+
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups 64(%r11),%xmm0
+ shlq $4,%r12
+ shlq $4,%r13
+ jmp .Locb_dec_loop6
+
+.align 32
+.Locb_dec_loop6:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Locb_dec_loop6
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%r11),%xmm1
+ shlq $4,%r14
+
+.byte 102,65,15,56,223,210
+ movdqu (%rbx),%xmm10
+ movq %r10,%rax
+.byte 102,65,15,56,223,219
+.byte 102,65,15,56,223,228
+.byte 102,65,15,56,223,237
+.byte 102,65,15,56,223,246
+.byte 102,65,15,56,223,255
+ .byte 0xf3,0xc3
+.size __ocb_decrypt6,.-__ocb_decrypt6
+
+.type __ocb_decrypt4,@function
+.align 32
+__ocb_decrypt4:
+ pxor %xmm9,%xmm15
+ movdqu (%rbx,%r12,1),%xmm11
+ movdqa %xmm10,%xmm12
+ movdqu (%rbx,%r13,1),%xmm13
+ pxor %xmm15,%xmm10
+ pxor %xmm10,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm12
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm13
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm5
+ movups 32(%r11),%xmm0
+
+ pxor %xmm9,%xmm10
+ pxor %xmm9,%xmm11
+ pxor %xmm9,%xmm12
+ pxor %xmm9,%xmm13
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups 48(%r11),%xmm1
+
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups 64(%r11),%xmm0
+ jmp .Locb_dec_loop4
+
+.align 32
+.Locb_dec_loop4:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Locb_dec_loop4
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups 16(%r11),%xmm1
+ movq %r10,%rax
+
+.byte 102,65,15,56,223,210
+.byte 102,65,15,56,223,219
+.byte 102,65,15,56,223,228
+.byte 102,65,15,56,223,237
+ .byte 0xf3,0xc3
+.size __ocb_decrypt4,.-__ocb_decrypt4
+
+.type __ocb_decrypt1,@function
+.align 32
+__ocb_decrypt1:
+ pxor %xmm15,%xmm7
+ pxor %xmm9,%xmm7
+ pxor %xmm7,%xmm2
+ movups 32(%r11),%xmm0
+
+.byte 102,15,56,222,209
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm7
+
+.byte 102,15,56,222,208
+ movups 64(%r11),%xmm0
+ jmp .Locb_dec_loop1
+
+.align 32
+.Locb_dec_loop1:
+.byte 102,15,56,222,209
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
+.byte 102,15,56,222,208
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Locb_dec_loop1
+
+.byte 102,15,56,222,209
+ movups 16(%r11),%xmm1
+ movq %r10,%rax
+
+.byte 102,15,56,223,215
+ .byte 0xf3,0xc3
+.size __ocb_decrypt1,.-__ocb_decrypt1
.globl aesni_cbc_encrypt
.hidden aesni_cbc_encrypt
.type aesni_cbc_encrypt,@function
@@ -2638,11 +3451,11 @@ aesni_cbc_encrypt:
jmp .Lcbc_ret
.align 16
.Lcbc_decrypt_bulk:
- leaq (%rsp),%rax
+ leaq (%rsp),%r11
pushq %rbp
subq $16,%rsp
andq $-16,%rsp
- leaq -8(%rax),%rbp
+ movq %rcx,%rbp
movups (%r8),%xmm10
movl %r10d,%eax
cmpq $0x50,%rdx
@@ -2682,7 +3495,7 @@ aesni_cbc_encrypt:
pxor %xmm0,%xmm3
movups 16-112(%rcx),%xmm1
pxor %xmm0,%xmm4
- xorq %r11,%r11
+ movq $-1,%rbp
cmpq $0x70,%rdx
pxor %xmm0,%xmm5
pxor %xmm0,%xmm6
@@ -2698,10 +3511,10 @@ aesni_cbc_encrypt:
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
- setnc %r11b
- shlq $7,%r11
+ adcq $0,%rbp
+ andq $128,%rbp
.byte 102,68,15,56,222,201
- addq %rdi,%r11
+ addq %rdi,%rbp
movups 48-112(%rcx),%xmm1
.byte 102,15,56,222,208
.byte 102,15,56,222,216
@@ -2839,18 +3652,18 @@ aesni_cbc_encrypt:
movdqu 112(%rdi),%xmm0
.byte 102,65,15,56,223,228
leaq 128(%rdi),%rdi
- movdqu 0(%r11),%xmm11
+ movdqu 0(%rbp),%xmm11
.byte 102,65,15,56,223,237
.byte 102,65,15,56,223,246
- movdqu 16(%r11),%xmm12
- movdqu 32(%r11),%xmm13
+ movdqu 16(%rbp),%xmm12
+ movdqu 32(%rbp),%xmm13
.byte 102,65,15,56,223,255
.byte 102,68,15,56,223,193
- movdqu 48(%r11),%xmm14
- movdqu 64(%r11),%xmm15
+ movdqu 48(%rbp),%xmm14
+ movdqu 64(%rbp),%xmm15
.byte 102,69,15,56,223,202
movdqa %xmm0,%xmm10
- movdqu 80(%r11),%xmm1
+ movdqu 80(%rbp),%xmm1
movups -112(%rcx),%xmm0
movups %xmm2,(%rsi)
@@ -2969,7 +3782,7 @@ aesni_cbc_encrypt:
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm14,%xmm6
- movq %r11,%rcx
+ movq %rbp,%rcx
movdqu %xmm5,48(%rsi)
pxor %xmm15,%xmm7
movl %r10d,%eax
@@ -3122,8 +3935,8 @@ aesni_cbc_encrypt:
.Lcbc_dec_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
- leaq (%rbp),%rsp
- popq %rbp
+ movq -8(%r11),%rbp
+ leaq (%r11),%rsp
.Lcbc_ret:
.byte 0xf3,0xc3
.size aesni_cbc_encrypt,.-aesni_cbc_encrypt
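
The newly added aesni_ocb_encrypt/aesni_ocb_decrypt routines above maintain the per-block OCB offset with bsfq, which computes the number of trailing zero bits (ntz) of the block index; shlq $4 then scales that into a 16-byte slot of the mask table passed in %rbx. A hedged C sketch of that offset schedule; the block type, table layout, and names are assumptions for illustration, not BoringSSL's interface:

    #include <stdint.h>

    typedef struct { uint64_t lo, hi; } ocb_block;  /* one 128-bit offset */

    /* Offset update for 1-based block number i: offset ^= L[ntz(i)].
     * __builtin_ctzll is the C counterpart of bsfq, and the L[ntz]
     * lookup corresponds to "bsfq %r12,%r12; shlq $4,%r12;
     * movdqu (%rbx,%r12,1),..." in the assembly above. */
    static ocb_block ocb_next_offset(ocb_block offset, const ocb_block *L,
                                     uint64_t i) {
        unsigned ntz = (unsigned)__builtin_ctzll(i);  /* requires i != 0 */
        offset.lo ^= L[ntz].lo;
        offset.hi ^= L[ntz].hi;
        return offset;
    }
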
diff --git a/linux-x86_64/crypto/aes/bsaes-x86_64.S b/linux-x86_64/crypto/aes/bsaes-x86_64.S
index c5491ce4..3f3c73bb 100644
--- a/linux-x86_64/crypto/aes/bsaes-x86_64.S
+++ b/linux-x86_64/crypto/aes/bsaes-x86_64.S
@@ -1305,15 +1305,14 @@ bsaes_cbc_encrypt:
cmpq %rax,%rbp
ja .Lcbc_dec_bzero
- leaq (%rbp),%rsp
- movq 72(%rsp),%r15
- movq 80(%rsp),%r14
- movq 88(%rsp),%r13
- movq 96(%rsp),%r12
- movq 104(%rsp),%rbx
- movq 112(%rsp),%rax
- leaq 120(%rsp),%rsp
- movq %rax,%rbp
+ leaq 120(%rbp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbx
+ movq -8(%rax),%rbp
+ leaq (%rax),%rsp
.Lcbc_dec_epilogue:
.byte 0xf3,0xc3
.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
@@ -1506,15 +1505,14 @@ bsaes_ctr32_encrypt_blocks:
cmpq %rax,%rbp
ja .Lctr_enc_bzero
- leaq (%rbp),%rsp
- movq 72(%rsp),%r15
- movq 80(%rsp),%r14
- movq 88(%rsp),%r13
- movq 96(%rsp),%r12
- movq 104(%rsp),%rbx
- movq 112(%rsp),%rax
- leaq 120(%rsp),%rsp
- movq %rax,%rbp
+ leaq 120(%rbp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbx
+ movq -8(%rax),%rbp
+ leaq (%rax),%rsp
.Lctr_enc_epilogue:
.byte 0xf3,0xc3
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
@@ -1958,15 +1956,14 @@ bsaes_xts_encrypt:
cmpq %rax,%rbp
ja .Lxts_enc_bzero
- leaq (%rbp),%rsp
- movq 72(%rsp),%r15
- movq 80(%rsp),%r14
- movq 88(%rsp),%r13
- movq 96(%rsp),%r12
- movq 104(%rsp),%rbx
- movq 112(%rsp),%rax
- leaq 120(%rsp),%rsp
- movq %rax,%rbp
+ leaq 120(%rbp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbx
+ movq -8(%rax),%rbp
+ leaq (%rax),%rsp
.Lxts_enc_epilogue:
.byte 0xf3,0xc3
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
@@ -2437,15 +2434,14 @@ bsaes_xts_decrypt:
cmpq %rax,%rbp
ja .Lxts_dec_bzero
- leaq (%rbp),%rsp
- movq 72(%rsp),%r15
- movq 80(%rsp),%r14
- movq 88(%rsp),%r13
- movq 96(%rsp),%r12
- movq 104(%rsp),%rbx
- movq 112(%rsp),%rax
- leaq 120(%rsp),%rsp
- movq %rax,%rbp
+ leaq 120(%rbp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbx
+ movq -8(%rax),%rbp
+ leaq (%rax),%rsp
.Lxts_dec_epilogue:
.byte 0xf3,0xc3
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
diff --git a/linux-x86_64/crypto/bn/x86_64-mont.S b/linux-x86_64/crypto/bn/x86_64-mont.S
index 83926ad7..0d2cea2e 100644
--- a/linux-x86_64/crypto/bn/x86_64-mont.S
+++ b/linux-x86_64/crypto/bn/x86_64-mont.S
@@ -9,6 +9,10 @@
.type bn_mul_mont,@function
.align 16
bn_mul_mont:
+.cfi_startproc
+ movl %r9d,%r9d
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
testl $3,%r9d
jnz .Lmul_enter
cmpl $8,%r9d
@@ -22,20 +26,50 @@ bn_mul_mont:
.align 16
.Lmul_enter:
pushq %rbx
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_offset %r12,-32
pushq %r13
+.cfi_offset %r13,-40
pushq %r14
+.cfi_offset %r14,-48
pushq %r15
+.cfi_offset %r15,-56
- movl %r9d,%r9d
- leaq 2(%r9),%r10
+ negq %r9
movq %rsp,%r11
- negq %r10
- leaq (%rsp,%r10,8),%rsp
- andq $-1024,%rsp
+ leaq -16(%rsp,%r9,8),%r10
+ negq %r9
+ andq $-1024,%r10
+
+
- movq %r11,8(%rsp,%r9,8)
+
+
+
+
+
+
+ subq %r10,%r11
+ andq $-4096,%r11
+ leaq (%r10,%r11,1),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul_page_walk
+ jmp .Lmul_page_walk_done
+
+.align 16
+.Lmul_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul_page_walk
+.Lmul_page_walk_done:
+
+ movq %rax,8(%rsp,%r9,8)
+.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:
movq %rdx,%r12
movq (%r8),%r8
@@ -187,51 +221,86 @@ bn_mul_mont:
sbbq $0,%rax
xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
movq %r9,%r15
+ orq %rcx,%rsi
.align 16
.Lcopy:
- movq (%rsp,%r14,8),%rsi
- movq (%rdi,%r14,8),%rcx
- xorq %rcx,%rsi
- andq %rax,%rsi
- xorq %rcx,%rsi
+ movq (%rsi,%r14,8),%rax
movq %r14,(%rsp,%r14,8)
- movq %rsi,(%rdi,%r14,8)
+ movq %rax,(%rdi,%r14,8)
leaq 1(%r14),%r14
subq $1,%r15
jnz .Lcopy
movq 8(%rsp,%r9,8),%rsi
+.cfi_def_cfa %rsi,8
movq $1,%rax
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lmul_epilogue:
.byte 0xf3,0xc3
+.cfi_endproc
.size bn_mul_mont,.-bn_mul_mont
.type bn_mul4x_mont,@function
.align 16
bn_mul4x_mont:
+.cfi_startproc
+ movl %r9d,%r9d
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
.Lmul4x_enter:
pushq %rbx
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_offset %r12,-32
pushq %r13
+.cfi_offset %r13,-40
pushq %r14
+.cfi_offset %r14,-48
pushq %r15
+.cfi_offset %r15,-56
- movl %r9d,%r9d
- leaq 4(%r9),%r10
+ negq %r9
movq %rsp,%r11
- negq %r10
- leaq (%rsp,%r10,8),%rsp
- andq $-1024,%rsp
+ leaq -32(%rsp,%r9,8),%r10
+ negq %r9
+ andq $-1024,%r10
- movq %r11,8(%rsp,%r9,8)
+ subq %r10,%r11
+ andq $-4096,%r11
+ leaq (%r10,%r11,1),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul4x_page_walk
+ jmp .Lmul4x_page_walk_done
+
+.Lmul4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+
+ movq %rax,8(%rsp,%r9,8)
+.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul4x_body:
movq %rdi,16(%rsp,%r9,8)
movq %rdx,%r12
@@ -531,9 +600,11 @@ bn_mul4x_mont:
cmpq %r9,%r14
jb .Louter4x
movq 16(%rsp,%r9,8),%rdi
+ leaq -4(%r9),%r15
movq 0(%rsp),%rax
+ pxor %xmm0,%xmm0
movq 8(%rsp),%rdx
- shrq $2,%r9
+ shrq $2,%r15
leaq (%rsp),%rsi
xorq %r14,%r14
@@ -541,7 +612,6 @@ bn_mul4x_mont:
movq 16(%rsi),%rbx
movq 24(%rsi),%rbp
sbbq 8(%rcx),%rdx
- leaq -1(%r9),%r15
jmp .Lsub4x
.align 16
.Lsub4x:
@@ -569,47 +639,55 @@ bn_mul4x_mont:
movq %rbx,16(%rdi,%r14,8)
sbbq $0,%rax
- movq %rax,%xmm0
- punpcklqdq %xmm0,%xmm0
movq %rbp,24(%rdi,%r14,8)
xorq %r14,%r14
-
- movq %r9,%r15
- pxor %xmm5,%xmm5
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ leaq -4(%r9),%r15
+ orq %rcx,%rsi
+ shrq $2,%r15
+
+ movdqu (%rsi),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,(%rdi)
jmp .Lcopy4x
.align 16
.Lcopy4x:
- movdqu (%rsp,%r14,1),%xmm2
- movdqu 16(%rsp,%r14,1),%xmm4
- movdqu (%rdi,%r14,1),%xmm1
- movdqu 16(%rdi,%r14,1),%xmm3
- pxor %xmm1,%xmm2
- pxor %xmm3,%xmm4
- pand %xmm0,%xmm2
- pand %xmm0,%xmm4
- pxor %xmm1,%xmm2
- pxor %xmm3,%xmm4
- movdqu %xmm2,(%rdi,%r14,1)
- movdqu %xmm4,16(%rdi,%r14,1)
- movdqa %xmm5,(%rsp,%r14,1)
- movdqa %xmm5,16(%rsp,%r14,1)
-
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqu 32(%rsi,%r14,1),%xmm1
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movdqa %xmm0,32(%rsp,%r14,1)
+ movdqu %xmm1,32(%rdi,%r14,1)
leaq 32(%r14),%r14
decq %r15
jnz .Lcopy4x
- shlq $2,%r9
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
movq 8(%rsp,%r9,8),%rsi
+.cfi_def_cfa %rsi, 8
movq $1,%rax
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lmul4x_epilogue:
.byte 0xf3,0xc3
+.cfi_endproc
.size bn_mul4x_mont,.-bn_mul4x_mont
.extern bn_sqr8x_internal
.hidden bn_sqr8x_internal
@@ -617,14 +695,23 @@ bn_mul4x_mont:
.type bn_sqr8x_mont,@function
.align 32
bn_sqr8x_mont:
-.Lsqr8x_enter:
+.cfi_startproc
movq %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lsqr8x_enter:
pushq %rbx
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_offset %r12,-32
pushq %r13
+.cfi_offset %r13,-40
pushq %r14
+.cfi_offset %r14,-48
pushq %r15
+.cfi_offset %r15,-56
+.Lsqr8x_prologue:
movl %r9d,%r10d
shll $3,%r9d
@@ -637,30 +724,49 @@ bn_sqr8x_mont:
leaq -64(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
movq (%r8),%r8
subq %rsi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lsqr8x_sp_alt
- subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ subq %r11,%rbp
+ leaq -64(%rbp,%r9,2),%rbp
jmp .Lsqr8x_sp_done
.align 32
.Lsqr8x_sp_alt:
leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -64(%rbp,%r9,2),%rbp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
- subq %r11,%rsp
+ subq %r11,%rbp
.Lsqr8x_sp_done:
- andq $-64,%rsp
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lsqr8x_page_walk
+ jmp .Lsqr8x_page_walk_done
+
+.align 16
+.Lsqr8x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lsqr8x_page_walk
+.Lsqr8x_page_walk_done:
+
movq %r9,%r10
negq %r9
movq %r8,32(%rsp)
movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lsqr8x_body:
.byte 102,72,15,110,209
@@ -707,6 +813,7 @@ bn_sqr8x_mont:
pxor %xmm0,%xmm0
pshufd $0,%xmm1,%xmm1
movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
jmp .Lsqr8x_cond_copy
.align 32
@@ -736,14 +843,22 @@ bn_sqr8x_mont:
movq $1,%rax
movq -48(%rsi),%r15
+.cfi_restore %r15
movq -40(%rsi),%r14
+.cfi_restore %r14
movq -32(%rsi),%r13
+.cfi_restore %r13
movq -24(%rsi),%r12
+.cfi_restore %r12
movq -16(%rsi),%rbp
+.cfi_restore %rbp
movq -8(%rsi),%rbx
+.cfi_restore %rbx
leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lsqr8x_epilogue:
.byte 0xf3,0xc3
+.cfi_endproc
.size bn_sqr8x_mont,.-bn_sqr8x_mont
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 16
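
The rewritten .Lcopy tail above replaces the old per-word XOR masking with a branch-free pointer select: the borrow from the final subtraction of the modulus becomes a 0/all-ones mask, the andq/notq/orq sequence picks either the scratch buffer or the destination as the copy source, and a plain loop then copies while clobbering the scratch words. A rough C equivalent of that select, with hypothetical names (not BoringSSL's API):

    #include <stddef.h>
    #include <stdint.h>

    /* mask is 0 (keep the subtracted result already written to rp) or
     * ~0 (keep the pre-subtraction value held in tmp), as produced by
     * the "sbbq $0,%rax" above.  The source pointer is chosen without
     * a data-dependent branch, so the copy is constant-time. */
    static void mont_cond_copy(uint64_t *rp, uint64_t *tmp, size_t n,
                               uintptr_t mask) {
        const uint64_t *src =
            (const uint64_t *)(((uintptr_t)tmp & mask) |
                               ((uintptr_t)rp & ~mask));
        for (size_t i = 0; i < n; i++) {
            uint64_t w = src[i];
            tmp[i] = (uint64_t)i;  /* the asm overwrites scratch with the counter */
            rp[i] = w;
        }
    }
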
diff --git a/linux-x86_64/crypto/bn/x86_64-mont5.S b/linux-x86_64/crypto/bn/x86_64-mont5.S
index 5d7502c3..33ca3c43 100644
--- a/linux-x86_64/crypto/bn/x86_64-mont5.S
+++ b/linux-x86_64/crypto/bn/x86_64-mont5.S
@@ -9,30 +9,64 @@
.type bn_mul_mont_gather5,@function
.align 64
bn_mul_mont_gather5:
+.cfi_startproc
+ movl %r9d,%r9d
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
testl $7,%r9d
jnz .Lmul_enter
jmp .Lmul4x_enter
.align 16
.Lmul_enter:
- movl %r9d,%r9d
- movq %rsp,%rax
movd 8(%rsp),%xmm5
- leaq .Linc(%rip),%r10
pushq %rbx
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_offset %r12,-32
pushq %r13
+.cfi_offset %r13,-40
pushq %r14
+.cfi_offset %r14,-48
pushq %r15
+.cfi_offset %r15,-56
+
+ negq %r9
+ movq %rsp,%r11
+ leaq -280(%rsp,%r9,8),%r10
+ negq %r9
+ andq $-1024,%r10
+
+
- leaq 2(%r9),%r11
- negq %r11
- leaq -264(%rsp,%r11,8),%rsp
- andq $-1024,%rsp
+
+
+
+
+
+ subq %r10,%r11
+ andq $-4096,%r11
+ leaq (%r10,%r11,1),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul_page_walk
+ jmp .Lmul_page_walk_done
+
+.Lmul_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul_page_walk
+.Lmul_page_walk_done:
+
+ leaq .Linc(%rip),%r10
movq %rax,8(%rsp,%r9,8)
+.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:
+
leaq 128(%rdx),%r12
movdqa 0(%r10),%xmm0
movdqa 16(%r10),%xmm1
@@ -371,45 +405,64 @@ bn_mul_mont_gather5:
sbbq $0,%rax
xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
movq %r9,%r15
+ orq %rcx,%rsi
.align 16
.Lcopy:
- movq (%rsp,%r14,8),%rsi
- movq (%rdi,%r14,8),%rcx
- xorq %rcx,%rsi
- andq %rax,%rsi
- xorq %rcx,%rsi
+ movq (%rsi,%r14,8),%rax
movq %r14,(%rsp,%r14,8)
- movq %rsi,(%rdi,%r14,8)
+ movq %rax,(%rdi,%r14,8)
leaq 1(%r14),%r14
subq $1,%r15
jnz .Lcopy
movq 8(%rsp,%r9,8),%rsi
+.cfi_def_cfa %rsi,8
movq $1,%rax
movq -48(%rsi),%r15
+.cfi_restore %r15
movq -40(%rsi),%r14
+.cfi_restore %r14
movq -32(%rsi),%r13
+.cfi_restore %r13
movq -24(%rsi),%r12
+.cfi_restore %r12
movq -16(%rsi),%rbp
+.cfi_restore %rbp
movq -8(%rsi),%rbx
+.cfi_restore %rbx
leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lmul_epilogue:
.byte 0xf3,0xc3
+.cfi_endproc
.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
.type bn_mul4x_mont_gather5,@function
.align 32
bn_mul4x_mont_gather5:
-.Lmul4x_enter:
+.cfi_startproc
.byte 0x67
movq %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lmul4x_enter:
pushq %rbx
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_offset %r12,-32
pushq %r13
+.cfi_offset %r13,-40
pushq %r14
+.cfi_offset %r14,-48
pushq %r15
+.cfi_offset %r15,-56
+.Lmul4x_prologue:
.byte 0x67
shll $3,%r9d
@@ -426,43 +479,70 @@ bn_mul4x_mont_gather5:
leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lmul4xsp_alt
- subq %r11,%rsp
- leaq -320(%rsp,%r9,2),%rsp
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
jmp .Lmul4xsp_done
.align 32
.Lmul4xsp_alt:
leaq 4096-320(,%r9,2),%r10
- leaq -320(%rsp,%r9,2),%rsp
+ leaq -320(%rbp,%r9,2),%rbp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
- subq %r11,%rsp
+ subq %r11,%rbp
.Lmul4xsp_done:
- andq $-64,%rsp
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmul4x_page_walk
+ jmp .Lmul4x_page_walk_done
+
+.Lmul4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+
negq %r9
movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmul4x_body:
call mul4x_internal
movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
movq $1,%rax
movq -48(%rsi),%r15
+.cfi_restore %r15
movq -40(%rsi),%r14
+.cfi_restore %r14
movq -32(%rsi),%r13
+.cfi_restore %r13
movq -24(%rsi),%r12
+.cfi_restore %r12
movq -16(%rsi),%rbp
+.cfi_restore %rbp
movq -8(%rsi),%rbx
+.cfi_restore %rbx
leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lmul4x_epilogue:
.byte 0xf3,0xc3
+.cfi_endproc
.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
.type mul4x_internal,@function
@@ -995,13 +1075,22 @@ mul4x_internal:
.type bn_power5,@function
.align 32
bn_power5:
+.cfi_startproc
movq %rsp,%rax
+.cfi_def_cfa_register %rax
pushq %rbx
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_offset %r12,-32
pushq %r13
+.cfi_offset %r13,-40
pushq %r14
+.cfi_offset %r14,-48
pushq %r15
+.cfi_offset %r15,-56
+.Lpower5_prologue:
shll $3,%r9d
leal (%r9,%r9,2),%r10d
@@ -1016,24 +1105,41 @@ bn_power5:
leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lpwr_sp_alt
- subq %r11,%rsp
- leaq -320(%rsp,%r9,2),%rsp
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
jmp .Lpwr_sp_done
.align 32
.Lpwr_sp_alt:
leaq 4096-320(,%r9,2),%r10
- leaq -320(%rsp,%r9,2),%rsp
+ leaq -320(%rbp,%r9,2),%rbp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
- subq %r11,%rsp
+ subq %r11,%rbp
.Lpwr_sp_done:
- andq $-64,%rsp
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lpwr_page_walk
+ jmp .Lpwr_page_walk_done
+
+.Lpwr_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lpwr_page_walk
+.Lpwr_page_walk_done:
+
movq %r9,%r10
negq %r9
@@ -1048,6 +1154,7 @@ bn_power5:
movq %r8,32(%rsp)
movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpower5_body:
.byte 102,72,15,110,207
.byte 102,72,15,110,209
@@ -1074,16 +1181,25 @@ bn_power5:
call mul4x_internal
movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
movq $1,%rax
movq -48(%rsi),%r15
+.cfi_restore %r15
movq -40(%rsi),%r14
+.cfi_restore %r14
movq -32(%rsi),%r13
+.cfi_restore %r13
movq -24(%rsi),%r12
+.cfi_restore %r12
movq -16(%rsi),%rbp
+.cfi_restore %rbp
movq -8(%rsi),%rbx
+.cfi_restore %rbx
leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lpower5_epilogue:
.byte 0xf3,0xc3
+.cfi_endproc
.size bn_power5,.-bn_power5
.globl bn_sqr8x_internal
@@ -1936,14 +2052,23 @@ bn_from_montgomery:
.type bn_from_mont8x,@function
.align 32
bn_from_mont8x:
+.cfi_startproc
.byte 0x67
movq %rsp,%rax
+.cfi_def_cfa_register %rax
pushq %rbx
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_offset %r12,-32
pushq %r13
+.cfi_offset %r13,-40
pushq %r14
+.cfi_offset %r14,-48
pushq %r15
+.cfi_offset %r15,-56
+.Lfrom_prologue:
shll $3,%r9d
leaq (%r9,%r9,2),%r10
@@ -1958,24 +2083,41 @@ bn_from_mont8x:
leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lfrom_sp_alt
- subq %r11,%rsp
- leaq -320(%rsp,%r9,2),%rsp
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
jmp .Lfrom_sp_done
.align 32
.Lfrom_sp_alt:
leaq 4096-320(,%r9,2),%r10
- leaq -320(%rsp,%r9,2),%rsp
+ leaq -320(%rbp,%r9,2),%rbp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
- subq %r11,%rsp
+ subq %r11,%rbp
.Lfrom_sp_done:
- andq $-64,%rsp
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lfrom_page_walk
+ jmp .Lfrom_page_walk_done
+
+.Lfrom_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lfrom_page_walk
+.Lfrom_page_walk_done:
+
movq %r9,%r10
negq %r9
@@ -1990,6 +2132,7 @@ bn_from_mont8x:
movq %r8,32(%rsp)
movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lfrom_body:
movq %r9,%r11
leaq 48(%rsp),%rax
@@ -2025,11 +2168,12 @@ bn_from_mont8x:
pxor %xmm0,%xmm0
leaq 48(%rsp),%rax
- movq 40(%rsp),%rsi
jmp .Lfrom_mont_zero
.align 32
.Lfrom_mont_zero:
+ movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
movdqa %xmm0,0(%rax)
movdqa %xmm0,16(%rax)
movdqa %xmm0,32(%rax)
@@ -2040,14 +2184,22 @@ bn_from_mont8x:
movq $1,%rax
movq -48(%rsi),%r15
+.cfi_restore %r15
movq -40(%rsi),%r14
+.cfi_restore %r14
movq -32(%rsi),%r13
+.cfi_restore %r13
movq -24(%rsi),%r12
+.cfi_restore %r12
movq -16(%rsi),%rbp
+.cfi_restore %rbp
movq -8(%rsi),%rbx
+.cfi_restore %rbx
leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lfrom_epilogue:
.byte 0xf3,0xc3
+.cfi_endproc
.size bn_from_mont8x,.-bn_from_mont8x
.globl bn_scatter5
.hidden bn_scatter5
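
A pattern recurring in the hunks above: rather than dropping %rsp straight to the computed frame, the code now descends one page at a time (.Lmul4x_page_walk, .Lpwr_page_walk, .Lfrom_page_walk), reading one word per 4 KiB page so the stack guard page is faulted in order instead of being skipped by a large frame. A C rendition of the probe loop, with illustrative names, might look like:

    /* Probe each 4 KiB page between the old and the new stack pointer,
     * descending, so the OS can grow the stack one guard page at a time.
     * Sketch only; assumes sp - new_sp is a multiple of 4096. */
    static void stack_page_walk(volatile unsigned char *sp,
                                unsigned char *new_sp) {
      for (; sp > new_sp; sp -= 4096)
        (void)*sp;  /* one read per page, value discarded */
    }

The accompanying .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 appears to encode DW_CFA_def_cfa_expression with the 5-byte block DW_OP_breg7+40, DW_OP_deref, DW_OP_plus_uconst 8 -- that is, the CFA is recovered as *(%rsp+40)+8 once %rsp no longer has a fixed relation to the caller frame.
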
diff --git a/linux-x86_64/crypto/chacha/chacha-x86_64.S b/linux-x86_64/crypto/chacha/chacha-x86_64.S
index e994940a..25ec715f 100644
--- a/linux-x86_64/crypto/chacha/chacha-x86_64.S
+++ b/linux-x86_64/crypto/chacha/chacha-x86_64.S
@@ -23,6 +23,15 @@
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Lsigma:
.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.align 64
+.Lzeroz:
+.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+.Lfourz:
+.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+.Lincz:
+.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.Lsixteen:
+.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.globl ChaCha20_ctr32
.hidden ChaCha20_ctr32
@@ -42,6 +51,7 @@ ChaCha20_ctr32:
pushq %r14
pushq %r15
subq $64+24,%rsp
+.Lctr32_body:
movdqu (%rcx),%xmm1
@@ -279,13 +289,14 @@ ChaCha20_ctr32:
jnz .Loop_tail
.Ldone:
- addq $64+24,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbp
- popq %rbx
+ leaq 64+24+48(%rsp),%rsi
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
.Lno_data:
.byte 0xf3,0xc3
.size ChaCha20_ctr32,.-ChaCha20_ctr32
@@ -293,18 +304,12 @@ ChaCha20_ctr32:
.align 32
ChaCha20_ssse3:
.LChaCha20_ssse3:
+ movq %rsp,%r9
cmpq $128,%rdx
ja .LChaCha20_4x
.Ldo_sse3_after_all:
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-
- subq $64+24,%rsp
+ subq $64+8,%rsp
movdqa .Lsigma(%rip),%xmm0
movdqu (%rcx),%xmm1
movdqu 16(%rcx),%xmm2
@@ -316,7 +321,7 @@ ChaCha20_ssse3:
movdqa %xmm1,16(%rsp)
movdqa %xmm2,32(%rsp)
movdqa %xmm3,48(%rsp)
- movl $10,%ebp
+ movq $10,%r8
jmp .Loop_ssse3
.align 32
@@ -326,7 +331,7 @@ ChaCha20_ssse3:
movdqa 16(%rsp),%xmm1
movdqa 32(%rsp),%xmm2
paddd 48(%rsp),%xmm3
- movl $10,%ebp
+ movq $10,%r8
movdqa %xmm3,48(%rsp)
jmp .Loop_ssse3
@@ -375,7 +380,7 @@ ChaCha20_ssse3:
pshufd $78,%xmm2,%xmm2
pshufd $147,%xmm1,%xmm1
pshufd $57,%xmm3,%xmm3
- decl %ebp
+ decq %r8
jnz .Loop_ssse3
paddd 0(%rsp),%xmm0
paddd 16(%rsp),%xmm1
@@ -412,31 +417,27 @@ ChaCha20_ssse3:
movdqa %xmm1,16(%rsp)
movdqa %xmm2,32(%rsp)
movdqa %xmm3,48(%rsp)
- xorq %rbx,%rbx
+ xorq %r8,%r8
.Loop_tail_ssse3:
- movzbl (%rsi,%rbx,1),%eax
- movzbl (%rsp,%rbx,1),%ecx
- leaq 1(%rbx),%rbx
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
xorl %ecx,%eax
- movb %al,-1(%rdi,%rbx,1)
+ movb %al,-1(%rdi,%r8,1)
decq %rdx
jnz .Loop_tail_ssse3
.Ldone_ssse3:
- addq $64+24,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbp
- popq %rbx
+ leaq (%r9),%rsp
+.Lssse3_epilogue:
.byte 0xf3,0xc3
.size ChaCha20_ssse3,.-ChaCha20_ssse3
.type ChaCha20_4x,@function
.align 32
ChaCha20_4x:
.LChaCha20_4x:
+ movq %rsp,%r9
movq %r10,%r11
shrq $32,%r10
testq $32,%r10
@@ -449,8 +450,7 @@ ChaCha20_4x:
je .Ldo_sse3_after_all
.Lproceed4x:
- leaq -120(%rsp),%r11
- subq $0x148+0,%rsp
+ subq $0x140+8,%rsp
movdqa .Lsigma(%rip),%xmm11
movdqu (%rcx),%xmm15
movdqu 16(%rcx),%xmm7
@@ -977,18 +977,18 @@ ChaCha20_4x:
jnz .Loop_tail4x
.Ldone4x:
- addq $0x148+0,%rsp
+ leaq (%r9),%rsp
+.L4x_epilogue:
.byte 0xf3,0xc3
.size ChaCha20_4x,.-ChaCha20_4x
.type ChaCha20_8x,@function
.align 32
ChaCha20_8x:
.LChaCha20_8x:
- movq %rsp,%r10
+ movq %rsp,%r9
subq $0x280+8,%rsp
andq $-32,%rsp
vzeroupper
- movq %r10,640(%rsp)
@@ -1579,7 +1579,8 @@ ChaCha20_8x:
.Ldone8x:
vzeroall
- movq 640(%rsp),%rsp
+ leaq (%r9),%rsp
+.L8x_epilogue:
.byte 0xf3,0xc3
.size ChaCha20_8x,.-ChaCha20_8x
#endif
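
The new .Lzeroz/.Lfourz/.Lincz/.Lsixteen tables added at the top of this file read like per-lane block-counter constants for vectorized paths: four (or sixteen) parallel states whose 32-bit counters start at n+0, n+1, ... and advance by the lane count each iteration. A small C sketch of the arithmetic those constants encode, assuming that reading (the consuming code is not part of this hunk):

    #include <stdint.h>

    /* Four parallel ChaCha block counters: initialized from n via the
     * .Lzeroz-style offsets 0..3, stepped by the .Lfourz-style +4. */
    static void chacha_lanes(uint32_t n, uint32_t lane[4]) {
      for (int i = 0; i < 4; i++) lane[i] = n + i;  /* init: n+0 .. n+3 */
      for (int i = 0; i < 4; i++) lane[i] += 4;     /* one batch step */
    }
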
diff --git a/linux-x86_64/crypto/modes/ghash-x86_64.S b/linux-x86_64/crypto/modes/ghash-x86_64.S
index b6ca45ff..8842c279 100644
--- a/linux-x86_64/crypto/modes/ghash-x86_64.S
+++ b/linux-x86_64/crypto/modes/ghash-x86_64.S
@@ -11,6 +11,10 @@ gcm_gmult_4bit:
pushq %rbx
pushq %rbp
pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $280,%rsp
.Lgmult_prologue:
movzbq 15(%rdi),%r8
@@ -87,8 +91,9 @@ gcm_gmult_4bit:
movq %r8,8(%rdi)
movq %r9,(%rdi)
- movq 16(%rsp),%rbx
- leaq 24(%rsp),%rsp
+ leaq 280+48(%rsp),%rsi
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
.Lgmult_epilogue:
.byte 0xf3,0xc3
.size gcm_gmult_4bit,.-gcm_gmult_4bit
@@ -648,14 +653,14 @@ gcm_ghash_4bit:
movq %r8,8(%rdi)
movq %r9,(%rdi)
- leaq 280(%rsp),%rsi
- movq 0(%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ leaq 280+48(%rsp),%rsi
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq 0(%rsi),%rsp
.Lghash_epilogue:
.byte 0xf3,0xc3
.size gcm_ghash_4bit,.-gcm_ghash_4bit
diff --git a/linux-x86_64/crypto/sha/sha1-x86_64.S b/linux-x86_64/crypto/sha/sha1-x86_64.S
index d830b534..567bdfd1 100644
--- a/linux-x86_64/crypto/sha/sha1-x86_64.S
+++ b/linux-x86_64/crypto/sha/sha1-x86_64.S
@@ -1241,14 +1241,13 @@ sha1_block_data_order:
.align 16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
- movq %rsp,%rax
+ movq %rsp,%r11
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
leaq -64(%rsp),%rsp
- movq %rax,%r14
andq $-64,%rsp
movq %rdi,%r8
movq %rsi,%r9
@@ -1256,7 +1255,7 @@ _ssse3_shortcut:
shlq $6,%r10
addq %r9,%r10
- leaq K_XX_XX+64(%rip),%r11
+ leaq K_XX_XX+64(%rip),%r14
movl 0(%r8),%eax
movl 4(%r8),%ebx
@@ -1268,8 +1267,8 @@ _ssse3_shortcut:
xorl %edx,%edi
andl %edi,%esi
- movdqa 64(%r11),%xmm6
- movdqa -64(%r11),%xmm9
+ movdqa 64(%r14),%xmm6
+ movdqa -64(%r14),%xmm9
movdqu 0(%r9),%xmm0
movdqu 16(%r9),%xmm1
movdqu 32(%r9),%xmm2
@@ -1345,7 +1344,7 @@ _ssse3_shortcut:
pslld $2,%xmm9
pxor %xmm10,%xmm4
xorl %ebp,%edx
- movdqa -64(%r11),%xmm10
+ movdqa -64(%r14),%xmm10
roll $5,%ecx
addl %edi,%ebx
andl %edx,%esi
@@ -1406,7 +1405,7 @@ _ssse3_shortcut:
pslld $2,%xmm10
pxor %xmm8,%xmm5
xorl %eax,%ebp
- movdqa -32(%r11),%xmm8
+ movdqa -32(%r14),%xmm8
roll $5,%edx
addl %edi,%ecx
andl %ebp,%esi
@@ -1467,7 +1466,7 @@ _ssse3_shortcut:
pslld $2,%xmm8
pxor %xmm9,%xmm6
xorl %ebx,%eax
- movdqa -32(%r11),%xmm9
+ movdqa -32(%r14),%xmm9
roll $5,%ebp
addl %edi,%edx
andl %eax,%esi
@@ -1528,7 +1527,7 @@ _ssse3_shortcut:
pslld $2,%xmm9
pxor %xmm10,%xmm7
xorl %ecx,%ebx
- movdqa -32(%r11),%xmm10
+ movdqa -32(%r14),%xmm10
roll $5,%eax
addl %edi,%ebp
andl %ebx,%esi
@@ -1639,7 +1638,7 @@ _ssse3_shortcut:
pxor %xmm3,%xmm2
addl %esi,%eax
xorl %edx,%edi
- movdqa 0(%r11),%xmm10
+ movdqa 0(%r14),%xmm10
rorl $7,%ecx
paddd %xmm1,%xmm9
addl %ebx,%eax
@@ -1874,7 +1873,7 @@ _ssse3_shortcut:
pxor %xmm0,%xmm7
roll $5,%ebx
addl %esi,%eax
- movdqa 32(%r11),%xmm9
+ movdqa 32(%r14),%xmm9
xorl %ecx,%edi
paddd %xmm6,%xmm8
xorl %edx,%ecx
@@ -2165,8 +2164,8 @@ _ssse3_shortcut:
addl %edx,%ecx
cmpq %r10,%r9
je .Ldone_ssse3
- movdqa 64(%r11),%xmm6
- movdqa -64(%r11),%xmm9
+ movdqa 64(%r14),%xmm6
+ movdqa -64(%r14),%xmm9
movdqu 0(%r9),%xmm0
movdqu 16(%r9),%xmm1
movdqu 32(%r9),%xmm2
@@ -2403,13 +2402,12 @@ _ssse3_shortcut:
movl %ecx,8(%r8)
movl %edx,12(%r8)
movl %ebp,16(%r8)
- leaq (%r14),%rsi
- movq -40(%rsi),%r14
- movq -32(%rsi),%r13
- movq -24(%rsi),%r12
- movq -16(%rsi),%rbp
- movq -8(%rsi),%rbx
- leaq (%rsi),%rsp
+ movq -40(%r11),%r14
+ movq -32(%r11),%r13
+ movq -24(%r11),%r12
+ movq -16(%r11),%rbp
+ movq -8(%r11),%rbx
+ leaq (%r11),%rsp
.Lepilogue_ssse3:
.byte 0xf3,0xc3
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
@@ -2417,7 +2415,7 @@ _ssse3_shortcut:
.align 16
sha1_block_data_order_avx:
_avx_shortcut:
- movq %rsp,%rax
+ movq %rsp,%r11
pushq %rbx
pushq %rbp
pushq %r12
@@ -2425,7 +2423,6 @@ _avx_shortcut:
pushq %r14
leaq -64(%rsp),%rsp
vzeroupper
- movq %rax,%r14
andq $-64,%rsp
movq %rdi,%r8
movq %rsi,%r9
@@ -2433,7 +2430,7 @@ _avx_shortcut:
shlq $6,%r10
addq %r9,%r10
- leaq K_XX_XX+64(%rip),%r11
+ leaq K_XX_XX+64(%rip),%r14
movl 0(%r8),%eax
movl 4(%r8),%ebx
@@ -2445,8 +2442,8 @@ _avx_shortcut:
xorl %edx,%edi
andl %edi,%esi
- vmovdqa 64(%r11),%xmm6
- vmovdqa -64(%r11),%xmm11
+ vmovdqa 64(%r14),%xmm6
+ vmovdqa -64(%r14),%xmm11
vmovdqu 0(%r9),%xmm0
vmovdqu 16(%r9),%xmm1
vmovdqu 32(%r9),%xmm2
@@ -2571,7 +2568,7 @@ _avx_shortcut:
vpxor %xmm10,%xmm5,%xmm5
xorl %eax,%ebp
shldl $5,%edx,%edx
- vmovdqa -32(%r11),%xmm11
+ vmovdqa -32(%r14),%xmm11
addl %edi,%ecx
andl %ebp,%esi
xorl %eax,%ebp
@@ -2784,7 +2781,7 @@ _avx_shortcut:
addl %esi,%eax
xorl %edx,%edi
vpaddd %xmm1,%xmm11,%xmm9
- vmovdqa 0(%r11),%xmm11
+ vmovdqa 0(%r14),%xmm11
shrdl $7,%ecx,%ecx
addl %ebx,%eax
vpxor %xmm8,%xmm2,%xmm2
@@ -3003,7 +3000,7 @@ _avx_shortcut:
movl %ebx,%edi
xorl %edx,%esi
vpaddd %xmm6,%xmm11,%xmm9
- vmovdqa 32(%r11),%xmm11
+ vmovdqa 32(%r14),%xmm11
shldl $5,%ebx,%ebx
addl %esi,%eax
vpxor %xmm8,%xmm7,%xmm7
@@ -3282,8 +3279,8 @@ _avx_shortcut:
addl %edx,%ecx
cmpq %r10,%r9
je .Ldone_avx
- vmovdqa 64(%r11),%xmm6
- vmovdqa -64(%r11),%xmm11
+ vmovdqa 64(%r14),%xmm6
+ vmovdqa -64(%r14),%xmm11
vmovdqu 0(%r9),%xmm0
vmovdqu 16(%r9),%xmm1
vmovdqu 32(%r9),%xmm2
@@ -3519,13 +3516,12 @@ _avx_shortcut:
movl %ecx,8(%r8)
movl %edx,12(%r8)
movl %ebp,16(%r8)
- leaq (%r14),%rsi
- movq -40(%rsi),%r14
- movq -32(%rsi),%r13
- movq -24(%rsi),%r12
- movq -16(%rsi),%rbp
- movq -8(%rsi),%rbx
- leaq (%rsi),%rsp
+ movq -40(%r11),%r14
+ movq -32(%r11),%r13
+ movq -24(%r11),%r12
+ movq -16(%r11),%rbp
+ movq -8(%r11),%rbx
+ leaq (%r11),%rsp
.Lepilogue_avx:
.byte 0xf3,0xc3
.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
diff --git a/linux-x86_64/crypto/sha/sha256-x86_64.S b/linux-x86_64/crypto/sha/sha256-x86_64.S
index 445b497e..273b7a5e 100644
--- a/linux-x86_64/crypto/sha/sha256-x86_64.S
+++ b/linux-x86_64/crypto/sha/sha256-x86_64.S
@@ -19,13 +19,13 @@ sha256_block_data_order:
je .Lavx_shortcut
testl $512,%r10d
jnz .Lssse3_shortcut
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
- movq %rsp,%r11
shlq $4,%rdx
subq $64+32,%rsp
leaq (%rsi,%rdx,4),%rdx
@@ -33,7 +33,7 @@ sha256_block_data_order:
movq %rdi,64+0(%rsp)
movq %rsi,64+8(%rsp)
movq %rdx,64+16(%rsp)
- movq %r11,64+24(%rsp)
+ movq %rax,64+24(%rsp)
.Lprologue:
movl 0(%rdi),%eax
@@ -1698,13 +1698,13 @@ sha256_block_data_order:
jb .Lloop
movq 64+24(%rsp),%rsi
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
.Lepilogue:
.byte 0xf3,0xc3
.size sha256_block_data_order,.-sha256_block_data_order
@@ -1755,13 +1755,13 @@ K256:
.align 64
sha256_block_data_order_ssse3:
.Lssse3_shortcut:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
- movq %rsp,%r11
shlq $4,%rdx
subq $96,%rsp
leaq (%rsi,%rdx,4),%rdx
@@ -1769,7 +1769,7 @@ sha256_block_data_order_ssse3:
movq %rdi,64+0(%rsp)
movq %rsi,64+8(%rsp)
movq %rdx,64+16(%rsp)
- movq %r11,64+24(%rsp)
+ movq %rax,64+24(%rsp)
.Lprologue_ssse3:
movl 0(%rdi),%eax
@@ -2836,13 +2836,13 @@ sha256_block_data_order_ssse3:
jb .Lloop_ssse3
movq 64+24(%rsp),%rsi
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
.Lepilogue_ssse3:
.byte 0xf3,0xc3
.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3
@@ -2850,13 +2850,13 @@ sha256_block_data_order_ssse3:
.align 64
sha256_block_data_order_avx:
.Lavx_shortcut:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
- movq %rsp,%r11
shlq $4,%rdx
subq $96,%rsp
leaq (%rsi,%rdx,4),%rdx
@@ -2864,7 +2864,7 @@ sha256_block_data_order_avx:
movq %rdi,64+0(%rsp)
movq %rsi,64+8(%rsp)
movq %rdx,64+16(%rsp)
- movq %r11,64+24(%rsp)
+ movq %rax,64+24(%rsp)
.Lprologue_avx:
vzeroupper
@@ -3893,13 +3893,13 @@ sha256_block_data_order_avx:
movq 64+24(%rsp),%rsi
vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
.Lepilogue_avx:
.byte 0xf3,0xc3
.size sha256_block_data_order_avx,.-sha256_block_data_order_avx
diff --git a/linux-x86_64/crypto/sha/sha512-x86_64.S b/linux-x86_64/crypto/sha/sha512-x86_64.S
index d65743fd..f272b640 100644
--- a/linux-x86_64/crypto/sha/sha512-x86_64.S
+++ b/linux-x86_64/crypto/sha/sha512-x86_64.S
@@ -19,13 +19,13 @@ sha512_block_data_order:
orl %r9d,%r10d
cmpl $1342177792,%r10d
je .Lavx_shortcut
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
- movq %rsp,%r11
shlq $4,%rdx
subq $128+32,%rsp
leaq (%rsi,%rdx,8),%rdx
@@ -33,7 +33,7 @@ sha512_block_data_order:
movq %rdi,128+0(%rsp)
movq %rsi,128+8(%rsp)
movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
+ movq %rax,128+24(%rsp)
.Lprologue:
movq 0(%rdi),%rax
@@ -1698,13 +1698,13 @@ sha512_block_data_order:
jb .Lloop
movq 128+24(%rsp),%rsi
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
.Lepilogue:
.byte 0xf3,0xc3
.size sha512_block_data_order,.-sha512_block_data_order
@@ -1799,13 +1799,13 @@ K512:
.align 64
sha512_block_data_order_xop:
.Lxop_shortcut:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
- movq %rsp,%r11
shlq $4,%rdx
subq $160,%rsp
leaq (%rsi,%rdx,8),%rdx
@@ -1813,7 +1813,7 @@ sha512_block_data_order_xop:
movq %rdi,128+0(%rsp)
movq %rsi,128+8(%rsp)
movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
+ movq %rax,128+24(%rsp)
.Lprologue_xop:
vzeroupper
@@ -2868,13 +2868,13 @@ sha512_block_data_order_xop:
movq 128+24(%rsp),%rsi
vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
.Lepilogue_xop:
.byte 0xf3,0xc3
.size sha512_block_data_order_xop,.-sha512_block_data_order_xop
@@ -2882,13 +2882,13 @@ sha512_block_data_order_xop:
.align 64
sha512_block_data_order_avx:
.Lavx_shortcut:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
- movq %rsp,%r11
shlq $4,%rdx
subq $160,%rsp
leaq (%rsi,%rdx,8),%rdx
@@ -2896,7 +2896,7 @@ sha512_block_data_order_avx:
movq %rdi,128+0(%rsp)
movq %rsi,128+8(%rsp)
movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
+ movq %rax,128+24(%rsp)
.Lprologue_avx:
vzeroupper
@@ -4015,13 +4015,13 @@ sha512_block_data_order_avx:
movq 128+24(%rsp),%rsi
vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
.Lepilogue_avx:
.byte 0xf3,0xc3
.size sha512_block_data_order_avx,.-sha512_block_data_order_avx
diff --git a/mac-x86/crypto/bn/x86-mont.S b/mac-x86/crypto/bn/x86-mont.S
index 234034b0..5c13ca4d 100644
--- a/mac-x86/crypto/bn/x86-mont.S
+++ b/mac-x86/crypto/bn/x86-mont.S
@@ -16,39 +16,54 @@ L_bn_mul_mont_begin:
jl L000just_leave
leal 20(%esp),%esi
leal 24(%esp),%edx
- movl %esp,%ebp
addl $2,%edi
negl %edi
- leal -32(%esp,%edi,4),%esp
+ leal -32(%esp,%edi,4),%ebp
negl %edi
- movl %esp,%eax
+ movl %ebp,%eax
subl %edx,%eax
andl $2047,%eax
- subl %eax,%esp
- xorl %esp,%edx
+ subl %eax,%ebp
+ xorl %ebp,%edx
andl $2048,%edx
xorl $2048,%edx
- subl %edx,%esp
- andl $-64,%esp
+ subl %edx,%ebp
+ andl $-64,%ebp
+ movl %esp,%eax
+ subl %ebp,%eax
+ andl $-4096,%eax
+ movl %esp,%edx
+ leal (%ebp,%eax,1),%esp
+ movl (%esp),%eax
+ cmpl %ebp,%esp
+ ja L001page_walk
+ jmp L002page_walk_done
+.align 4,0x90
+L001page_walk:
+ leal -4096(%esp),%esp
+ movl (%esp),%eax
+ cmpl %ebp,%esp
+ ja L001page_walk
+L002page_walk_done:
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
- movl 12(%esi),%edx
+ movl 12(%esi),%ebp
movl 16(%esi),%esi
movl (%esi),%esi
movl %eax,4(%esp)
movl %ebx,8(%esp)
movl %ecx,12(%esp)
- movl %edx,16(%esp)
+ movl %ebp,16(%esp)
movl %esi,20(%esp)
leal -3(%edi),%ebx
- movl %ebp,24(%esp)
- call L001PIC_me_up
-L001PIC_me_up:
+ movl %edx,24(%esp)
+ call L003PIC_me_up
+L003PIC_me_up:
popl %eax
- movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L001PIC_me_up(%eax),%eax
+ movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L003PIC_me_up(%eax),%eax
btl $26,(%eax)
- jnc L002non_sse2
+ jnc L004non_sse2
movl $-1,%eax
movd %eax,%mm7
movl 8(%esp),%esi
@@ -72,7 +87,7 @@ L001PIC_me_up:
psrlq $32,%mm3
incl %ecx
.align 4,0x90
-L0031st:
+L0051st:
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -87,7 +102,7 @@ L0031st:
psrlq $32,%mm3
leal 1(%ecx),%ecx
cmpl %ebx,%ecx
- jl L0031st
+ jl L0051st
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -101,7 +116,7 @@ L0031st:
paddq %mm2,%mm3
movq %mm3,32(%esp,%ebx,4)
incl %edx
-L004outer:
+L006outer:
xorl %ecx,%ecx
movd (%edi,%edx,4),%mm4
movd (%esi),%mm5
@@ -123,7 +138,7 @@ L004outer:
paddq %mm6,%mm2
incl %ecx
decl %ebx
-L005inner:
+L007inner:
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
paddq %mm0,%mm2
@@ -140,7 +155,7 @@ L005inner:
paddq %mm6,%mm2
decl %ebx
leal 1(%ecx),%ecx
- jnz L005inner
+ jnz L007inner
movl %ecx,%ebx
pmuludq %mm4,%mm0
pmuludq %mm5,%mm1
@@ -158,11 +173,11 @@ L005inner:
movq %mm3,32(%esp,%ebx,4)
leal 1(%edx),%edx
cmpl %ebx,%edx
- jle L004outer
+ jle L006outer
emms
- jmp L006common_tail
+ jmp L008common_tail
.align 4,0x90
-L002non_sse2:
+L004non_sse2:
movl 8(%esp),%esi
leal 1(%ebx),%ebp
movl 12(%esp),%edi
@@ -173,12 +188,12 @@ L002non_sse2:
leal 4(%edi,%ebx,4),%eax
orl %edx,%ebp
movl (%edi),%edi
- jz L007bn_sqr_mont
+ jz L009bn_sqr_mont
movl %eax,28(%esp)
movl (%esi),%eax
xorl %edx,%edx
.align 4,0x90
-L008mull:
+L010mull:
movl %edx,%ebp
mull %edi
addl %eax,%ebp
@@ -187,7 +202,7 @@ L008mull:
movl (%esi,%ecx,4),%eax
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
- jl L008mull
+ jl L010mull
movl %edx,%ebp
mull %edi
movl 20(%esp),%edi
@@ -205,9 +220,9 @@ L008mull:
movl 4(%esi),%eax
adcl $0,%edx
incl %ecx
- jmp L0092ndmadd
+ jmp L0112ndmadd
.align 4,0x90
-L0101stmadd:
+L0121stmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -218,7 +233,7 @@ L0101stmadd:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,28(%esp,%ecx,4)
- jl L0101stmadd
+ jl L0121stmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%eax
@@ -241,7 +256,7 @@ L0101stmadd:
adcl $0,%edx
movl $1,%ecx
.align 4,0x90
-L0092ndmadd:
+L0112ndmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -252,7 +267,7 @@ L0092ndmadd:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
- jl L0092ndmadd
+ jl L0112ndmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
@@ -268,16 +283,16 @@ L0092ndmadd:
movl %edx,32(%esp,%ebx,4)
cmpl 28(%esp),%ecx
movl %eax,36(%esp,%ebx,4)
- je L006common_tail
+ je L008common_tail
movl (%ecx),%edi
movl 8(%esp),%esi
movl %ecx,12(%esp)
xorl %ecx,%ecx
xorl %edx,%edx
movl (%esi),%eax
- jmp L0101stmadd
+ jmp L0121stmadd
.align 4,0x90
-L007bn_sqr_mont:
+L009bn_sqr_mont:
movl %ebx,(%esp)
movl %ecx,12(%esp)
movl %edi,%eax
@@ -288,7 +303,7 @@ L007bn_sqr_mont:
andl $1,%ebx
incl %ecx
.align 4,0x90
-L011sqr:
+L013sqr:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -300,7 +315,7 @@ L011sqr:
cmpl (%esp),%ecx
movl %eax,%ebx
movl %ebp,28(%esp,%ecx,4)
- jl L011sqr
+ jl L013sqr
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -324,7 +339,7 @@ L011sqr:
movl 4(%esi),%eax
movl $1,%ecx
.align 4,0x90
-L0123rdmadd:
+L0143rdmadd:
movl %edx,%ebp
mull %edi
addl 32(%esp,%ecx,4),%ebp
@@ -343,7 +358,7 @@ L0123rdmadd:
adcl $0,%edx
cmpl %ebx,%ecx
movl %ebp,24(%esp,%ecx,4)
- jl L0123rdmadd
+ jl L0143rdmadd
movl %edx,%ebp
mull %edi
addl 32(%esp,%ebx,4),%ebp
@@ -359,7 +374,7 @@ L0123rdmadd:
movl %edx,32(%esp,%ebx,4)
cmpl %ebx,%ecx
movl %eax,36(%esp,%ebx,4)
- je L006common_tail
+ je L008common_tail
movl 4(%esi,%ecx,4),%edi
leal 1(%ecx),%ecx
movl %edi,%eax
@@ -371,12 +386,12 @@ L0123rdmadd:
xorl %ebp,%ebp
cmpl %ebx,%ecx
leal 1(%ecx),%ecx
- je L013sqrlast
+ je L015sqrlast
movl %edx,%ebx
shrl $1,%edx
andl $1,%ebx
.align 4,0x90
-L014sqradd:
+L016sqradd:
movl (%esi,%ecx,4),%eax
movl %edx,%ebp
mull %edi
@@ -392,13 +407,13 @@ L014sqradd:
cmpl (%esp),%ecx
movl %ebp,28(%esp,%ecx,4)
movl %eax,%ebx
- jle L014sqradd
+ jle L016sqradd
movl %edx,%ebp
addl %edx,%edx
shrl $31,%ebp
addl %ebx,%edx
adcl $0,%ebp
-L013sqrlast:
+L015sqrlast:
movl 20(%esp),%edi
movl 16(%esp),%esi
imull 32(%esp),%edi
@@ -413,9 +428,9 @@ L013sqrlast:
adcl $0,%edx
movl $1,%ecx
movl 4(%esi),%eax
- jmp L0123rdmadd
+ jmp L0143rdmadd
.align 4,0x90
-L006common_tail:
+L008common_tail:
movl 16(%esp),%ebp
movl 4(%esp),%edi
leal 32(%esp),%esi
@@ -423,25 +438,26 @@ L006common_tail:
movl %ebx,%ecx
xorl %edx,%edx
.align 4,0x90
-L015sub:
+L017sub:
sbbl (%ebp,%edx,4),%eax
movl %eax,(%edi,%edx,4)
decl %ecx
movl 4(%esi,%edx,4),%eax
leal 1(%edx),%edx
- jge L015sub
+ jge L017sub
sbbl $0,%eax
+ andl %eax,%esi
+ notl %eax
+ movl %edi,%ebp
+ andl %eax,%ebp
+ orl %ebp,%esi
.align 4,0x90
-L016copy:
- movl (%esi,%ebx,4),%edx
- movl (%edi,%ebx,4),%ebp
- xorl %ebp,%edx
- andl %eax,%edx
- xorl %ebp,%edx
- movl %ecx,(%esi,%ebx,4)
- movl %edx,(%edi,%ebx,4)
+L018copy:
+ movl (%esi,%ebx,4),%eax
+ movl %eax,(%edi,%ebx,4)
+ movl %ecx,32(%esp,%ebx,4)
decl %ebx
- jge L016copy
+ jge L018copy
movl 24(%esp),%esp
movl $1,%eax
L000just_leave:
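
The L017sub/L018copy rewrite above is the 32-bit version of the same constant-time cleanup seen in the x86-64 files: instead of masking every word with xor/and/xor, the borrow mask from sbbl selects the copy source pointer once, and the loop then copies straight through while wiping the scratch area. A C sketch of the selection, with illustrative names:

    #include <stddef.h>
    #include <stdint.h>

    /* borrow is all-ones if subtracting the modulus underflowed (keep
     * the unreduced value in tmp), all-zeros otherwise (keep rp). */
    static void cond_copy(uint32_t *rp, uint32_t *tmp, size_t n,
                          uintptr_t borrow) {
      uintptr_t src = ((uintptr_t)tmp & borrow) | ((uintptr_t)rp & ~borrow);
      const uint32_t *s = (const uint32_t *)src;
      for (size_t i = 0; i < n; i++) {
        rp[i] = s[i];
        tmp[i] = (uint32_t)i;  /* the asm clobbers scratch as it copies */
      }
    }
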
diff --git a/mac-x86_64/crypto/aes/aes-x86_64.S b/mac-x86_64/crypto/aes/aes-x86_64.S
index b5d188a0..52df2ae3 100644
--- a/mac-x86_64/crypto/aes/aes-x86_64.S
+++ b/mac-x86_64/crypto/aes/aes-x86_64.S
@@ -332,6 +332,7 @@ L$enc_compact_done:
.private_extern _asm_AES_encrypt
_asm_AES_encrypt:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
@@ -340,7 +341,6 @@ _asm_AES_encrypt:
pushq %r15
- movq %rsp,%r10
leaq -63(%rdx),%rcx
andq $-64,%rsp
subq %rsp,%rcx
@@ -350,7 +350,7 @@ _asm_AES_encrypt:
subq $32,%rsp
movq %rsi,16(%rsp)
- movq %r10,24(%rsp)
+ movq %rax,24(%rsp)
L$enc_prologue:
movq %rdx,%r15
@@ -382,13 +382,13 @@ L$enc_prologue:
movl %ecx,8(%r9)
movl %edx,12(%r9)
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
L$enc_epilogue:
.byte 0xf3,0xc3
@@ -778,6 +778,7 @@ L$dec_compact_done:
.private_extern _asm_AES_decrypt
_asm_AES_decrypt:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
@@ -786,7 +787,6 @@ _asm_AES_decrypt:
pushq %r15
- movq %rsp,%r10
leaq -63(%rdx),%rcx
andq $-64,%rsp
subq %rsp,%rcx
@@ -796,7 +796,7 @@ _asm_AES_decrypt:
subq $32,%rsp
movq %rsi,16(%rsp)
- movq %r10,24(%rsp)
+ movq %rax,24(%rsp)
L$dec_prologue:
movq %rdx,%r15
@@ -830,13 +830,13 @@ L$dec_prologue:
movl %ecx,8(%r9)
movl %edx,12(%r9)
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
L$dec_epilogue:
.byte 0xf3,0xc3
@@ -1312,10 +1312,9 @@ L$cbc_prologue:
movl %r9d,%r9d
leaq L$AES_Te(%rip),%r14
+ leaq L$AES_Td(%rip),%r10
cmpq $0,%r9
- jne L$cbc_picked_te
- leaq L$AES_Td(%rip),%r14
-L$cbc_picked_te:
+ cmoveq %r10,%r14
movl _OPENSSL_ia32cap_P(%rip),%r10d
cmpq $512,%rdx
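
The cmoveq above replaces a taken/not-taken branch when choosing between the L$AES_Te and L$AES_Td tables; the selection becomes a straight-line conditional move. In C terms (a sketch; the enc flag and table names mirror the registers involved):

    /* enc == 0 selects the decryption table, as cmoveq does above. */
    static const void *pick_table(long enc, const void *Te, const void *Td) {
      return enc != 0 ? Te : Td;  /* compilers typically emit cmov here */
    }
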
diff --git a/mac-x86_64/crypto/aes/aesni-x86_64.S b/mac-x86_64/crypto/aes/aesni-x86_64.S
index 3d98fa12..4e3b7d06 100644
--- a/mac-x86_64/crypto/aes/aesni-x86_64.S
+++ b/mac-x86_64/crypto/aes/aesni-x86_64.S
@@ -1031,11 +1031,10 @@ L$oop_enc1_7:
.p2align 4
L$ctr32_bulk:
- leaq (%rsp),%rax
+ leaq (%rsp),%r11
pushq %rbp
subq $128,%rsp
andq $-16,%rsp
- leaq -8(%rax),%rbp
@@ -1044,7 +1043,7 @@ L$ctr32_bulk:
movdqu (%rcx),%xmm0
movl 12(%r8),%r8d
pxor %xmm0,%xmm2
- movl 12(%rcx),%r11d
+ movl 12(%rcx),%ebp
movdqa %xmm2,0(%rsp)
bswapl %r8d
movdqa %xmm2,%xmm3
@@ -1060,8 +1059,8 @@ L$ctr32_bulk:
leaq 2(%r8),%rdx
bswapl %eax
bswapl %edx
- xorl %r11d,%eax
- xorl %r11d,%edx
+ xorl %ebp,%eax
+ xorl %ebp,%edx
.byte 102,15,58,34,216,3
leaq 3(%r8),%rax
movdqa %xmm3,16(%rsp)
@@ -1070,25 +1069,25 @@ L$ctr32_bulk:
movq %r10,%rdx
leaq 4(%r8),%r10
movdqa %xmm4,32(%rsp)
- xorl %r11d,%eax
+ xorl %ebp,%eax
bswapl %r10d
.byte 102,15,58,34,232,3
- xorl %r11d,%r10d
+ xorl %ebp,%r10d
movdqa %xmm5,48(%rsp)
leaq 5(%r8),%r9
movl %r10d,64+12(%rsp)
bswapl %r9d
leaq 6(%r8),%r10
movl 240(%rcx),%eax
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
bswapl %r10d
movl %r9d,80+12(%rsp)
- xorl %r11d,%r10d
+ xorl %ebp,%r10d
leaq 7(%r8),%r9
movl %r10d,96+12(%rsp)
bswapl %r9d
movl _OPENSSL_ia32cap_P+4(%rip),%r10d
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
andl $71303168,%r10d
movl %r9d,112+12(%rsp)
@@ -1112,7 +1111,7 @@ L$ctr32_bulk:
L$ctr32_6x:
shll $4,%eax
movl $48,%r10d
- bswapl %r11d
+ bswapl %ebp
leaq 32(%rcx,%rax,1),%rcx
subq %rax,%r10
jmp L$ctr32_loop6
@@ -1123,32 +1122,32 @@ L$ctr32_loop6:
movups -48(%rcx,%r10,1),%xmm0
.byte 102,15,56,220,209
movl %r8d,%eax
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 102,15,56,220,217
.byte 0x0f,0x38,0xf1,0x44,0x24,12
leal 1(%r8),%eax
.byte 102,15,56,220,225
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 0x0f,0x38,0xf1,0x44,0x24,28
.byte 102,15,56,220,233
leal 2(%r8),%eax
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 102,15,56,220,241
.byte 0x0f,0x38,0xf1,0x44,0x24,44
leal 3(%r8),%eax
.byte 102,15,56,220,249
movups -32(%rcx,%r10,1),%xmm1
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 102,15,56,220,208
.byte 0x0f,0x38,0xf1,0x44,0x24,60
leal 4(%r8),%eax
.byte 102,15,56,220,216
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 0x0f,0x38,0xf1,0x44,0x24,76
.byte 102,15,56,220,224
leal 5(%r8),%eax
- xorl %r11d,%eax
+ xorl %ebp,%eax
.byte 102,15,56,220,232
.byte 0x0f,0x38,0xf1,0x44,0x24,92
movq %r10,%rax
@@ -1209,7 +1208,7 @@ L$ctr32_loop8:
bswapl %r9d
movups 32-128(%rcx),%xmm0
.byte 102,15,56,220,225
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
nop
.byte 102,15,56,220,233
movl %r9d,0+12(%rsp)
@@ -1222,7 +1221,7 @@ L$ctr32_loop8:
bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,224
.byte 102,15,56,220,232
@@ -1236,7 +1235,7 @@ L$ctr32_loop8:
bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,225
.byte 102,15,56,220,233
@@ -1250,7 +1249,7 @@ L$ctr32_loop8:
bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,224
.byte 102,15,56,220,232
@@ -1264,7 +1263,7 @@ L$ctr32_loop8:
bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,225
.byte 102,15,56,220,233
@@ -1278,7 +1277,7 @@ L$ctr32_loop8:
bswapl %r9d
.byte 102,15,56,220,208
.byte 102,15,56,220,216
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,224
.byte 102,15,56,220,232
@@ -1292,7 +1291,7 @@ L$ctr32_loop8:
bswapl %r9d
.byte 102,15,56,220,209
.byte 102,15,56,220,217
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
.byte 0x66,0x90
.byte 102,15,56,220,225
.byte 102,15,56,220,233
@@ -1307,7 +1306,7 @@ L$ctr32_loop8:
.byte 102,15,56,220,208
.byte 102,15,56,220,216
.byte 102,15,56,220,224
- xorl %r11d,%r9d
+ xorl %ebp,%r9d
movdqu 0(%rdi),%xmm10
.byte 102,15,56,220,232
movl %r9d,112+12(%rsp)
@@ -1542,7 +1541,7 @@ L$ctr32_loop3:
L$ctr32_done:
xorps %xmm0,%xmm0
- xorl %r11d,%r11d
+ xorl %ebp,%ebp
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
@@ -1566,8 +1565,8 @@ L$ctr32_done:
pxor %xmm14,%xmm14
movaps %xmm0,112(%rsp)
pxor %xmm15,%xmm15
- leaq (%rbp),%rsp
- popq %rbp
+ movq -8(%r11),%rbp
+ leaq (%r11),%rsp
L$ctr32_epilogue:
.byte 0xf3,0xc3
@@ -1576,11 +1575,10 @@ L$ctr32_epilogue:
.p2align 4
_aesni_xts_encrypt:
- leaq (%rsp),%rax
+ leaq (%rsp),%r11
pushq %rbp
subq $112,%rsp
andq $-16,%rsp
- leaq -8(%rax),%rbp
movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
@@ -1596,7 +1594,7 @@ L$oop_enc1_8:
jnz L$oop_enc1_8
.byte 102,15,56,221,209
movups (%rcx),%xmm0
- movq %rcx,%r11
+ movq %rcx,%rbp
movl %r10d,%eax
shll $4,%r10d
movq %rdx,%r9
@@ -1652,9 +1650,9 @@ L$oop_enc1_8:
jc L$xts_enc_short
movl $16+96,%eax
- leaq 32(%r11,%r10,1),%rcx
+ leaq 32(%rbp,%r10,1),%rcx
subq %r10,%rax
- movups 16(%r11),%xmm1
+ movups 16(%rbp),%xmm1
movq %rax,%r10
leaq L$xts_magic(%rip),%r8
jmp L$xts_enc_grandloop
@@ -1679,7 +1677,7 @@ L$xts_enc_grandloop:
movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
.byte 102,15,56,220,233
- movups 32(%r11),%xmm0
+ movups 32(%rbp),%xmm0
leaq 96(%rdi),%rdi
pxor %xmm8,%xmm7
@@ -1688,7 +1686,7 @@ L$xts_enc_grandloop:
pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
.byte 102,15,56,220,249
- movups 48(%r11),%xmm1
+ movups 48(%rbp),%xmm1
pxor %xmm9,%xmm12
.byte 102,15,56,220,208
@@ -1703,7 +1701,7 @@ L$xts_enc_grandloop:
movdqa %xmm14,64(%rsp)
.byte 102,15,56,220,240
.byte 102,15,56,220,248
- movups 64(%r11),%xmm0
+ movups 64(%rbp),%xmm0
movdqa %xmm8,80(%rsp)
pshufd $0x5f,%xmm15,%xmm9
jmp L$xts_enc_loop6
@@ -1735,7 +1733,7 @@ L$xts_enc_loop6:
psrad $31,%xmm14
.byte 102,15,56,220,217
pand %xmm8,%xmm14
- movups (%r11),%xmm10
+ movups (%rbp),%xmm10
.byte 102,15,56,220,225
.byte 102,15,56,220,233
.byte 102,15,56,220,241
@@ -1803,10 +1801,10 @@ L$xts_enc_loop6:
.byte 102,15,56,220,225
.byte 102,15,56,220,233
pxor %xmm0,%xmm15
- movups (%r11),%xmm0
+ movups (%rbp),%xmm0
.byte 102,15,56,220,241
.byte 102,15,56,220,249
- movups 16(%r11),%xmm1
+ movups 16(%rbp),%xmm1
pxor %xmm15,%xmm14
.byte 102,15,56,221,84,36,0
@@ -1833,7 +1831,7 @@ L$xts_enc_loop6:
movl $16+96,%eax
subl %r10d,%eax
- movq %r11,%rcx
+ movq %rbp,%rcx
shrl $4,%eax
L$xts_enc_short:
@@ -1989,7 +1987,7 @@ L$xts_enc_steal:
jnz L$xts_enc_steal
subq %r9,%rsi
- movq %r11,%rcx
+ movq %rbp,%rcx
movl %r10d,%eax
movups -16(%rsi),%xmm2
@@ -2032,8 +2030,8 @@ L$xts_enc_ret:
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
- leaq (%rbp),%rsp
- popq %rbp
+ movq -8(%r11),%rbp
+ leaq (%r11),%rsp
L$xts_enc_epilogue:
.byte 0xf3,0xc3
@@ -2042,11 +2040,10 @@ L$xts_enc_epilogue:
.p2align 4
_aesni_xts_decrypt:
- leaq (%rsp),%rax
+ leaq (%rsp),%r11
pushq %rbp
subq $112,%rsp
andq $-16,%rsp
- leaq -8(%rax),%rbp
movups (%r9),%xmm2
movl 240(%r8),%eax
movl 240(%rcx),%r10d
@@ -2068,7 +2065,7 @@ L$oop_enc1_11:
subq %rax,%rdx
movups (%rcx),%xmm0
- movq %rcx,%r11
+ movq %rcx,%rbp
movl %r10d,%eax
shll $4,%r10d
movq %rdx,%r9
@@ -2124,9 +2121,9 @@ L$oop_enc1_11:
jc L$xts_dec_short
movl $16+96,%eax
- leaq 32(%r11,%r10,1),%rcx
+ leaq 32(%rbp,%r10,1),%rcx
subq %r10,%rax
- movups 16(%r11),%xmm1
+ movups 16(%rbp),%xmm1
movq %rax,%r10
leaq L$xts_magic(%rip),%r8
jmp L$xts_dec_grandloop
@@ -2151,7 +2148,7 @@ L$xts_dec_grandloop:
movdqa 96(%rsp),%xmm9
pxor %xmm14,%xmm6
.byte 102,15,56,222,233
- movups 32(%r11),%xmm0
+ movups 32(%rbp),%xmm0
leaq 96(%rdi),%rdi
pxor %xmm8,%xmm7
@@ -2160,7 +2157,7 @@ L$xts_dec_grandloop:
pxor %xmm9,%xmm11
movdqa %xmm10,0(%rsp)
.byte 102,15,56,222,249
- movups 48(%r11),%xmm1
+ movups 48(%rbp),%xmm1
pxor %xmm9,%xmm12
.byte 102,15,56,222,208
@@ -2175,7 +2172,7 @@ L$xts_dec_grandloop:
movdqa %xmm14,64(%rsp)
.byte 102,15,56,222,240
.byte 102,15,56,222,248
- movups 64(%r11),%xmm0
+ movups 64(%rbp),%xmm0
movdqa %xmm8,80(%rsp)
pshufd $0x5f,%xmm15,%xmm9
jmp L$xts_dec_loop6
@@ -2207,7 +2204,7 @@ L$xts_dec_loop6:
psrad $31,%xmm14
.byte 102,15,56,222,217
pand %xmm8,%xmm14
- movups (%r11),%xmm10
+ movups (%rbp),%xmm10
.byte 102,15,56,222,225
.byte 102,15,56,222,233
.byte 102,15,56,222,241
@@ -2275,10 +2272,10 @@ L$xts_dec_loop6:
.byte 102,15,56,222,225
.byte 102,15,56,222,233
pxor %xmm0,%xmm15
- movups (%r11),%xmm0
+ movups (%rbp),%xmm0
.byte 102,15,56,222,241
.byte 102,15,56,222,249
- movups 16(%r11),%xmm1
+ movups 16(%rbp),%xmm1
pxor %xmm15,%xmm14
.byte 102,15,56,223,84,36,0
@@ -2305,7 +2302,7 @@ L$xts_dec_loop6:
movl $16+96,%eax
subl %r10d,%eax
- movq %r11,%rcx
+ movq %rbp,%rcx
shrl $4,%eax
L$xts_dec_short:
@@ -2462,7 +2459,7 @@ L$xts_dec_done:
jz L$xts_dec_ret
L$xts_dec_done2:
movq %r9,%rdx
- movq %r11,%rcx
+ movq %rbp,%rcx
movl %r10d,%eax
movups (%rdi),%xmm2
@@ -2492,7 +2489,7 @@ L$xts_dec_steal:
jnz L$xts_dec_steal
subq %r9,%rsi
- movq %r11,%rcx
+ movq %rbp,%rcx
movl %r10d,%eax
movups (%rsi),%xmm2
@@ -2535,11 +2532,827 @@ L$xts_dec_ret:
movaps %xmm0,96(%rsp)
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
- leaq (%rbp),%rsp
- popq %rbp
+ movq -8(%r11),%rbp
+ leaq (%r11),%rsp
L$xts_dec_epilogue:
.byte 0xf3,0xc3
+.globl _aesni_ocb_encrypt
+.private_extern _aesni_ocb_encrypt
+
+.p2align 5
+_aesni_ocb_encrypt:
+ leaq (%rsp),%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ movq 8(%rax),%rbx
+ movq 8+8(%rax),%rbp
+
+ movl 240(%rcx),%r10d
+ movq %rcx,%r11
+ shll $4,%r10d
+ movups (%rcx),%xmm9
+ movups 16(%rcx,%r10,1),%xmm1
+
+ movdqu (%r9),%xmm15
+ pxor %xmm1,%xmm9
+ pxor %xmm1,%xmm15
+
+ movl $16+32,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ movups 16(%r11),%xmm1
+ subq %r10,%rax
+ movq %rax,%r10
+
+ movdqu (%rbx),%xmm10
+ movdqu (%rbp),%xmm8
+
+ testq $1,%r8
+ jnz L$ocb_enc_odd
+
+ bsfq %r8,%r12
+ addq $1,%r8
+ shlq $4,%r12
+ movdqu (%rbx,%r12,1),%xmm7
+ movdqu (%rdi),%xmm2
+ leaq 16(%rdi),%rdi
+
+ call __ocb_encrypt1
+
+ movdqa %xmm7,%xmm15
+ movups %xmm2,(%rsi)
+ leaq 16(%rsi),%rsi
+ subq $1,%rdx
+ jz L$ocb_enc_done
+
+L$ocb_enc_odd:
+ leaq 1(%r8),%r12
+ leaq 3(%r8),%r13
+ leaq 5(%r8),%r14
+ leaq 6(%r8),%r8
+ bsfq %r12,%r12
+ bsfq %r13,%r13
+ bsfq %r14,%r14
+ shlq $4,%r12
+ shlq $4,%r13
+ shlq $4,%r14
+
+ subq $6,%rdx
+ jc L$ocb_enc_short
+ jmp L$ocb_enc_grandloop
+
+.p2align 5
+L$ocb_enc_grandloop:
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ leaq 96(%rdi),%rdi
+
+ call __ocb_encrypt6
+
+ movups %xmm2,0(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ leaq 96(%rsi),%rsi
+ subq $6,%rdx
+ jnc L$ocb_enc_grandloop
+
+L$ocb_enc_short:
+ addq $6,%rdx
+ jz L$ocb_enc_done
+
+ movdqu 0(%rdi),%xmm2
+ cmpq $2,%rdx
+ jb L$ocb_enc_one
+ movdqu 16(%rdi),%xmm3
+ je L$ocb_enc_two
+
+ movdqu 32(%rdi),%xmm4
+ cmpq $4,%rdx
+ jb L$ocb_enc_three
+ movdqu 48(%rdi),%xmm5
+ je L$ocb_enc_four
+
+ movdqu 64(%rdi),%xmm6
+ pxor %xmm7,%xmm7
+
+ call __ocb_encrypt6
+
+ movdqa %xmm14,%xmm15
+ movups %xmm2,0(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+
+ jmp L$ocb_enc_done
+
+.p2align 4
+L$ocb_enc_one:
+ movdqa %xmm10,%xmm7
+
+ call __ocb_encrypt1
+
+ movdqa %xmm7,%xmm15
+ movups %xmm2,0(%rsi)
+ jmp L$ocb_enc_done
+
+.p2align 4
+L$ocb_enc_two:
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+
+ call __ocb_encrypt4
+
+ movdqa %xmm11,%xmm15
+ movups %xmm2,0(%rsi)
+ movups %xmm3,16(%rsi)
+
+ jmp L$ocb_enc_done
+
+.p2align 4
+L$ocb_enc_three:
+ pxor %xmm5,%xmm5
+
+ call __ocb_encrypt4
+
+ movdqa %xmm12,%xmm15
+ movups %xmm2,0(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+
+ jmp L$ocb_enc_done
+
+.p2align 4
+L$ocb_enc_four:
+ call __ocb_encrypt4
+
+ movdqa %xmm13,%xmm15
+ movups %xmm2,0(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+
+L$ocb_enc_done:
+ pxor %xmm0,%xmm15
+ movdqu %xmm8,(%rbp)
+ movdqu %xmm15,(%r9)
+
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+ leaq 40(%rsp),%rax
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+L$ocb_enc_epilogue:
+ .byte 0xf3,0xc3
+
+
+
+.p2align 5
+__ocb_encrypt6:
+ pxor %xmm9,%xmm15
+ movdqu (%rbx,%r12,1),%xmm11
+ movdqa %xmm10,%xmm12
+ movdqu (%rbx,%r13,1),%xmm13
+ movdqa %xmm10,%xmm14
+ pxor %xmm15,%xmm10
+ movdqu (%rbx,%r14,1),%xmm15
+ pxor %xmm10,%xmm11
+ pxor %xmm2,%xmm8
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm12
+ pxor %xmm3,%xmm8
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm13
+ pxor %xmm4,%xmm8
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm14
+ pxor %xmm5,%xmm8
+ pxor %xmm13,%xmm5
+ pxor %xmm14,%xmm15
+ pxor %xmm6,%xmm8
+ pxor %xmm14,%xmm6
+ pxor %xmm7,%xmm8
+ pxor %xmm15,%xmm7
+ movups 32(%r11),%xmm0
+
+ leaq 1(%r8),%r12
+ leaq 3(%r8),%r13
+ leaq 5(%r8),%r14
+ addq $6,%r8
+ pxor %xmm9,%xmm10
+ bsfq %r12,%r12
+ bsfq %r13,%r13
+ bsfq %r14,%r14
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ pxor %xmm9,%xmm11
+ pxor %xmm9,%xmm12
+.byte 102,15,56,220,241
+ pxor %xmm9,%xmm13
+ pxor %xmm9,%xmm14
+.byte 102,15,56,220,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm15
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups 64(%r11),%xmm0
+ shlq $4,%r12
+ shlq $4,%r13
+ jmp L$ocb_enc_loop6
+
+.p2align 5
+L$ocb_enc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$ocb_enc_loop6
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups 16(%r11),%xmm1
+ shlq $4,%r14
+
+.byte 102,65,15,56,221,210
+ movdqu (%rbx),%xmm10
+ movq %r10,%rax
+.byte 102,65,15,56,221,219
+.byte 102,65,15,56,221,228
+.byte 102,65,15,56,221,237
+.byte 102,65,15,56,221,246
+.byte 102,65,15,56,221,255
+ .byte 0xf3,0xc3
+
+
+
+.p2align 5
+__ocb_encrypt4:
+ pxor %xmm9,%xmm15
+ movdqu (%rbx,%r12,1),%xmm11
+ movdqa %xmm10,%xmm12
+ movdqu (%rbx,%r13,1),%xmm13
+ pxor %xmm15,%xmm10
+ pxor %xmm10,%xmm11
+ pxor %xmm2,%xmm8
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm12
+ pxor %xmm3,%xmm8
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm13
+ pxor %xmm4,%xmm8
+ pxor %xmm12,%xmm4
+ pxor %xmm5,%xmm8
+ pxor %xmm13,%xmm5
+ movups 32(%r11),%xmm0
+
+ pxor %xmm9,%xmm10
+ pxor %xmm9,%xmm11
+ pxor %xmm9,%xmm12
+ pxor %xmm9,%xmm13
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups 48(%r11),%xmm1
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups 64(%r11),%xmm0
+ jmp L$ocb_enc_loop4
+
+.p2align 5
+L$ocb_enc_loop4:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$ocb_enc_loop4
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups 16(%r11),%xmm1
+ movq %r10,%rax
+
+.byte 102,65,15,56,221,210
+.byte 102,65,15,56,221,219
+.byte 102,65,15,56,221,228
+.byte 102,65,15,56,221,237
+ .byte 0xf3,0xc3
+
+
+
+.p2align 5
+__ocb_encrypt1:
+ pxor %xmm15,%xmm7
+ pxor %xmm9,%xmm7
+ pxor %xmm2,%xmm8
+ pxor %xmm7,%xmm2
+ movups 32(%r11),%xmm0
+
+.byte 102,15,56,220,209
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm7
+
+.byte 102,15,56,220,208
+ movups 64(%r11),%xmm0
+ jmp L$ocb_enc_loop1
+
+.p2align 5
+L$ocb_enc_loop1:
+.byte 102,15,56,220,209
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
+.byte 102,15,56,220,208
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$ocb_enc_loop1
+
+.byte 102,15,56,220,209
+ movups 16(%r11),%xmm1
+ movq %r10,%rax
+
+.byte 102,15,56,221,215
+ .byte 0xf3,0xc3
+
+
+.globl _aesni_ocb_decrypt
+.private_extern _aesni_ocb_decrypt
+
+.p2align 5
+_aesni_ocb_decrypt:
+ leaq (%rsp),%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ movq 8(%rax),%rbx
+ movq 8+8(%rax),%rbp
+
+ movl 240(%rcx),%r10d
+ movq %rcx,%r11
+ shll $4,%r10d
+ movups (%rcx),%xmm9
+ movups 16(%rcx,%r10,1),%xmm1
+
+ movdqu (%r9),%xmm15
+ pxor %xmm1,%xmm9
+ pxor %xmm1,%xmm15
+
+ movl $16+32,%eax
+ leaq 32(%r11,%r10,1),%rcx
+ movups 16(%r11),%xmm1
+ subq %r10,%rax
+ movq %rax,%r10
+
+ movdqu (%rbx),%xmm10
+ movdqu (%rbp),%xmm8
+
+ testq $1,%r8
+ jnz L$ocb_dec_odd
+
+ bsfq %r8,%r12
+ addq $1,%r8
+ shlq $4,%r12
+ movdqu (%rbx,%r12,1),%xmm7
+ movdqu (%rdi),%xmm2
+ leaq 16(%rdi),%rdi
+
+ call __ocb_decrypt1
+
+ movdqa %xmm7,%xmm15
+ movups %xmm2,(%rsi)
+ xorps %xmm2,%xmm8
+ leaq 16(%rsi),%rsi
+ subq $1,%rdx
+ jz L$ocb_dec_done
+
+L$ocb_dec_odd:
+ leaq 1(%r8),%r12
+ leaq 3(%r8),%r13
+ leaq 5(%r8),%r14
+ leaq 6(%r8),%r8
+ bsfq %r12,%r12
+ bsfq %r13,%r13
+ bsfq %r14,%r14
+ shlq $4,%r12
+ shlq $4,%r13
+ shlq $4,%r14
+
+ subq $6,%rdx
+ jc L$ocb_dec_short
+ jmp L$ocb_dec_grandloop
+
+.p2align 5
+L$ocb_dec_grandloop:
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ leaq 96(%rdi),%rdi
+
+ call __ocb_decrypt6
+
+ movups %xmm2,0(%rsi)
+ pxor %xmm2,%xmm8
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm8
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm8
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm8
+ movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm8
+ movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm8
+ leaq 96(%rsi),%rsi
+ subq $6,%rdx
+ jnc L$ocb_dec_grandloop
+
+L$ocb_dec_short:
+ addq $6,%rdx
+ jz L$ocb_dec_done
+
+ movdqu 0(%rdi),%xmm2
+ cmpq $2,%rdx
+ jb L$ocb_dec_one
+ movdqu 16(%rdi),%xmm3
+ je L$ocb_dec_two
+
+ movdqu 32(%rdi),%xmm4
+ cmpq $4,%rdx
+ jb L$ocb_dec_three
+ movdqu 48(%rdi),%xmm5
+ je L$ocb_dec_four
+
+ movdqu 64(%rdi),%xmm6
+ pxor %xmm7,%xmm7
+
+ call __ocb_decrypt6
+
+ movdqa %xmm14,%xmm15
+ movups %xmm2,0(%rsi)
+ pxor %xmm2,%xmm8
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm8
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm8
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm8
+ movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm8
+
+ jmp L$ocb_dec_done
+
+.p2align 4
+L$ocb_dec_one:
+ movdqa %xmm10,%xmm7
+
+ call __ocb_decrypt1
+
+ movdqa %xmm7,%xmm15
+ movups %xmm2,0(%rsi)
+ xorps %xmm2,%xmm8
+ jmp L$ocb_dec_done
+
+.p2align 4
+L$ocb_dec_two:
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+
+ call __ocb_decrypt4
+
+ movdqa %xmm11,%xmm15
+ movups %xmm2,0(%rsi)
+ xorps %xmm2,%xmm8
+ movups %xmm3,16(%rsi)
+ xorps %xmm3,%xmm8
+
+ jmp L$ocb_dec_done
+
+.p2align 4
+L$ocb_dec_three:
+ pxor %xmm5,%xmm5
+
+ call __ocb_decrypt4
+
+ movdqa %xmm12,%xmm15
+ movups %xmm2,0(%rsi)
+ xorps %xmm2,%xmm8
+ movups %xmm3,16(%rsi)
+ xorps %xmm3,%xmm8
+ movups %xmm4,32(%rsi)
+ xorps %xmm4,%xmm8
+
+ jmp L$ocb_dec_done
+
+.p2align 4
+L$ocb_dec_four:
+ call __ocb_decrypt4
+
+ movdqa %xmm13,%xmm15
+ movups %xmm2,0(%rsi)
+ pxor %xmm2,%xmm8
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm8
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm8
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm8
+
+L$ocb_dec_done:
+ pxor %xmm0,%xmm15
+ movdqu %xmm8,(%rbp)
+ movdqu %xmm15,(%r9)
+
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+ leaq 40(%rsp),%rax
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+L$ocb_dec_epilogue:
+ .byte 0xf3,0xc3
+
+
+
+.p2align 5
+__ocb_decrypt6:
+ pxor %xmm9,%xmm15
+ movdqu (%rbx,%r12,1),%xmm11
+ movdqa %xmm10,%xmm12
+ movdqu (%rbx,%r13,1),%xmm13
+ movdqa %xmm10,%xmm14
+ pxor %xmm15,%xmm10
+ movdqu (%rbx,%r14,1),%xmm15
+ pxor %xmm10,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm12
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm13
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm14
+ pxor %xmm13,%xmm5
+ pxor %xmm14,%xmm15
+ pxor %xmm14,%xmm6
+ pxor %xmm15,%xmm7
+ movups 32(%r11),%xmm0
+
+ leaq 1(%r8),%r12
+ leaq 3(%r8),%r13
+ leaq 5(%r8),%r14
+ addq $6,%r8
+ pxor %xmm9,%xmm10
+ bsfq %r12,%r12
+ bsfq %r13,%r13
+ bsfq %r14,%r14
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm9,%xmm11
+ pxor %xmm9,%xmm12
+.byte 102,15,56,222,241
+ pxor %xmm9,%xmm13
+ pxor %xmm9,%xmm14
+.byte 102,15,56,222,249
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm15
+
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups 64(%r11),%xmm0
+ shlq $4,%r12
+ shlq $4,%r13
+ jmp L$ocb_dec_loop6
+
+.p2align 5
+L$ocb_dec_loop6:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$ocb_dec_loop6
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups 16(%r11),%xmm1
+ shlq $4,%r14
+
+.byte 102,65,15,56,223,210
+ movdqu (%rbx),%xmm10
+ movq %r10,%rax
+.byte 102,65,15,56,223,219
+.byte 102,65,15,56,223,228
+.byte 102,65,15,56,223,237
+.byte 102,65,15,56,223,246
+.byte 102,65,15,56,223,255
+ .byte 0xf3,0xc3
+
+
+
+.p2align 5
+__ocb_decrypt4:
+ pxor %xmm9,%xmm15
+ movdqu (%rbx,%r12,1),%xmm11
+ movdqa %xmm10,%xmm12
+ movdqu (%rbx,%r13,1),%xmm13
+ pxor %xmm15,%xmm10
+ pxor %xmm10,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm12
+ pxor %xmm11,%xmm3
+ pxor %xmm12,%xmm13
+ pxor %xmm12,%xmm4
+ pxor %xmm13,%xmm5
+ movups 32(%r11),%xmm0
+
+ pxor %xmm9,%xmm10
+ pxor %xmm9,%xmm11
+ pxor %xmm9,%xmm12
+ pxor %xmm9,%xmm13
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups 48(%r11),%xmm1
+
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups 64(%r11),%xmm0
+ jmp L$ocb_dec_loop4
+
+.p2align 5
+L$ocb_dec_loop4:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$ocb_dec_loop4
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups 16(%r11),%xmm1
+ movq %r10,%rax
+
+.byte 102,65,15,56,223,210
+.byte 102,65,15,56,223,219
+.byte 102,65,15,56,223,228
+.byte 102,65,15,56,223,237
+ .byte 0xf3,0xc3
+
+
+
+.p2align 5
+__ocb_decrypt1:
+ pxor %xmm15,%xmm7
+ pxor %xmm9,%xmm7
+ pxor %xmm7,%xmm2
+ movups 32(%r11),%xmm0
+
+.byte 102,15,56,222,209
+ movups 48(%r11),%xmm1
+ pxor %xmm9,%xmm7
+
+.byte 102,15,56,222,208
+ movups 64(%r11),%xmm0
+ jmp L$ocb_dec_loop1
+
+.p2align 5
+L$ocb_dec_loop1:
+.byte 102,15,56,222,209
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+
+.byte 102,15,56,222,208
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz L$ocb_dec_loop1
+
+.byte 102,15,56,222,209
+ movups 16(%r11),%xmm1
+ movq %r10,%rax
+
+.byte 102,15,56,223,215
+ .byte 0xf3,0xc3
+
.globl _aesni_cbc_encrypt
.private_extern _aesni_cbc_encrypt
@@ -2637,11 +3450,11 @@ L$oop_dec1_16:
jmp L$cbc_ret
.p2align 4
L$cbc_decrypt_bulk:
- leaq (%rsp),%rax
+ leaq (%rsp),%r11
pushq %rbp
subq $16,%rsp
andq $-16,%rsp
- leaq -8(%rax),%rbp
+ movq %rcx,%rbp
movups (%r8),%xmm10
movl %r10d,%eax
cmpq $0x50,%rdx
@@ -2681,7 +3494,7 @@ L$cbc_dec_loop8_enter:
pxor %xmm0,%xmm3
movups 16-112(%rcx),%xmm1
pxor %xmm0,%xmm4
- xorq %r11,%r11
+ movq $-1,%rbp
cmpq $0x70,%rdx
pxor %xmm0,%xmm5
pxor %xmm0,%xmm6
@@ -2697,10 +3510,10 @@ L$cbc_dec_loop8_enter:
.byte 102,15,56,222,241
.byte 102,15,56,222,249
.byte 102,68,15,56,222,193
- setnc %r11b
- shlq $7,%r11
+ adcq $0,%rbp
+ andq $128,%rbp
.byte 102,68,15,56,222,201
- addq %rdi,%r11
+ addq %rdi,%rbp
movups 48-112(%rcx),%xmm1
.byte 102,15,56,222,208
.byte 102,15,56,222,216
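
The setnc/shl to adcq/andq change above derives the same 0-or-128 byte offset directly from the carry flag of the cmpq: %rbp starts at -1, the carry is added back, and the result is masked with 128 before being added to the input pointer. Roughly, in C (names are illustrative):

    /* Offset is 128 iff len >= 0x70, computed without setcc. */
    static const unsigned char *tail_ptr(const unsigned char *in,
                                         unsigned long len) {
      unsigned long m = 0UL - (len >= 0x70);  /* all-ones or zero */
      return in + (m & 128);
    }
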
@@ -2838,18 +3651,18 @@ L$cbc_dec_done:
movdqu 112(%rdi),%xmm0
.byte 102,65,15,56,223,228
leaq 128(%rdi),%rdi
- movdqu 0(%r11),%xmm11
+ movdqu 0(%rbp),%xmm11
.byte 102,65,15,56,223,237
.byte 102,65,15,56,223,246
- movdqu 16(%r11),%xmm12
- movdqu 32(%r11),%xmm13
+ movdqu 16(%rbp),%xmm12
+ movdqu 32(%rbp),%xmm13
.byte 102,65,15,56,223,255
.byte 102,68,15,56,223,193
- movdqu 48(%r11),%xmm14
- movdqu 64(%r11),%xmm15
+ movdqu 48(%rbp),%xmm14
+ movdqu 64(%rbp),%xmm15
.byte 102,69,15,56,223,202
movdqa %xmm0,%xmm10
- movdqu 80(%r11),%xmm1
+ movdqu 80(%rbp),%xmm1
movups -112(%rcx),%xmm0
movups %xmm2,(%rsi)
@@ -2968,7 +3781,7 @@ L$cbc_dec_loop6_enter:
pxor %xmm13,%xmm5
movdqu %xmm4,32(%rsi)
pxor %xmm14,%xmm6
- movq %r11,%rcx
+ movq %rbp,%rcx
movdqu %xmm5,48(%rsi)
pxor %xmm15,%xmm7
movl %r10d,%eax
@@ -3121,8 +3934,8 @@ L$cbc_dec_tail_partial:
L$cbc_dec_ret:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
- leaq (%rbp),%rsp
- popq %rbp
+ movq -8(%r11),%rbp
+ leaq (%r11),%rsp
L$cbc_ret:
.byte 0xf3,0xc3
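
One detail of the newly added _aesni_ocb_encrypt/_aesni_ocb_decrypt bodies above: the bsfq/shlq pairs index the OCB offset table by the number of trailing zero bits of the running block counter, i.e. block i is masked with L_{ntz(i)}. A minimal sketch of that index computation (table layout and pointer arithmetic elided):

    /* bsfq %r8,%r12 ; shlq $4,%r12 -- 16-byte entry L_{ntz(i)}. */
    static unsigned ocb_l_index(unsigned long long block_number) {
      /* block_number must be nonzero, as bsf is undefined at zero */
      return (unsigned)__builtin_ctzll(block_number);
    }
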
diff --git a/mac-x86_64/crypto/aes/bsaes-x86_64.S b/mac-x86_64/crypto/aes/bsaes-x86_64.S
index ad802e3d..6e679c18 100644
--- a/mac-x86_64/crypto/aes/bsaes-x86_64.S
+++ b/mac-x86_64/crypto/aes/bsaes-x86_64.S
@@ -1302,15 +1302,14 @@ L$cbc_dec_bzero:
cmpq %rax,%rbp
ja L$cbc_dec_bzero
- leaq (%rbp),%rsp
- movq 72(%rsp),%r15
- movq 80(%rsp),%r14
- movq 88(%rsp),%r13
- movq 96(%rsp),%r12
- movq 104(%rsp),%rbx
- movq 112(%rsp),%rax
- leaq 120(%rsp),%rsp
- movq %rax,%rbp
+ leaq 120(%rbp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbx
+ movq -8(%rax),%rbp
+ leaq (%rax),%rsp
L$cbc_dec_epilogue:
.byte 0xf3,0xc3
@@ -1503,15 +1502,14 @@ L$ctr_enc_bzero:
cmpq %rax,%rbp
ja L$ctr_enc_bzero
- leaq (%rbp),%rsp
- movq 72(%rsp),%r15
- movq 80(%rsp),%r14
- movq 88(%rsp),%r13
- movq 96(%rsp),%r12
- movq 104(%rsp),%rbx
- movq 112(%rsp),%rax
- leaq 120(%rsp),%rsp
- movq %rax,%rbp
+ leaq 120(%rbp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbx
+ movq -8(%rax),%rbp
+ leaq (%rax),%rsp
L$ctr_enc_epilogue:
.byte 0xf3,0xc3
@@ -1955,15 +1953,14 @@ L$xts_enc_bzero:
cmpq %rax,%rbp
ja L$xts_enc_bzero
- leaq (%rbp),%rsp
- movq 72(%rsp),%r15
- movq 80(%rsp),%r14
- movq 88(%rsp),%r13
- movq 96(%rsp),%r12
- movq 104(%rsp),%rbx
- movq 112(%rsp),%rax
- leaq 120(%rsp),%rsp
- movq %rax,%rbp
+ leaq 120(%rbp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbx
+ movq -8(%rax),%rbp
+ leaq (%rax),%rsp
L$xts_enc_epilogue:
.byte 0xf3,0xc3
@@ -2434,15 +2431,14 @@ L$xts_dec_bzero:
cmpq %rax,%rbp
ja L$xts_dec_bzero
- leaq (%rbp),%rsp
- movq 72(%rsp),%r15
- movq 80(%rsp),%r14
- movq 88(%rsp),%r13
- movq 96(%rsp),%r12
- movq 104(%rsp),%rbx
- movq 112(%rsp),%rax
- leaq 120(%rsp),%rsp
- movq %rax,%rbp
+ leaq 120(%rbp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbx
+ movq -8(%rax),%rbp
+ leaq (%rax),%rsp
L$xts_dec_epilogue:
.byte 0xf3,0xc3
diff --git a/mac-x86_64/crypto/bn/x86_64-mont.S b/mac-x86_64/crypto/bn/x86_64-mont.S
index 51e5d199..41a09267 100644
--- a/mac-x86_64/crypto/bn/x86_64-mont.S
+++ b/mac-x86_64/crypto/bn/x86_64-mont.S
@@ -8,6 +8,10 @@
.p2align 4
_bn_mul_mont:
+
+ movl %r9d,%r9d
+ movq %rsp,%rax
+
testl $3,%r9d
jnz L$mul_enter
cmpl $8,%r9d
@@ -21,20 +25,50 @@ _bn_mul_mont:
.p2align 4
L$mul_enter:
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
- movl %r9d,%r9d
- leaq 2(%r9),%r10
+
+ negq %r9
movq %rsp,%r11
- negq %r10
- leaq (%rsp,%r10,8),%rsp
- andq $-1024,%rsp
+ leaq -16(%rsp,%r9,8),%r10
+ negq %r9
+ andq $-1024,%r10
+
+
+
+
+
+
+
+
+
+ subq %r10,%r11
+ andq $-4096,%r11
+ leaq (%r10,%r11,1),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja L$mul_page_walk
+ jmp L$mul_page_walk_done
+
+.p2align 4
+L$mul_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja L$mul_page_walk
+L$mul_page_walk_done:
+
+ movq %rax,8(%rsp,%r9,8)
- movq %r11,8(%rsp,%r9,8)
L$mul_body:
movq %rdx,%r12
movq (%r8),%r8
@@ -186,51 +220,86 @@ L$sub: sbbq (%rcx,%r14,8),%rax
sbbq $0,%rax
xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
movq %r9,%r15
+ orq %rcx,%rsi
.p2align 4
L$copy:
- movq (%rsp,%r14,8),%rsi
- movq (%rdi,%r14,8),%rcx
- xorq %rcx,%rsi
- andq %rax,%rsi
- xorq %rcx,%rsi
+ movq (%rsi,%r14,8),%rax
movq %r14,(%rsp,%r14,8)
- movq %rsi,(%rdi,%r14,8)
+ movq %rax,(%rdi,%r14,8)
leaq 1(%r14),%r14
subq $1,%r15
jnz L$copy
movq 8(%rsp,%r9,8),%rsi
+
movq $1,%rax
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$mul_epilogue:
.byte 0xf3,0xc3
+
.p2align 4
bn_mul4x_mont:
+
+ movl %r9d,%r9d
+ movq %rsp,%rax
+
L$mul4x_enter:
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
- movl %r9d,%r9d
- leaq 4(%r9),%r10
+
+ negq %r9
movq %rsp,%r11
- negq %r10
- leaq (%rsp,%r10,8),%rsp
- andq $-1024,%rsp
+ leaq -32(%rsp,%r9,8),%r10
+ negq %r9
+ andq $-1024,%r10
+
+ subq %r10,%r11
+ andq $-4096,%r11
+ leaq (%r10,%r11,1),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja L$mul4x_page_walk
+ jmp L$mul4x_page_walk_done
+
+L$mul4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja L$mul4x_page_walk
+L$mul4x_page_walk_done:
+
+ movq %rax,8(%rsp,%r9,8)
- movq %r11,8(%rsp,%r9,8)
L$mul4x_body:
movq %rdi,16(%rsp,%r9,8)
movq %rdx,%r12
@@ -530,9 +599,11 @@ L$inner4x:
cmpq %r9,%r14
jb L$outer4x
movq 16(%rsp,%r9,8),%rdi
+ leaq -4(%r9),%r15
movq 0(%rsp),%rax
+ pxor %xmm0,%xmm0
movq 8(%rsp),%rdx
- shrq $2,%r9
+ shrq $2,%r15
leaq (%rsp),%rsi
xorq %r14,%r14
@@ -540,7 +611,6 @@ L$inner4x:
movq 16(%rsi),%rbx
movq 24(%rsi),%rbp
sbbq 8(%rcx),%rdx
- leaq -1(%r9),%r15
jmp L$sub4x
.p2align 4
L$sub4x:
@@ -568,62 +638,79 @@ L$sub4x:
movq %rbx,16(%rdi,%r14,8)
sbbq $0,%rax
- movq %rax,%xmm0
- punpcklqdq %xmm0,%xmm0
movq %rbp,24(%rdi,%r14,8)
xorq %r14,%r14
-
- movq %r9,%r15
- pxor %xmm5,%xmm5
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
+ leaq -4(%r9),%r15
+ orq %rcx,%rsi
+ shrq $2,%r15
+
+ movdqu (%rsi),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,(%rdi)
jmp L$copy4x
.p2align 4
L$copy4x:
- movdqu (%rsp,%r14,1),%xmm2
- movdqu 16(%rsp,%r14,1),%xmm4
- movdqu (%rdi,%r14,1),%xmm1
- movdqu 16(%rdi,%r14,1),%xmm3
- pxor %xmm1,%xmm2
- pxor %xmm3,%xmm4
- pand %xmm0,%xmm2
- pand %xmm0,%xmm4
- pxor %xmm1,%xmm2
- pxor %xmm3,%xmm4
- movdqu %xmm2,(%rdi,%r14,1)
- movdqu %xmm4,16(%rdi,%r14,1)
- movdqa %xmm5,(%rsp,%r14,1)
- movdqa %xmm5,16(%rsp,%r14,1)
-
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqu 32(%rsi,%r14,1),%xmm1
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
+ movdqa %xmm0,32(%rsp,%r14,1)
+ movdqu %xmm1,32(%rdi,%r14,1)
leaq 32(%r14),%r14
decq %r15
jnz L$copy4x
- shlq $2,%r9
+ movdqu 16(%rsi,%r14,1),%xmm2
+ movdqa %xmm0,16(%rsp,%r14,1)
+ movdqu %xmm2,16(%rdi,%r14,1)
movq 8(%rsp,%r9,8),%rsi
+
movq $1,%rax
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+
+ movq -40(%rsi),%r14
+
+ movq -32(%rsi),%r13
+
+ movq -24(%rsi),%r12
+
+ movq -16(%rsi),%rbp
+
+ movq -8(%rsi),%rbx
+
+ leaq (%rsi),%rsp
+
L$mul4x_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
bn_sqr8x_mont:
-L$sqr8x_enter:
+
movq %rsp,%rax
+
+L$sqr8x_enter:
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+L$sqr8x_prologue:
+
movl %r9d,%r10d
shll $3,%r9d
shlq $3+2,%r10
@@ -635,30 +722,49 @@ L$sqr8x_enter:
leaq -64(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
movq (%r8),%r8
subq %rsi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb L$sqr8x_sp_alt
- subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ subq %r11,%rbp
+ leaq -64(%rbp,%r9,2),%rbp
jmp L$sqr8x_sp_done
.p2align 5
L$sqr8x_sp_alt:
leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -64(%rbp,%r9,2),%rbp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
- subq %r11,%rsp
+ subq %r11,%rbp
L$sqr8x_sp_done:
- andq $-64,%rsp
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$sqr8x_page_walk
+ jmp L$sqr8x_page_walk_done
+
+.p2align 4
+L$sqr8x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$sqr8x_page_walk
+L$sqr8x_page_walk_done:
+
movq %r9,%r10
negq %r9
movq %r8,32(%rsp)
movq %rax,40(%rsp)
+
L$sqr8x_body:
.byte 102,72,15,110,209
@@ -705,6 +811,7 @@ L$sqr8x_sub:
pxor %xmm0,%xmm0
pshufd $0,%xmm1,%xmm1
movq 40(%rsp),%rsi
+
jmp L$sqr8x_cond_copy
.p2align 5
@@ -734,15 +841,23 @@ L$sqr8x_cond_copy:
movq $1,%rax
movq -48(%rsi),%r15
+
movq -40(%rsi),%r14
+
movq -32(%rsi),%r13
+
movq -24(%rsi),%r12
+
movq -16(%rsi),%rbp
+
movq -8(%rsi),%rbx
+
leaq (%rsi),%rsp
+
L$sqr8x_epilogue:
.byte 0xf3,0xc3
+
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 4
#endif
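
The L$mul_page_walk, L$mul4x_page_walk and L$sqr8x_page_walk loops added above probe the newly reserved stack one 4096-byte page at a time, walking down from the old stack pointer to the new one, so that a single large decrement of %rsp cannot jump past the operating system's guard page. A hedged C sketch of the probing idea, assuming an illustrative PAGE constant and function name:

    #include <stddef.h>

    #define PAGE 4096

    /* Touch one byte in every page of 'buf', from the highest address
     * down, mirroring the assembly's movq (%rsp),%r11 probe after each
     * leaq -4096(%rsp),%rsp step. The assembly first rounds the
     * distance to a page multiple (andq $-4096,%r11), so its walk lands
     * exactly on the target stack pointer. */
    static void page_walk(volatile unsigned char *buf, size_t len) {
        for (size_t off = 0; off < len; off += PAGE)
            (void)buf[len - 1 - off];
    }
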
diff --git a/mac-x86_64/crypto/bn/x86_64-mont5.S b/mac-x86_64/crypto/bn/x86_64-mont5.S
index a154cc8d..24b56de2 100644
--- a/mac-x86_64/crypto/bn/x86_64-mont5.S
+++ b/mac-x86_64/crypto/bn/x86_64-mont5.S
@@ -8,30 +8,64 @@
.p2align 6
_bn_mul_mont_gather5:
+
+ movl %r9d,%r9d
+ movq %rsp,%rax
+
testl $7,%r9d
jnz L$mul_enter
jmp L$mul4x_enter
.p2align 4
L$mul_enter:
- movl %r9d,%r9d
- movq %rsp,%rax
movd 8(%rsp),%xmm5
- leaq L$inc(%rip),%r10
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
- leaq 2(%r9),%r11
- negq %r11
- leaq -264(%rsp,%r11,8),%rsp
- andq $-1024,%rsp
+ negq %r9
+ movq %rsp,%r11
+ leaq -280(%rsp,%r9,8),%r10
+ negq %r9
+ andq $-1024,%r10
+
+
+
+
+
+
+
+
+
+ subq %r10,%r11
+ andq $-4096,%r11
+ leaq (%r10,%r11,1),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja L$mul_page_walk
+ jmp L$mul_page_walk_done
+
+L$mul_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja L$mul_page_walk
+L$mul_page_walk_done:
+
+ leaq L$inc(%rip),%r10
movq %rax,8(%rsp,%r9,8)
+
L$mul_body:
+
leaq 128(%rdx),%r12
movdqa 0(%r10),%xmm0
movdqa 16(%r10),%xmm1
@@ -370,46 +404,65 @@ L$sub: sbbq (%rcx,%r14,8),%rax
sbbq $0,%rax
xorq %r14,%r14
+ andq %rax,%rsi
+ notq %rax
+ movq %rdi,%rcx
+ andq %rax,%rcx
movq %r9,%r15
+ orq %rcx,%rsi
.p2align 4
L$copy:
- movq (%rsp,%r14,8),%rsi
- movq (%rdi,%r14,8),%rcx
- xorq %rcx,%rsi
- andq %rax,%rsi
- xorq %rcx,%rsi
+ movq (%rsi,%r14,8),%rax
movq %r14,(%rsp,%r14,8)
- movq %rsi,(%rdi,%r14,8)
+ movq %rax,(%rdi,%r14,8)
leaq 1(%r14),%r14
subq $1,%r15
jnz L$copy
movq 8(%rsp,%r9,8),%rsi
+
movq $1,%rax
movq -48(%rsi),%r15
+
movq -40(%rsi),%r14
+
movq -32(%rsi),%r13
+
movq -24(%rsi),%r12
+
movq -16(%rsi),%rbp
+
movq -8(%rsi),%rbx
+
leaq (%rsi),%rsp
+
L$mul_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
bn_mul4x_mont_gather5:
-L$mul4x_enter:
+
.byte 0x67
movq %rsp,%rax
+
+L$mul4x_enter:
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+L$mul4x_prologue:
+
.byte 0x67
shll $3,%r9d
leaq (%r9,%r9,2),%r10
@@ -425,46 +478,73 @@ L$mul4x_enter:
leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb L$mul4xsp_alt
- subq %r11,%rsp
- leaq -320(%rsp,%r9,2),%rsp
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
jmp L$mul4xsp_done
.p2align 5
L$mul4xsp_alt:
leaq 4096-320(,%r9,2),%r10
- leaq -320(%rsp,%r9,2),%rsp
+ leaq -320(%rbp,%r9,2),%rbp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
- subq %r11,%rsp
+ subq %r11,%rbp
L$mul4xsp_done:
- andq $-64,%rsp
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$mul4x_page_walk
+ jmp L$mul4x_page_walk_done
+
+L$mul4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$mul4x_page_walk
+L$mul4x_page_walk_done:
+
negq %r9
movq %rax,40(%rsp)
+
L$mul4x_body:
call mul4x_internal
movq 40(%rsp),%rsi
+
movq $1,%rax
movq -48(%rsi),%r15
+
movq -40(%rsi),%r14
+
movq -32(%rsi),%r13
+
movq -24(%rsi),%r12
+
movq -16(%rsi),%rbp
+
movq -8(%rsi),%rbx
+
leaq (%rsi),%rsp
+
L$mul4x_epilogue:
.byte 0xf3,0xc3
+
.p2align 5
mul4x_internal:
shlq $5,%r9
@@ -994,14 +1074,23 @@ L$inner4x:
.p2align 5
_bn_power5:
+
movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+L$power5_prologue:
+
shll $3,%r9d
leal (%r9,%r9,2),%r10d
negq %r9
@@ -1015,24 +1104,41 @@ _bn_power5:
leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb L$pwr_sp_alt
- subq %r11,%rsp
- leaq -320(%rsp,%r9,2),%rsp
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
jmp L$pwr_sp_done
.p2align 5
L$pwr_sp_alt:
leaq 4096-320(,%r9,2),%r10
- leaq -320(%rsp,%r9,2),%rsp
+ leaq -320(%rbp,%r9,2),%rbp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
- subq %r11,%rsp
+ subq %r11,%rbp
L$pwr_sp_done:
- andq $-64,%rsp
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$pwr_page_walk
+ jmp L$pwr_page_walk_done
+
+L$pwr_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$pwr_page_walk
+L$pwr_page_walk_done:
+
movq %r9,%r10
negq %r9
@@ -1047,6 +1153,7 @@ L$pwr_sp_done:
movq %r8,32(%rsp)
movq %rax,40(%rsp)
+
L$power5_body:
.byte 102,72,15,110,207
.byte 102,72,15,110,209
@@ -1073,18 +1180,27 @@ L$power5_body:
call mul4x_internal
movq 40(%rsp),%rsi
+
movq $1,%rax
movq -48(%rsi),%r15
+
movq -40(%rsi),%r14
+
movq -32(%rsi),%r13
+
movq -24(%rsi),%r12
+
movq -16(%rsi),%rbp
+
movq -8(%rsi),%rbx
+
leaq (%rsi),%rsp
+
L$power5_epilogue:
.byte 0xf3,0xc3
+
.globl _bn_sqr8x_internal
.private_extern _bn_sqr8x_internal
.private_extern _bn_sqr8x_internal
@@ -1935,15 +2051,24 @@ _bn_from_montgomery:
.p2align 5
bn_from_mont8x:
+
.byte 0x67
movq %rsp,%rax
+
pushq %rbx
+
pushq %rbp
+
pushq %r12
+
pushq %r13
+
pushq %r14
+
pushq %r15
+L$from_prologue:
+
shll $3,%r9d
leaq (%r9,%r9,2),%r10
negq %r9
@@ -1957,24 +2082,41 @@ bn_from_mont8x:
leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb L$from_sp_alt
- subq %r11,%rsp
- leaq -320(%rsp,%r9,2),%rsp
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
jmp L$from_sp_done
.p2align 5
L$from_sp_alt:
leaq 4096-320(,%r9,2),%r10
- leaq -320(%rsp,%r9,2),%rsp
+ leaq -320(%rbp,%r9,2),%rbp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
- subq %r11,%rsp
+ subq %r11,%rbp
L$from_sp_done:
- andq $-64,%rsp
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$from_page_walk
+ jmp L$from_page_walk_done
+
+L$from_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja L$from_page_walk
+L$from_page_walk_done:
+
movq %r9,%r10
negq %r9
@@ -1989,6 +2131,7 @@ L$from_sp_done:
movq %r8,32(%rsp)
movq %rax,40(%rsp)
+
L$from_body:
movq %r9,%r11
leaq 48(%rsp),%rax
@@ -2024,11 +2167,12 @@ L$mul_by_1:
pxor %xmm0,%xmm0
leaq 48(%rsp),%rax
- movq 40(%rsp),%rsi
jmp L$from_mont_zero
.p2align 5
L$from_mont_zero:
+ movq 40(%rsp),%rsi
+
movdqa %xmm0,0(%rax)
movdqa %xmm0,16(%rax)
movdqa %xmm0,32(%rax)
@@ -2039,15 +2183,23 @@ L$from_mont_zero:
movq $1,%rax
movq -48(%rsi),%r15
+
movq -40(%rsi),%r14
+
movq -32(%rsi),%r13
+
movq -24(%rsi),%r12
+
movq -16(%rsi),%rbp
+
movq -8(%rsi),%rbx
+
leaq (%rsi),%rsp
+
L$from_epilogue:
.byte 0xf3,0xc3
+
.globl _bn_scatter5
.private_extern _bn_scatter5
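
In both Montgomery files above, the final L$copy loop changes shape: the borrow mask produced after L$sub (sbbq $0,%rax) now selects the source pointer once, via the andq/notq/andq/orq sequence on %rsi and %rdi, and the loop body becomes a plain load/store that also scrubs the scratch area on %rsp. A C sketch of the constant-time selection, with illustrative names:

    #include <stddef.h>
    #include <stdint.h>

    /* 'mask' is all-ones when the subtraction borrowed (keep the
     * unreduced value in tmp) and all-zeros otherwise (keep the
     * reduced value already written to out). The pointer select is
     * branch-free: src = (tmp & mask) | (out & ~mask). */
    static void ct_copy(uint64_t *out, uint64_t *tmp, size_t n,
                        uint64_t mask) {
        const uint64_t *src = (const uint64_t *)
            (((uintptr_t)tmp & (uintptr_t)mask) |
             ((uintptr_t)out & ~(uintptr_t)mask));
        for (size_t i = 0; i < n; i++) {
            uint64_t w = src[i];
            tmp[i] = (uint64_t)i;   /* movq %r14,(%rsp,%r14,8): wipe */
            out[i] = w;
        }
    }
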
diff --git a/mac-x86_64/crypto/chacha/chacha-x86_64.S b/mac-x86_64/crypto/chacha/chacha-x86_64.S
index c3554c8d..51c0caa7 100644
--- a/mac-x86_64/crypto/chacha/chacha-x86_64.S
+++ b/mac-x86_64/crypto/chacha/chacha-x86_64.S
@@ -22,6 +22,15 @@ L$rot24:
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
L$sigma:
.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.p2align 6
+L$zeroz:
+.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+L$fourz:
+.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+L$incz:
+.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+L$sixteen:
+.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.globl _ChaCha20_ctr32
.private_extern _ChaCha20_ctr32
@@ -41,6 +50,7 @@ _ChaCha20_ctr32:
pushq %r14
pushq %r15
subq $64+24,%rsp
+L$ctr32_body:
movdqu (%rcx),%xmm1
@@ -278,13 +288,14 @@ L$oop_tail:
jnz L$oop_tail
L$done:
- addq $64+24,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbp
- popq %rbx
+ leaq 64+24+48(%rsp),%rsi
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
L$no_data:
.byte 0xf3,0xc3
@@ -292,18 +303,12 @@ L$no_data:
.p2align 5
ChaCha20_ssse3:
L$ChaCha20_ssse3:
+ movq %rsp,%r9
cmpq $128,%rdx
ja L$ChaCha20_4x
L$do_sse3_after_all:
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-
- subq $64+24,%rsp
+ subq $64+8,%rsp
movdqa L$sigma(%rip),%xmm0
movdqu (%rcx),%xmm1
movdqu 16(%rcx),%xmm2
@@ -315,7 +320,7 @@ L$do_sse3_after_all:
movdqa %xmm1,16(%rsp)
movdqa %xmm2,32(%rsp)
movdqa %xmm3,48(%rsp)
- movl $10,%ebp
+ movq $10,%r8
jmp L$oop_ssse3
.p2align 5
@@ -325,7 +330,7 @@ L$oop_outer_ssse3:
movdqa 16(%rsp),%xmm1
movdqa 32(%rsp),%xmm2
paddd 48(%rsp),%xmm3
- movl $10,%ebp
+ movq $10,%r8
movdqa %xmm3,48(%rsp)
jmp L$oop_ssse3
@@ -374,7 +379,7 @@ L$oop_ssse3:
pshufd $78,%xmm2,%xmm2
pshufd $147,%xmm1,%xmm1
pshufd $57,%xmm3,%xmm3
- decl %ebp
+ decq %r8
jnz L$oop_ssse3
paddd 0(%rsp),%xmm0
paddd 16(%rsp),%xmm1
@@ -411,31 +416,27 @@ L$tail_ssse3:
movdqa %xmm1,16(%rsp)
movdqa %xmm2,32(%rsp)
movdqa %xmm3,48(%rsp)
- xorq %rbx,%rbx
+ xorq %r8,%r8
L$oop_tail_ssse3:
- movzbl (%rsi,%rbx,1),%eax
- movzbl (%rsp,%rbx,1),%ecx
- leaq 1(%rbx),%rbx
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
xorl %ecx,%eax
- movb %al,-1(%rdi,%rbx,1)
+ movb %al,-1(%rdi,%r8,1)
decq %rdx
jnz L$oop_tail_ssse3
L$done_ssse3:
- addq $64+24,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbp
- popq %rbx
+ leaq (%r9),%rsp
+L$ssse3_epilogue:
.byte 0xf3,0xc3
.p2align 5
ChaCha20_4x:
L$ChaCha20_4x:
+ movq %rsp,%r9
movq %r10,%r11
shrq $32,%r10
testq $32,%r10
@@ -448,8 +449,7 @@ L$ChaCha20_4x:
je L$do_sse3_after_all
L$proceed4x:
- leaq -120(%rsp),%r11
- subq $0x148+0,%rsp
+ subq $0x140+8,%rsp
movdqa L$sigma(%rip),%xmm11
movdqu (%rcx),%xmm15
movdqu 16(%rcx),%xmm7
@@ -976,18 +976,18 @@ L$oop_tail4x:
jnz L$oop_tail4x
L$done4x:
- addq $0x148+0,%rsp
+ leaq (%r9),%rsp
+L$4x_epilogue:
.byte 0xf3,0xc3
.p2align 5
ChaCha20_8x:
L$ChaCha20_8x:
- movq %rsp,%r10
+ movq %rsp,%r9
subq $0x280+8,%rsp
andq $-32,%rsp
vzeroupper
- movq %r10,640(%rsp)
@@ -1578,7 +1578,8 @@ L$oop_tail8x:
L$done8x:
vzeroall
- movq 640(%rsp),%rsp
+ leaq (%r9),%rsp
+L$8x_epilogue:
.byte 0xf3,0xc3
#endif
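
The L$zeroz, L$fourz, L$incz and L$sixteen tables added at the top of this file hold per-lane block-counter offsets for the vectorized ChaCha20 paths: lane l of an n-way pass starts at counter base + l and all lanes advance by n per iteration. A small sketch of that counter discipline (function names are illustrative, and which table feeds which path is an assumption from the lane counts):

    #include <stdint.h>

    /* Lane l of an n-way pass encrypts block base + l; each iteration
     * then advances every lane by n (paddd with the 4,4,4,4 or
     * 16,16,...,16 constant). */
    static void init_lane_counters(uint32_t base, uint32_t *ctr, int lanes) {
        for (int l = 0; l < lanes; l++)
            ctr[l] = base + (uint32_t)l;
    }

    static void step_lane_counters(uint32_t *ctr, int lanes) {
        for (int l = 0; l < lanes; l++)
            ctr[l] += (uint32_t)lanes;
    }
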
diff --git a/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S b/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S
index 03cd8725..62d114d9 100644
--- a/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S
+++ b/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S
@@ -44,7 +44,7 @@ chacha20_poly1305_constants:
.p2align 6
poly_hash_ad_internal:
-.cfi_startproc
+
xorq %r10,%r10
xorq %r11,%r11
xorq %r12,%r12
@@ -207,7 +207,7 @@ hash_ad_tail_loop:
1:
.byte 0xf3,0xc3
-.cfi_endproc
+
.globl _chacha20_poly1305_open
@@ -215,31 +215,31 @@ hash_ad_tail_loop:
.p2align 6
_chacha20_poly1305_open:
-.cfi_startproc
+
pushq %rbp
-.cfi_adjust_cfa_offset 8
+
pushq %rbx
-.cfi_adjust_cfa_offset 8
+
pushq %r12
-.cfi_adjust_cfa_offset 8
+
pushq %r13
-.cfi_adjust_cfa_offset 8
+
pushq %r14
-.cfi_adjust_cfa_offset 8
+
pushq %r15
-.cfi_adjust_cfa_offset 8
+
pushq %r9
-.cfi_adjust_cfa_offset 8
+
subq $288 + 32,%rsp
-.cfi_adjust_cfa_offset 288 + 32
-.cfi_offset rbp, -16
-.cfi_offset rbx, -24
-.cfi_offset r12, -32
-.cfi_offset r13, -40
-.cfi_offset r14, -48
-.cfi_offset r15, -56
+
+
+
+
+
+
+
leaq 32(%rsp),%rbp
andq $-32,%rbp
movq %rdx,8+32(%rbp)
@@ -1834,26 +1834,26 @@ open_sse_finalize:
adcq 8+16(%rbp),%r11
addq $288 + 32,%rsp
-.cfi_adjust_cfa_offset -(288 + 32)
+
popq %r9
-.cfi_adjust_cfa_offset -8
+
movq %r10,(%r9)
movq %r11,8(%r9)
popq %r15
-.cfi_adjust_cfa_offset -8
+
popq %r14
-.cfi_adjust_cfa_offset -8
+
popq %r13
-.cfi_adjust_cfa_offset -8
+
popq %r12
-.cfi_adjust_cfa_offset -8
+
popq %rbx
-.cfi_adjust_cfa_offset -8
+
popq %rbp
-.cfi_adjust_cfa_offset -8
+
.byte 0xf3,0xc3
-.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
+
open_sse_128:
movdqu .chacha20_consts(%rip),%xmm0
@@ -2086,7 +2086,7 @@ open_sse_128:
jmp 1b
jmp open_sse_tail_16
-.cfi_endproc
+
@@ -2096,31 +2096,31 @@ open_sse_128:
.p2align 6
_chacha20_poly1305_seal:
-.cfi_startproc
+
pushq %rbp
-.cfi_adjust_cfa_offset 8
+
pushq %rbx
-.cfi_adjust_cfa_offset 8
+
pushq %r12
-.cfi_adjust_cfa_offset 8
+
pushq %r13
-.cfi_adjust_cfa_offset 8
+
pushq %r14
-.cfi_adjust_cfa_offset 8
+
pushq %r15
-.cfi_adjust_cfa_offset 8
+
pushq %r9
-.cfi_adjust_cfa_offset 8
+
subq $288 + 32,%rsp
-.cfi_adjust_cfa_offset 288 + 32
-.cfi_offset rbp, -16
-.cfi_offset rbx, -24
-.cfi_offset r12, -32
-.cfi_offset r13, -40
-.cfi_offset r14, -48
-.cfi_offset r15, -56
+
+
+
+
+
+
+
leaq 32(%rsp),%rbp
andq $-32,%rbp
movq %rdx,8+32(%rbp)
@@ -3717,26 +3717,26 @@ seal_sse_finalize:
adcq 8+16(%rbp),%r11
addq $288 + 32,%rsp
-.cfi_adjust_cfa_offset -(288 + 32)
+
popq %r9
-.cfi_adjust_cfa_offset -8
+
movq %r10,0(%r9)
movq %r11,8(%r9)
popq %r15
-.cfi_adjust_cfa_offset -8
+
popq %r14
-.cfi_adjust_cfa_offset -8
+
popq %r13
-.cfi_adjust_cfa_offset -8
+
popq %r12
-.cfi_adjust_cfa_offset -8
+
popq %rbx
-.cfi_adjust_cfa_offset -8
+
popq %rbp
-.cfi_adjust_cfa_offset -8
+
.byte 0xf3,0xc3
-.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
+
seal_sse_128:
movdqu .chacha20_consts(%rip),%xmm0
@@ -8783,5 +8783,5 @@ seal_avx2_short_tail:
1:
vzeroupper
jmp seal_sse_tail_16
-.cfi_endproc
+
#endif
diff --git a/mac-x86_64/crypto/modes/ghash-x86_64.S b/mac-x86_64/crypto/modes/ghash-x86_64.S
index 334f83ff..814d7961 100644
--- a/mac-x86_64/crypto/modes/ghash-x86_64.S
+++ b/mac-x86_64/crypto/modes/ghash-x86_64.S
@@ -10,6 +10,10 @@ _gcm_gmult_4bit:
pushq %rbx
pushq %rbp
pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $280,%rsp
L$gmult_prologue:
movzbq 15(%rdi),%r8
@@ -86,8 +90,9 @@ L$break1:
movq %r8,8(%rdi)
movq %r9,(%rdi)
- movq 16(%rsp),%rbx
- leaq 24(%rsp),%rsp
+ leaq 280+48(%rsp),%rsi
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
L$gmult_epilogue:
.byte 0xf3,0xc3
@@ -647,14 +652,14 @@ L$outer_loop:
movq %r8,8(%rdi)
movq %r9,(%rdi)
- leaq 280(%rsp),%rsi
- movq 0(%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ leaq 280+48(%rsp),%rsi
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq 0(%rsi),%rsp
L$ghash_epilogue:
.byte 0xf3,0xc3
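
gcm_gmult_4bit now reserves the same 280-byte frame as gcm_ghash_4bit and saves %r13-%r15 as well, so both functions restore through the same 280+48 end-of-frame pointer (gmult itself only needs %rbx back). The frame arithmetic, as a sketch:

    /* 280 bytes of scratch plus 6 * 8 = 48 bytes of pushed registers;
     * both epilogues reload downward from %rsi = %rsp + 280 + 48. */
    enum {
        GHASH_SCRATCH = 280,
        GHASH_SAVED   = 6 * 8,
        GHASH_TOP     = GHASH_SCRATCH + GHASH_SAVED  /* 328 */
    };
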
diff --git a/mac-x86_64/crypto/sha/sha1-x86_64.S b/mac-x86_64/crypto/sha/sha1-x86_64.S
index 0509d451..cf45d8ab 100644
--- a/mac-x86_64/crypto/sha/sha1-x86_64.S
+++ b/mac-x86_64/crypto/sha/sha1-x86_64.S
@@ -1240,14 +1240,13 @@ L$epilogue:
.p2align 4
sha1_block_data_order_ssse3:
_ssse3_shortcut:
- movq %rsp,%rax
+ movq %rsp,%r11
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
leaq -64(%rsp),%rsp
- movq %rax,%r14
andq $-64,%rsp
movq %rdi,%r8
movq %rsi,%r9
@@ -1255,7 +1254,7 @@ _ssse3_shortcut:
shlq $6,%r10
addq %r9,%r10
- leaq K_XX_XX+64(%rip),%r11
+ leaq K_XX_XX+64(%rip),%r14
movl 0(%r8),%eax
movl 4(%r8),%ebx
@@ -1267,8 +1266,8 @@ _ssse3_shortcut:
xorl %edx,%edi
andl %edi,%esi
- movdqa 64(%r11),%xmm6
- movdqa -64(%r11),%xmm9
+ movdqa 64(%r14),%xmm6
+ movdqa -64(%r14),%xmm9
movdqu 0(%r9),%xmm0
movdqu 16(%r9),%xmm1
movdqu 32(%r9),%xmm2
@@ -1344,7 +1343,7 @@ L$oop_ssse3:
pslld $2,%xmm9
pxor %xmm10,%xmm4
xorl %ebp,%edx
- movdqa -64(%r11),%xmm10
+ movdqa -64(%r14),%xmm10
roll $5,%ecx
addl %edi,%ebx
andl %edx,%esi
@@ -1405,7 +1404,7 @@ L$oop_ssse3:
pslld $2,%xmm10
pxor %xmm8,%xmm5
xorl %eax,%ebp
- movdqa -32(%r11),%xmm8
+ movdqa -32(%r14),%xmm8
roll $5,%edx
addl %edi,%ecx
andl %ebp,%esi
@@ -1466,7 +1465,7 @@ L$oop_ssse3:
pslld $2,%xmm8
pxor %xmm9,%xmm6
xorl %ebx,%eax
- movdqa -32(%r11),%xmm9
+ movdqa -32(%r14),%xmm9
roll $5,%ebp
addl %edi,%edx
andl %eax,%esi
@@ -1527,7 +1526,7 @@ L$oop_ssse3:
pslld $2,%xmm9
pxor %xmm10,%xmm7
xorl %ecx,%ebx
- movdqa -32(%r11),%xmm10
+ movdqa -32(%r14),%xmm10
roll $5,%eax
addl %edi,%ebp
andl %ebx,%esi
@@ -1638,7 +1637,7 @@ L$oop_ssse3:
pxor %xmm3,%xmm2
addl %esi,%eax
xorl %edx,%edi
- movdqa 0(%r11),%xmm10
+ movdqa 0(%r14),%xmm10
rorl $7,%ecx
paddd %xmm1,%xmm9
addl %ebx,%eax
@@ -1873,7 +1872,7 @@ L$oop_ssse3:
pxor %xmm0,%xmm7
roll $5,%ebx
addl %esi,%eax
- movdqa 32(%r11),%xmm9
+ movdqa 32(%r14),%xmm9
xorl %ecx,%edi
paddd %xmm6,%xmm8
xorl %edx,%ecx
@@ -2164,8 +2163,8 @@ L$oop_ssse3:
addl %edx,%ecx
cmpq %r10,%r9
je L$done_ssse3
- movdqa 64(%r11),%xmm6
- movdqa -64(%r11),%xmm9
+ movdqa 64(%r14),%xmm6
+ movdqa -64(%r14),%xmm9
movdqu 0(%r9),%xmm0
movdqu 16(%r9),%xmm1
movdqu 32(%r9),%xmm2
@@ -2402,13 +2401,12 @@ L$done_ssse3:
movl %ecx,8(%r8)
movl %edx,12(%r8)
movl %ebp,16(%r8)
- leaq (%r14),%rsi
- movq -40(%rsi),%r14
- movq -32(%rsi),%r13
- movq -24(%rsi),%r12
- movq -16(%rsi),%rbp
- movq -8(%rsi),%rbx
- leaq (%rsi),%rsp
+ movq -40(%r11),%r14
+ movq -32(%r11),%r13
+ movq -24(%r11),%r12
+ movq -16(%r11),%rbp
+ movq -8(%r11),%rbx
+ leaq (%r11),%rsp
L$epilogue_ssse3:
.byte 0xf3,0xc3
@@ -2416,7 +2414,7 @@ L$epilogue_ssse3:
.p2align 4
sha1_block_data_order_avx:
_avx_shortcut:
- movq %rsp,%rax
+ movq %rsp,%r11
pushq %rbx
pushq %rbp
pushq %r12
@@ -2424,7 +2422,6 @@ _avx_shortcut:
pushq %r14
leaq -64(%rsp),%rsp
vzeroupper
- movq %rax,%r14
andq $-64,%rsp
movq %rdi,%r8
movq %rsi,%r9
@@ -2432,7 +2429,7 @@ _avx_shortcut:
shlq $6,%r10
addq %r9,%r10
- leaq K_XX_XX+64(%rip),%r11
+ leaq K_XX_XX+64(%rip),%r14
movl 0(%r8),%eax
movl 4(%r8),%ebx
@@ -2444,8 +2441,8 @@ _avx_shortcut:
xorl %edx,%edi
andl %edi,%esi
- vmovdqa 64(%r11),%xmm6
- vmovdqa -64(%r11),%xmm11
+ vmovdqa 64(%r14),%xmm6
+ vmovdqa -64(%r14),%xmm11
vmovdqu 0(%r9),%xmm0
vmovdqu 16(%r9),%xmm1
vmovdqu 32(%r9),%xmm2
@@ -2570,7 +2567,7 @@ L$oop_avx:
vpxor %xmm10,%xmm5,%xmm5
xorl %eax,%ebp
shldl $5,%edx,%edx
- vmovdqa -32(%r11),%xmm11
+ vmovdqa -32(%r14),%xmm11
addl %edi,%ecx
andl %ebp,%esi
xorl %eax,%ebp
@@ -2783,7 +2780,7 @@ L$oop_avx:
addl %esi,%eax
xorl %edx,%edi
vpaddd %xmm1,%xmm11,%xmm9
- vmovdqa 0(%r11),%xmm11
+ vmovdqa 0(%r14),%xmm11
shrdl $7,%ecx,%ecx
addl %ebx,%eax
vpxor %xmm8,%xmm2,%xmm2
@@ -3002,7 +2999,7 @@ L$oop_avx:
movl %ebx,%edi
xorl %edx,%esi
vpaddd %xmm6,%xmm11,%xmm9
- vmovdqa 32(%r11),%xmm11
+ vmovdqa 32(%r14),%xmm11
shldl $5,%ebx,%ebx
addl %esi,%eax
vpxor %xmm8,%xmm7,%xmm7
@@ -3281,8 +3278,8 @@ L$oop_avx:
addl %edx,%ecx
cmpq %r10,%r9
je L$done_avx
- vmovdqa 64(%r11),%xmm6
- vmovdqa -64(%r11),%xmm11
+ vmovdqa 64(%r14),%xmm6
+ vmovdqa -64(%r14),%xmm11
vmovdqu 0(%r9),%xmm0
vmovdqu 16(%r9),%xmm1
vmovdqu 32(%r9),%xmm2
@@ -3518,13 +3515,12 @@ L$done_avx:
movl %ecx,8(%r8)
movl %edx,12(%r8)
movl %ebp,16(%r8)
- leaq (%r14),%rsi
- movq -40(%rsi),%r14
- movq -32(%rsi),%r13
- movq -24(%rsi),%r12
- movq -16(%rsi),%rbp
- movq -8(%rsi),%rbx
- leaq (%rsi),%rsp
+ movq -40(%r11),%r14
+ movq -32(%r11),%r13
+ movq -24(%r11),%r12
+ movq -16(%r11),%rbp
+ movq -8(%r11),%rbx
+ leaq (%r11),%rsp
L$epilogue_avx:
.byte 0xf3,0xc3
diff --git a/mac-x86_64/crypto/sha/sha256-x86_64.S b/mac-x86_64/crypto/sha/sha256-x86_64.S
index 0146ff5c..f00ef6da 100644
--- a/mac-x86_64/crypto/sha/sha256-x86_64.S
+++ b/mac-x86_64/crypto/sha/sha256-x86_64.S
@@ -18,13 +18,13 @@ _sha256_block_data_order:
je L$avx_shortcut
testl $512,%r10d
jnz L$ssse3_shortcut
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
- movq %rsp,%r11
shlq $4,%rdx
subq $64+32,%rsp
leaq (%rsi,%rdx,4),%rdx
@@ -32,7 +32,7 @@ _sha256_block_data_order:
movq %rdi,64+0(%rsp)
movq %rsi,64+8(%rsp)
movq %rdx,64+16(%rsp)
- movq %r11,64+24(%rsp)
+ movq %rax,64+24(%rsp)
L$prologue:
movl 0(%rdi),%eax
@@ -1697,13 +1697,13 @@ L$rounds_16_xx:
jb L$loop
movq 64+24(%rsp),%rsi
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
L$epilogue:
.byte 0xf3,0xc3
@@ -1754,13 +1754,13 @@ K256:
.p2align 6
sha256_block_data_order_ssse3:
L$ssse3_shortcut:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
- movq %rsp,%r11
shlq $4,%rdx
subq $96,%rsp
leaq (%rsi,%rdx,4),%rdx
@@ -1768,7 +1768,7 @@ L$ssse3_shortcut:
movq %rdi,64+0(%rsp)
movq %rsi,64+8(%rsp)
movq %rdx,64+16(%rsp)
- movq %r11,64+24(%rsp)
+ movq %rax,64+24(%rsp)
L$prologue_ssse3:
movl 0(%rdi),%eax
@@ -2835,13 +2835,13 @@ L$ssse3_00_47:
jb L$loop_ssse3
movq 64+24(%rsp),%rsi
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
L$epilogue_ssse3:
.byte 0xf3,0xc3
@@ -2849,13 +2849,13 @@ L$epilogue_ssse3:
.p2align 6
sha256_block_data_order_avx:
L$avx_shortcut:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
- movq %rsp,%r11
shlq $4,%rdx
subq $96,%rsp
leaq (%rsi,%rdx,4),%rdx
@@ -2863,7 +2863,7 @@ L$avx_shortcut:
movq %rdi,64+0(%rsp)
movq %rsi,64+8(%rsp)
movq %rdx,64+16(%rsp)
- movq %r11,64+24(%rsp)
+ movq %rax,64+24(%rsp)
L$prologue_avx:
vzeroupper
@@ -3892,13 +3892,13 @@ L$avx_00_47:
movq 64+24(%rsp),%rsi
vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
L$epilogue_avx:
.byte 0xf3,0xc3
diff --git a/mac-x86_64/crypto/sha/sha512-x86_64.S b/mac-x86_64/crypto/sha/sha512-x86_64.S
index aeabd3f4..eabcb3af 100644
--- a/mac-x86_64/crypto/sha/sha512-x86_64.S
+++ b/mac-x86_64/crypto/sha/sha512-x86_64.S
@@ -18,13 +18,13 @@ _sha512_block_data_order:
orl %r9d,%r10d
cmpl $1342177792,%r10d
je L$avx_shortcut
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
- movq %rsp,%r11
shlq $4,%rdx
subq $128+32,%rsp
leaq (%rsi,%rdx,8),%rdx
@@ -32,7 +32,7 @@ _sha512_block_data_order:
movq %rdi,128+0(%rsp)
movq %rsi,128+8(%rsp)
movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
+ movq %rax,128+24(%rsp)
L$prologue:
movq 0(%rdi),%rax
@@ -1697,13 +1697,13 @@ L$rounds_16_xx:
jb L$loop
movq 128+24(%rsp),%rsi
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
L$epilogue:
.byte 0xf3,0xc3
@@ -1798,13 +1798,13 @@ K512:
.p2align 6
sha512_block_data_order_xop:
L$xop_shortcut:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
- movq %rsp,%r11
shlq $4,%rdx
subq $160,%rsp
leaq (%rsi,%rdx,8),%rdx
@@ -1812,7 +1812,7 @@ L$xop_shortcut:
movq %rdi,128+0(%rsp)
movq %rsi,128+8(%rsp)
movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
+ movq %rax,128+24(%rsp)
L$prologue_xop:
vzeroupper
@@ -2867,13 +2867,13 @@ L$xop_00_47:
movq 128+24(%rsp),%rsi
vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
L$epilogue_xop:
.byte 0xf3,0xc3
@@ -2881,13 +2881,13 @@ L$epilogue_xop:
.p2align 6
sha512_block_data_order_avx:
L$avx_shortcut:
+ movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
- movq %rsp,%r11
shlq $4,%rdx
subq $160,%rsp
leaq (%rsi,%rdx,8),%rdx
@@ -2895,7 +2895,7 @@ L$avx_shortcut:
movq %rdi,128+0(%rsp)
movq %rsi,128+8(%rsp)
movq %rdx,128+16(%rsp)
- movq %r11,128+24(%rsp)
+ movq %rax,128+24(%rsp)
L$prologue_avx:
vzeroupper
@@ -4014,13 +4014,13 @@ L$avx_00_47:
movq 128+24(%rsp),%rsi
vzeroupper
- movq (%rsi),%r15
- movq 8(%rsi),%r14
- movq 16(%rsi),%r13
- movq 24(%rsi),%r12
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rbx
- leaq 48(%rsi),%rsp
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
L$epilogue_avx:
.byte 0xf3,0xc3
diff --git a/sources.bp b/sources.bp
index 8e13f1af..6da6257c 100644
--- a/sources.bp
+++ b/sources.bp
@@ -51,6 +51,7 @@ cc_defaults {
"src/crypto/asn1/tasn_new.c",
"src/crypto/asn1/tasn_typ.c",
"src/crypto/asn1/tasn_utl.c",
+ "src/crypto/asn1/time_support.c",
"src/crypto/asn1/x_bignum.c",
"src/crypto/asn1/x_long.c",
"src/crypto/base64/base64.c",
@@ -202,7 +203,6 @@ cc_defaults {
"src/crypto/thread_none.c",
"src/crypto/thread_pthread.c",
"src/crypto/thread_win.c",
- "src/crypto/time_support.c",
"src/crypto/x509/a_digest.c",
"src/crypto/x509/a_sign.c",
"src/crypto/x509/a_strex.c",
@@ -466,8 +466,12 @@ cc_defaults {
cc_defaults {
name: "boringssl_crypto_test_sources",
srcs: [
+ "src/crypto/chacha/chacha_test.cc",
"src/crypto/dh/dh_test.cc",
"src/crypto/dsa/dsa_test.cc",
+ "src/crypto/ec/ec_test.cc",
+ "src/crypto/err/err_test.cc",
+ "src/crypto/rsa/rsa_test.cc",
"src/crypto/test/gtest_main.cc",
],
}
@@ -489,7 +493,6 @@ cc_defaults {
"src/crypto/bio/bio_test.cc",
"src/crypto/bn/bn_test.cc",
"src/crypto/bytestring/bytestring_test.cc",
- "src/crypto/chacha/chacha_test.cc",
"src/crypto/cipher/aead_test.cc",
"src/crypto/cipher/cipher_test.cc",
"src/crypto/cmac/cmac_test.cc",
@@ -498,14 +501,12 @@ cc_defaults {
"src/crypto/curve25519/spake25519_test.cc",
"src/crypto/curve25519/x25519_test.cc",
"src/crypto/digest/digest_test.cc",
- "src/crypto/ec/ec_test.cc",
"src/crypto/ec/example_mul.c",
"src/crypto/ec/p256-x86_64_test.cc",
"src/crypto/ecdh/ecdh_test.cc",
"src/crypto/ecdsa/ecdsa_sign_test.cc",
"src/crypto/ecdsa/ecdsa_test.cc",
"src/crypto/ecdsa/ecdsa_verify_test.cc",
- "src/crypto/err/err_test.cc",
"src/crypto/evp/evp_extra_test.cc",
"src/crypto/evp/evp_test.cc",
"src/crypto/evp/pbkdf_test.cc",
@@ -519,7 +520,6 @@ cc_defaults {
"src/crypto/poly1305/poly1305_test.cc",
"src/crypto/pool/pool_test.cc",
"src/crypto/refcount_test.cc",
- "src/crypto/rsa/rsa_test.cc",
"src/crypto/thread_test.c",
"src/crypto/x509/pkcs7_test.c",
"src/crypto/x509/x509_test.cc",
diff --git a/sources.mk b/sources.mk
index ebea6b9d..2c9cfa2e 100644
--- a/sources.mk
+++ b/sources.mk
@@ -49,6 +49,7 @@ crypto_sources := \
src/crypto/asn1/tasn_new.c\
src/crypto/asn1/tasn_typ.c\
src/crypto/asn1/tasn_utl.c\
+ src/crypto/asn1/time_support.c\
src/crypto/asn1/x_bignum.c\
src/crypto/asn1/x_long.c\
src/crypto/base64/base64.c\
@@ -200,7 +201,6 @@ crypto_sources := \
src/crypto/thread_none.c\
src/crypto/thread_pthread.c\
src/crypto/thread_win.c\
- src/crypto/time_support.c\
src/crypto/x509/a_digest.c\
src/crypto/x509/a_sign.c\
src/crypto/x509/a_strex.c\
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e15df7a5..2abf6166 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -240,10 +240,6 @@ endif()
# googletest has a very straightforward build.
add_library(gtest third_party/googletest/src/gtest-all.cc)
target_include_directories(gtest PRIVATE third_party/googletest)
-if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
- # TODO(davidben): Make googletest pass -Wmissing-declarations.
- set_target_properties(gtest PROPERTIES COMPILE_FLAGS "-Wno-missing-declarations")
-endif()
include_directories(third_party/googletest/include)
diff --git a/src/crypto/CMakeLists.txt b/src/crypto/CMakeLists.txt
index bbc68d00..fbfc4b27 100644
--- a/src/crypto/CMakeLists.txt
+++ b/src/crypto/CMakeLists.txt
@@ -129,7 +129,6 @@ add_library(
thread_none.c
thread_pthread.c
thread_win.c
- time_support.c
$<TARGET_OBJECTS:stack>
$<TARGET_OBJECTS:lhash>
@@ -212,9 +211,12 @@ add_dependencies(all_tests refcount_test)
add_executable(
crypto_test
+ chacha/chacha_test.cc
dh/dh_test.cc
dsa/dsa_test.cc
+ ec/ec_test.cc
err/err_test.cc
+ rsa/rsa_test.cc
$<TARGET_OBJECTS:gtest_main>
$<TARGET_OBJECTS:test_support>
diff --git a/src/crypto/aes/asm/aes-586.pl b/src/crypto/aes/asm/aes-586.pl
index 9e6e1cc0..45c19fb1 100755
--- a/src/crypto/aes/asm/aes-586.pl
+++ b/src/crypto/aes/asm/aes-586.pl
@@ -116,7 +116,7 @@
# words every cache-line is *guaranteed* to be accessed within ~50
# cycles window. Why just SSE? Because it's needed on hyper-threading
# CPU! Which is also why it's prefetched with 64 byte stride. Best
-# part is that it has no negative effect on performance:-)
+# part is that it has no negative effect on performance:-)
#
# Version 4.3 implements switch between compact and non-compact block
# functions in AES_cbc_encrypt depending on how much data was asked
@@ -578,7 +578,7 @@ sub enctransform()
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
# | mm4 | mm0 |
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-# | s3 | s2 | s1 | s0 |
+# | s3 | s2 | s1 | s0 |
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
@@ -798,7 +798,7 @@ sub encstep()
if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
- else { &mov ($tmp,$s[3]);
+ else { &mov ($tmp,$s[3]);
&shr ($tmp,24) }
&xor ($out,&DWP(1,$te,$tmp,8));
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
@@ -1551,7 +1551,7 @@ sub sse_deccompact()
&pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
&pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
&pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
- &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
&pxor ("mm3","mm3"); &pxor ("mm7","mm7");
&pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
@@ -2021,7 +2021,7 @@ sub declast()
{
# stack frame layout
# -4(%esp) # return address 0(%esp)
-# 0(%esp) # s0 backing store 4(%esp)
+# 0(%esp) # s0 backing store 4(%esp)
# 4(%esp) # s1 backing store 8(%esp)
# 8(%esp) # s2 backing store 12(%esp)
# 12(%esp) # s3 backing store 16(%esp)
@@ -2731,7 +2731,7 @@ sub enckey()
&mov (&DWP(80,"edi"),10); # setup number of rounds
&xor ("eax","eax");
&jmp (&label("exit"));
-
+
&set_label("12rounds");
&mov ("eax",&DWP(0,"esi")); # copy first 6 dwords
&mov ("ebx",&DWP(4,"esi"));
diff --git a/src/crypto/aes/asm/aes-x86_64.pl b/src/crypto/aes/asm/aes-x86_64.pl
index ed489af1..abf957cc 100644..100755
--- a/src/crypto/aes/asm/aes-x86_64.pl
+++ b/src/crypto/aes/asm/aes-x86_64.pl
@@ -590,6 +590,7 @@ $code.=<<___;
.type asm_AES_encrypt,\@function,3
.hidden asm_AES_encrypt
asm_AES_encrypt:
+ mov %rsp,%rax
push %rbx
push %rbp
push %r12
@@ -598,7 +599,6 @@ asm_AES_encrypt:
push %r15
# allocate frame "above" key schedule
- mov %rsp,%r10
lea -63(%rdx),%rcx # %rdx is key argument
and \$-64,%rsp
sub %rsp,%rcx
@@ -608,7 +608,7 @@ asm_AES_encrypt:
sub \$32,%rsp
mov %rsi,16(%rsp) # save out
- mov %r10,24(%rsp) # save real stack pointer
+ mov %rax,24(%rsp) # save original stack pointer
.Lenc_prologue:
mov %rdx,$key
@@ -640,13 +640,13 @@ asm_AES_encrypt:
mov $s2,8($out)
mov $s3,12($out)
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lenc_epilogue:
ret
.size asm_AES_encrypt,.-asm_AES_encrypt
@@ -1186,6 +1186,7 @@ $code.=<<___;
.type asm_AES_decrypt,\@function,3
.hidden asm_AES_decrypt
asm_AES_decrypt:
+ mov %rsp,%rax
push %rbx
push %rbp
push %r12
@@ -1194,7 +1195,6 @@ asm_AES_decrypt:
push %r15
# allocate frame "above" key schedule
- mov %rsp,%r10
lea -63(%rdx),%rcx # %rdx is key argument
and \$-64,%rsp
sub %rsp,%rcx
@@ -1204,7 +1204,7 @@ asm_AES_decrypt:
sub \$32,%rsp
mov %rsi,16(%rsp) # save out
- mov %r10,24(%rsp) # save real stack pointer
+ mov %rax,24(%rsp) # save original stack pointer
.Ldec_prologue:
mov %rdx,$key
@@ -1238,13 +1238,13 @@ asm_AES_decrypt:
mov $s2,8($out)
mov $s3,12($out)
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Ldec_epilogue:
ret
.size asm_AES_decrypt,.-asm_AES_decrypt
@@ -1286,7 +1286,7 @@ $code.=<<___;
asm_AES_set_encrypt_key:
push %rbx
push %rbp
- push %r12 # redundant, but allows to share
+ push %r12 # redundant, but allows to share
push %r13 # exception handler...
push %r14
push %r15
@@ -1412,7 +1412,7 @@ $code.=<<___;
xor %rax,%rax
jmp .Lexit
-.L14rounds:
+.L14rounds:
mov 0(%rsi),%rax # copy first 8 dwords
mov 8(%rsi),%rbx
mov 16(%rsi),%rcx
@@ -1660,10 +1660,9 @@ asm_AES_cbc_encrypt:
mov %r9d,%r9d # clear upper half of enc
lea .LAES_Te(%rip),$sbox
+ lea .LAES_Td(%rip),%r10
cmp \$0,%r9
- jne .Lcbc_picked_te
- lea .LAES_Td(%rip),$sbox
-.Lcbc_picked_te:
+ cmoveq %r10,$sbox
mov OPENSSL_ia32cap_P(%rip),%r10d
cmp \$$speed_limit,%rdx
@@ -2565,7 +2564,6 @@ block_se_handler:
jae .Lin_block_prologue
mov 24(%rax),%rax # pull saved real stack pointer
- lea 48(%rax),%rax # adjust...
mov -8(%rax),%rbx
mov -16(%rax),%rbp
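
In asm_AES_cbc_encrypt above, the encrypt/decrypt table pick loses its branch: both .LAES_Te and .LAES_Td are materialized up front and cmoveq overwrites the default when the enc argument is zero. A sketch of the branchless pick, with illustrative names:

    #include <stdint.h>

    /* Mirrors lea Te,$sbox / lea Td,%r10 / cmp $0,%r9 /
     * cmoveq %r10,$sbox: both candidate addresses are computed, then
     * one is kept. The ternary below typically compiles to a
     * conditional move rather than a branch. */
    static const uint32_t *pick_table(const uint32_t *te,
                                      const uint32_t *td, int enc) {
        return enc ? te : td;
    }
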
diff --git a/src/crypto/aes/asm/aesni-x86.pl b/src/crypto/aes/asm/aesni-x86.pl
index 4ef84bc2..e494dd16 100644
--- a/src/crypto/aes/asm/aesni-x86.pl
+++ b/src/crypto/aes/asm/aesni-x86.pl
@@ -51,7 +51,9 @@
# Westmere 3.77/1.37 1.37 1.52 1.27
# * Bridge 5.07/0.98 0.99 1.09 0.91
# Haswell 4.44/0.80 0.97 1.03 0.72
+# Skylake 2.68/0.65 0.65 0.66 0.64
# Silvermont 5.77/3.56 3.67 4.03 3.46
+# Goldmont 3.84/1.39 1.39 1.63 1.31
# Bulldozer 5.80/0.98 1.05 1.24 0.93
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
@@ -1040,7 +1042,7 @@ if ($PREFIX eq "aesni") {
&set_label("ctr32_one_shortcut",16);
&movups ($inout0,&QWP(0,$rounds_)); # load ivec
&mov ($rounds,&DWP(240,$key));
-
+
&set_label("ctr32_one");
if ($inline)
{ &aesni_inline_generate1("enc"); }
diff --git a/src/crypto/aes/asm/aesni-x86_64.pl b/src/crypto/aes/asm/aesni-x86_64.pl
index 55d5f30a..8ae6dbfa 100644
--- a/src/crypto/aes/asm/aesni-x86_64.pl
+++ b/src/crypto/aes/asm/aesni-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -27,7 +34,7 @@
# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
-# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
+# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
#
@@ -111,7 +118,7 @@
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
-# instructions' interleave factor. Westmere can execute at most 3
+# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 come from. "Optimal
# interleave factor" means that increase of interleave factor does
@@ -157,16 +164,23 @@
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.
+# November 2015
+#
+# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
+# chosen to be 6x.
+
######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
-# CBC en-/decrypt CTR XTS ECB
+# CBC en-/decrypt CTR XTS ECB OCB
# Westmere 3.77/1.25 1.25 1.25 1.26
-# * Bridge 5.07/0.74 0.75 0.90 0.85
-# Haswell 4.44/0.63 0.63 0.73 0.63
-# Silvermont 5.75/3.54 3.56 4.12 3.87(*)
-# Bulldozer 5.77/0.70 0.72 0.90 0.70
+# * Bridge 5.07/0.74 0.75 0.90 0.85 0.98
+# Haswell 4.44/0.63 0.63 0.73 0.63 0.70
+# Skylake 2.62/0.63 0.63 0.63 0.63
+# Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11
+# Goldmont 3.82/1.26 1.26 1.29 1.29 1.50
+# Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95
#
# (*) Atom Silvermont ECB result is suboptimal because of penalties
# incurred by operations on %xmm8-15. As ECB is not considered
@@ -299,7 +313,7 @@ ___
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
-# * Bridge and "super-optimal" for other Intel CPUs...
+# * Bridge and "super-optimal" for other Intel CPUs...
sub aesni_generate2 {
my $dir=shift;
@@ -1158,7 +1172,7 @@ ___
# with zero-round key xor.
{
my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
-my ($key0,$ctr)=("${key_}d","${ivp}d");
+my ($key0,$ctr)=("%ebp","${ivp}d");
my $frame_size = 0x80 + ($win64?160:0);
$code.=<<___;
@@ -1187,26 +1201,25 @@ $code.=<<___;
.align 16
.Lctr32_bulk:
- lea (%rsp),%rax
+ lea (%rsp),$key_ # use $key_ as frame pointer
push %rbp
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- movaps %xmm6,-0xa8(%rax) # offload everything
- movaps %xmm7,-0x98(%rax)
- movaps %xmm8,-0x88(%rax)
- movaps %xmm9,-0x78(%rax)
- movaps %xmm10,-0x68(%rax)
- movaps %xmm11,-0x58(%rax)
- movaps %xmm12,-0x48(%rax)
- movaps %xmm13,-0x38(%rax)
- movaps %xmm14,-0x28(%rax)
- movaps %xmm15,-0x18(%rax)
+ movaps %xmm6,-0xa8($key_) # offload everything
+ movaps %xmm7,-0x98($key_)
+ movaps %xmm8,-0x88($key_)
+ movaps %xmm9,-0x78($key_)
+ movaps %xmm10,-0x68($key_)
+ movaps %xmm11,-0x58($key_)
+ movaps %xmm12,-0x48($key_)
+ movaps %xmm13,-0x38($key_)
+ movaps %xmm14,-0x28($key_)
+ movaps %xmm15,-0x18($key_)
.Lctr32_body:
___
$code.=<<___;
- lea -8(%rax),%rbp
# 8 16-byte words on top of stack are counter values
# xor-ed with zero-round key
@@ -1258,7 +1271,7 @@ $code.=<<___;
lea 7($ctr),%r9
mov %r10d,0x60+12(%rsp)
bswap %r9d
- mov OPENSSL_ia32cap_P+4(%rip),%r10d
+ mov OPENSSL_ia32cap_P+4(%rip),%r10d
xor $key0,%r9d
and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
mov %r9d,0x70+12(%rsp)
@@ -1538,7 +1551,7 @@ $code.=<<___;
.Lctr32_tail:
# note that at this point $inout0..5 are populated with
- # counter values xor-ed with 0-round key
+ # counter values xor-ed with 0-round key
lea 16($key),$key
cmp \$4,$len
jb .Lctr32_loop3
@@ -1678,26 +1691,26 @@ $code.=<<___ if (!$win64);
pxor %xmm15,%xmm15
___
$code.=<<___ if ($win64);
- movaps -0xa0(%rbp),%xmm6
- movaps %xmm0,-0xa0(%rbp) # clear stack
- movaps -0x90(%rbp),%xmm7
- movaps %xmm0,-0x90(%rbp)
- movaps -0x80(%rbp),%xmm8
- movaps %xmm0,-0x80(%rbp)
- movaps -0x70(%rbp),%xmm9
- movaps %xmm0,-0x70(%rbp)
- movaps -0x60(%rbp),%xmm10
- movaps %xmm0,-0x60(%rbp)
- movaps -0x50(%rbp),%xmm11
- movaps %xmm0,-0x50(%rbp)
- movaps -0x40(%rbp),%xmm12
- movaps %xmm0,-0x40(%rbp)
- movaps -0x30(%rbp),%xmm13
- movaps %xmm0,-0x30(%rbp)
- movaps -0x20(%rbp),%xmm14
- movaps %xmm0,-0x20(%rbp)
- movaps -0x10(%rbp),%xmm15
- movaps %xmm0,-0x10(%rbp)
+ movaps -0xa8($key_),%xmm6
+ movaps %xmm0,-0xa8($key_) # clear stack
+ movaps -0x98($key_),%xmm7
+ movaps %xmm0,-0x98($key_)
+ movaps -0x88($key_),%xmm8
+ movaps %xmm0,-0x88($key_)
+ movaps -0x78($key_),%xmm9
+ movaps %xmm0,-0x78($key_)
+ movaps -0x68($key_),%xmm10
+ movaps %xmm0,-0x68($key_)
+ movaps -0x58($key_),%xmm11
+ movaps %xmm0,-0x58($key_)
+ movaps -0x48($key_),%xmm12
+ movaps %xmm0,-0x48($key_)
+ movaps -0x38($key_),%xmm13
+ movaps %xmm0,-0x38($key_)
+ movaps -0x28($key_),%xmm14
+ movaps %xmm0,-0x28($key_)
+ movaps -0x18($key_),%xmm15
+ movaps %xmm0,-0x18($key_)
movaps %xmm0,0x00(%rsp)
movaps %xmm0,0x10(%rsp)
movaps %xmm0,0x20(%rsp)
@@ -1708,8 +1721,8 @@ $code.=<<___ if ($win64);
movaps %xmm0,0x70(%rsp)
___
$code.=<<___;
- lea (%rbp),%rsp
- pop %rbp
+ mov -8($key_),%rbp
+ lea ($key_),%rsp
.Lctr32_epilogue:
ret
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
@@ -1726,32 +1739,32 @@ my @tweak=map("%xmm$_",(10..15));
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
my $frame_size = 0x70 + ($win64?160:0);
+my $key_ = "%rbp"; # override so that we can use %r11 as FP
$code.=<<___;
.globl aesni_xts_encrypt
.type aesni_xts_encrypt,\@function,6
.align 16
aesni_xts_encrypt:
- lea (%rsp),%rax
+ lea (%rsp),%r11 # frame pointer
push %rbp
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- movaps %xmm6,-0xa8(%rax) # offload everything
- movaps %xmm7,-0x98(%rax)
- movaps %xmm8,-0x88(%rax)
- movaps %xmm9,-0x78(%rax)
- movaps %xmm10,-0x68(%rax)
- movaps %xmm11,-0x58(%rax)
- movaps %xmm12,-0x48(%rax)
- movaps %xmm13,-0x38(%rax)
- movaps %xmm14,-0x28(%rax)
- movaps %xmm15,-0x18(%rax)
+ movaps %xmm6,-0xa8(%r11) # offload everything
+ movaps %xmm7,-0x98(%r11)
+ movaps %xmm8,-0x88(%r11)
+ movaps %xmm9,-0x78(%r11)
+ movaps %xmm10,-0x68(%r11)
+ movaps %xmm11,-0x58(%r11)
+ movaps %xmm12,-0x48(%r11)
+ movaps %xmm13,-0x38(%r11)
+ movaps %xmm14,-0x28(%r11)
+ movaps %xmm15,-0x18(%r11)
.Lxts_enc_body:
___
$code.=<<___;
- lea -8(%rax),%rbp
movups ($ivp),$inout0 # load clear-text tweak
mov 240(%r8),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
@@ -2169,26 +2182,26 @@ $code.=<<___ if (!$win64);
pxor %xmm15,%xmm15
___
$code.=<<___ if ($win64);
- movaps -0xa0(%rbp),%xmm6
- movaps %xmm0,-0xa0(%rbp) # clear stack
- movaps -0x90(%rbp),%xmm7
- movaps %xmm0,-0x90(%rbp)
- movaps -0x80(%rbp),%xmm8
- movaps %xmm0,-0x80(%rbp)
- movaps -0x70(%rbp),%xmm9
- movaps %xmm0,-0x70(%rbp)
- movaps -0x60(%rbp),%xmm10
- movaps %xmm0,-0x60(%rbp)
- movaps -0x50(%rbp),%xmm11
- movaps %xmm0,-0x50(%rbp)
- movaps -0x40(%rbp),%xmm12
- movaps %xmm0,-0x40(%rbp)
- movaps -0x30(%rbp),%xmm13
- movaps %xmm0,-0x30(%rbp)
- movaps -0x20(%rbp),%xmm14
- movaps %xmm0,-0x20(%rbp)
- movaps -0x10(%rbp),%xmm15
- movaps %xmm0,-0x10(%rbp)
+ movaps -0xa8(%r11),%xmm6
+ movaps %xmm0,-0xa8(%r11) # clear stack
+ movaps -0x98(%r11),%xmm7
+ movaps %xmm0,-0x98(%r11)
+ movaps -0x88(%r11),%xmm8
+ movaps %xmm0,-0x88(%r11)
+ movaps -0x78(%r11),%xmm9
+ movaps %xmm0,-0x78(%r11)
+ movaps -0x68(%r11),%xmm10
+ movaps %xmm0,-0x68(%r11)
+ movaps -0x58(%r11),%xmm11
+ movaps %xmm0,-0x58(%r11)
+ movaps -0x48(%r11),%xmm12
+ movaps %xmm0,-0x48(%r11)
+ movaps -0x38(%r11),%xmm13
+ movaps %xmm0,-0x38(%r11)
+ movaps -0x28(%r11),%xmm14
+ movaps %xmm0,-0x28(%r11)
+ movaps -0x18(%r11),%xmm15
+ movaps %xmm0,-0x18(%r11)
movaps %xmm0,0x00(%rsp)
movaps %xmm0,0x10(%rsp)
movaps %xmm0,0x20(%rsp)
@@ -2198,8 +2211,8 @@ $code.=<<___ if ($win64);
movaps %xmm0,0x60(%rsp)
___
$code.=<<___;
- lea (%rbp),%rsp
- pop %rbp
+ mov -8(%r11),%rbp
+ lea (%r11),%rsp
.Lxts_enc_epilogue:
ret
.size aesni_xts_encrypt,.-aesni_xts_encrypt
@@ -2210,26 +2223,25 @@ $code.=<<___;
.type aesni_xts_decrypt,\@function,6
.align 16
aesni_xts_decrypt:
- lea (%rsp),%rax
+ lea (%rsp),%r11 # frame pointer
push %rbp
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- movaps %xmm6,-0xa8(%rax) # offload everything
- movaps %xmm7,-0x98(%rax)
- movaps %xmm8,-0x88(%rax)
- movaps %xmm9,-0x78(%rax)
- movaps %xmm10,-0x68(%rax)
- movaps %xmm11,-0x58(%rax)
- movaps %xmm12,-0x48(%rax)
- movaps %xmm13,-0x38(%rax)
- movaps %xmm14,-0x28(%rax)
- movaps %xmm15,-0x18(%rax)
+ movaps %xmm6,-0xa8(%r11) # offload everything
+ movaps %xmm7,-0x98(%r11)
+ movaps %xmm8,-0x88(%r11)
+ movaps %xmm9,-0x78(%r11)
+ movaps %xmm10,-0x68(%r11)
+ movaps %xmm11,-0x58(%r11)
+ movaps %xmm12,-0x48(%r11)
+ movaps %xmm13,-0x38(%r11)
+ movaps %xmm14,-0x28(%r11)
+ movaps %xmm15,-0x18(%r11)
.Lxts_dec_body:
___
$code.=<<___;
- lea -8(%rax),%rbp
movups ($ivp),$inout0 # load clear-text tweak
mov 240($key2),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
@@ -2673,26 +2685,26 @@ $code.=<<___ if (!$win64);
pxor %xmm15,%xmm15
___
$code.=<<___ if ($win64);
- movaps -0xa0(%rbp),%xmm6
- movaps %xmm0,-0xa0(%rbp) # clear stack
- movaps -0x90(%rbp),%xmm7
- movaps %xmm0,-0x90(%rbp)
- movaps -0x80(%rbp),%xmm8
- movaps %xmm0,-0x80(%rbp)
- movaps -0x70(%rbp),%xmm9
- movaps %xmm0,-0x70(%rbp)
- movaps -0x60(%rbp),%xmm10
- movaps %xmm0,-0x60(%rbp)
- movaps -0x50(%rbp),%xmm11
- movaps %xmm0,-0x50(%rbp)
- movaps -0x40(%rbp),%xmm12
- movaps %xmm0,-0x40(%rbp)
- movaps -0x30(%rbp),%xmm13
- movaps %xmm0,-0x30(%rbp)
- movaps -0x20(%rbp),%xmm14
- movaps %xmm0,-0x20(%rbp)
- movaps -0x10(%rbp),%xmm15
- movaps %xmm0,-0x10(%rbp)
+ movaps -0xa8(%r11),%xmm6
+ movaps %xmm0,-0xa8(%r11) # clear stack
+ movaps -0x98(%r11),%xmm7
+ movaps %xmm0,-0x98(%r11)
+ movaps -0x88(%r11),%xmm8
+ movaps %xmm0,-0x88(%r11)
+ movaps -0x78(%r11),%xmm9
+ movaps %xmm0,-0x78(%r11)
+ movaps -0x68(%r11),%xmm10
+ movaps %xmm0,-0x68(%r11)
+ movaps -0x58(%r11),%xmm11
+ movaps %xmm0,-0x58(%r11)
+ movaps -0x48(%r11),%xmm12
+ movaps %xmm0,-0x48(%r11)
+ movaps -0x38(%r11),%xmm13
+ movaps %xmm0,-0x38(%r11)
+ movaps -0x28(%r11),%xmm14
+ movaps %xmm0,-0x28(%r11)
+ movaps -0x18(%r11),%xmm15
+ movaps %xmm0,-0x18(%r11)
movaps %xmm0,0x00(%rsp)
movaps %xmm0,0x10(%rsp)
movaps %xmm0,0x20(%rsp)
@@ -2702,12 +2714,933 @@ $code.=<<___ if ($win64);
movaps %xmm0,0x60(%rsp)
___
$code.=<<___;
- lea (%rbp),%rsp
- pop %rbp
+ mov -8(%r11),%rbp
+ lea (%r11),%rsp
.Lxts_dec_epilogue:
ret
.size aesni_xts_decrypt,.-aesni_xts_decrypt
___
+}
+
+######################################################################
+# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
+# const AES_KEY *key, unsigned int start_block_num,
+# unsigned char offset_i[16], const unsigned char L_[][16],
+# unsigned char checksum[16]);
+#
+{
+my @offset=map("%xmm$_",(10..15));
+my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
+my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
+my ($L_p,$checksum_p) = ("%rbx","%rbp");
+my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
+my $seventh_arg = $win64 ? 56 : 8;
+my $blocks = $len;
+
+$code.=<<___;
+.globl aesni_ocb_encrypt
+.type aesni_ocb_encrypt,\@function,6
+.align 32
+aesni_ocb_encrypt:
+ lea (%rsp),%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp),%rsp
+ movaps %xmm6,0x00(%rsp) # offload everything
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,0x70(%rsp)
+ movaps %xmm14,0x80(%rsp)
+ movaps %xmm15,0x90(%rsp)
+.Locb_enc_body:
+___
+$code.=<<___;
+ mov $seventh_arg(%rax),$L_p # 7th argument
+ mov $seventh_arg+8(%rax),$checksum_p# 8th argument
+
+ mov 240($key),$rnds_
+ mov $key,$key_
+ shl \$4,$rnds_
+ $movkey ($key),$rndkey0l # round[0]
+ $movkey 16($key,$rnds_),$rndkey1 # round[last]
+
+ movdqu ($offset_p),@offset[5] # load last offset_i
+ pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
+ pxor $rndkey1,@offset[5] # offset_i ^ round[last]
+
+ mov \$16+32,$rounds
+ lea 32($key_,$rnds_),$key
+ $movkey 16($key_),$rndkey1 # round[1]
+ sub %r10,%rax # twisted $rounds
+ mov %rax,%r10 # backup twisted $rounds
+
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ movdqu ($checksum_p),$checksum # load checksum
+
+ test \$1,$block_num # is first block number odd?
+ jnz .Locb_enc_odd
+
+ bsf $block_num,$i1
+ add \$1,$block_num
+ shl \$4,$i1
+ movdqu ($L_p,$i1),$inout5 # borrow
+ movdqu ($inp),$inout0
+ lea 16($inp),$inp
+
+ call __ocb_encrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,($out)
+ lea 16($out),$out
+ sub \$1,$blocks
+ jz .Locb_enc_done
+
+.Locb_enc_odd:
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ lea 6($block_num),$block_num
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ shl \$4,$i5
+
+ sub \$6,$blocks
+ jc .Locb_enc_short
+ jmp .Locb_enc_grandloop
+
+.align 32
+.Locb_enc_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqu `16*1`($inp),$inout1
+ movdqu `16*2`($inp),$inout2
+ movdqu `16*3`($inp),$inout3
+ movdqu `16*4`($inp),$inout4
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+
+ call __ocb_encrypt6
+
+ movups $inout0,`16*0`($out) # store output
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+ movups $inout4,`16*4`($out)
+ movups $inout5,`16*5`($out)
+ lea `16*6`($out),$out
+ sub \$6,$blocks
+ jnc .Locb_enc_grandloop
+
+.Locb_enc_short:
+ add \$6,$blocks
+ jz .Locb_enc_done
+
+ movdqu `16*0`($inp),$inout0
+ cmp \$2,$blocks
+ jb .Locb_enc_one
+ movdqu `16*1`($inp),$inout1
+ je .Locb_enc_two
+
+ movdqu `16*2`($inp),$inout2
+ cmp \$4,$blocks
+ jb .Locb_enc_three
+ movdqu `16*3`($inp),$inout3
+ je .Locb_enc_four
+
+ movdqu `16*4`($inp),$inout4
+ pxor $inout5,$inout5
+
+ call __ocb_encrypt6
+
+ movdqa @offset[4],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+ movups $inout4,`16*4`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_one:
+ movdqa @offset[0],$inout5 # borrow
+
+ call __ocb_encrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,`16*0`($out)
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_two:
+ pxor $inout2,$inout2
+ pxor $inout3,$inout3
+
+ call __ocb_encrypt4
+
+ movdqa @offset[1],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_three:
+ pxor $inout3,$inout3
+
+ call __ocb_encrypt4
+
+ movdqa @offset[2],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_four:
+ call __ocb_encrypt4
+
+ movdqa @offset[3],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+
+.Locb_enc_done:
+ pxor $rndkey0,@offset[5] # "remove" round[last]
+ movdqu $checksum,($checksum_p) # store checksum
+ movdqu @offset[5],($offset_p) # store last offset_i
+
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+ lea 0x28(%rsp),%rax
+___
+$code.=<<___ if ($win64);
+ movaps 0x00(%rsp),%xmm6
+ movaps %xmm0,0x00(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ movaps 0x40(%rsp),%xmm10
+ movaps %xmm0,0x40(%rsp)
+ movaps 0x50(%rsp),%xmm11
+ movaps %xmm0,0x50(%rsp)
+ movaps 0x60(%rsp),%xmm12
+ movaps %xmm0,0x60(%rsp)
+ movaps 0x70(%rsp),%xmm13
+ movaps %xmm0,0x70(%rsp)
+ movaps 0x80(%rsp),%xmm14
+ movaps %xmm0,0x80(%rsp)
+ movaps 0x90(%rsp),%xmm15
+ movaps %xmm0,0x90(%rsp)
+ lea 0xa0+0x28(%rsp),%rax
+.Locb_enc_pop:
+___
+$code.=<<___;
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Locb_enc_epilogue:
+ ret
+.size aesni_ocb_encrypt,.-aesni_ocb_encrypt
+
+.type __ocb_encrypt6,\@abi-omnipotent
+.align 32
+__ocb_encrypt6:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ movdqa @offset[0],@offset[4]
+ pxor @offset[5],@offset[0]
+ movdqu ($L_p,$i5),@offset[5]
+ pxor @offset[0],@offset[1]
+ pxor $inout0,$checksum # accumulate checksum
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor $inout1,$checksum
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor $inout2,$checksum
+ pxor @offset[2],$inout2
+ pxor @offset[3],@offset[4]
+ pxor $inout3,$checksum
+ pxor @offset[3],$inout3
+ pxor @offset[4],@offset[5]
+ pxor $inout4,$checksum
+ pxor @offset[4],$inout4
+ pxor $inout5,$checksum
+ pxor @offset[5],$inout5
+ $movkey 32($key_),$rndkey0
+
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ add \$6,$block_num
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ aesenc $rndkey1,$inout4
+ pxor $rndkey0l,@offset[3]
+ pxor $rndkey0l,@offset[4]
+ aesenc $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,@offset[5]
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ jmp .Locb_enc_loop6
+
+.align 32
+.Locb_enc_loop6:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop6
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+ shl \$4,$i5
+
+ aesenclast @offset[0],$inout0
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ mov %r10,%rax # restore twisted rounds
+ aesenclast @offset[1],$inout1
+ aesenclast @offset[2],$inout2
+ aesenclast @offset[3],$inout3
+ aesenclast @offset[4],$inout4
+ aesenclast @offset[5],$inout5
+ ret
+.size __ocb_encrypt6,.-__ocb_encrypt6
+
+.type __ocb_encrypt4,\@abi-omnipotent
+.align 32
+__ocb_encrypt4:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ pxor @offset[5],@offset[0]
+ pxor @offset[0],@offset[1]
+ pxor $inout0,$checksum # accumulate checksum
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor $inout1,$checksum
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor $inout2,$checksum
+ pxor @offset[2],$inout2
+ pxor $inout3,$checksum
+ pxor @offset[3],$inout3
+ $movkey 32($key_),$rndkey0
+
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ pxor $rndkey0l,@offset[3]
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey 48($key_),$rndkey1
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_enc_loop4
+
+.align 32
+.Locb_enc_loop4:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop4
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey 16($key_),$rndkey1
+ mov %r10,%rax # restore twisted rounds
+
+ aesenclast @offset[0],$inout0
+ aesenclast @offset[1],$inout1
+ aesenclast @offset[2],$inout2
+ aesenclast @offset[3],$inout3
+ ret
+.size __ocb_encrypt4,.-__ocb_encrypt4
+
+.type __ocb_encrypt1,\@abi-omnipotent
+.align 32
+__ocb_encrypt1:
+ pxor @offset[5],$inout5 # offset_i
+ pxor $rndkey0l,$inout5 # offset_i ^ round[0]
+ pxor $inout0,$checksum # accumulate checksum
+ pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
+ $movkey 32($key_),$rndkey0
+
+ aesenc $rndkey1,$inout0
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,$inout5 # offset_i ^ round[last]
+
+ aesenc $rndkey0,$inout0
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_enc_loop1
+
+.align 32
+.Locb_enc_loop1:
+ aesenc $rndkey1,$inout0
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop1
+
+ aesenc $rndkey1,$inout0
+ $movkey 16($key_),$rndkey1 # redundant in tail
+ mov %r10,%rax # restore twisted rounds
+
+ aesenclast $inout5,$inout0
+ ret
+.size __ocb_encrypt1,.-__ocb_encrypt1
+
+.globl aesni_ocb_decrypt
+.type aesni_ocb_decrypt,\@function,6
+.align 32
+aesni_ocb_decrypt:
+ lea (%rsp),%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp),%rsp
+ movaps %xmm6,0x00(%rsp) # offload everything
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,0x70(%rsp)
+ movaps %xmm14,0x80(%rsp)
+ movaps %xmm15,0x90(%rsp)
+.Locb_dec_body:
+___
+$code.=<<___;
+ mov $seventh_arg(%rax),$L_p # 7th argument
+	mov	$seventh_arg+8(%rax),$checksum_p # 8th argument
+
+ mov 240($key),$rnds_
+ mov $key,$key_
+ shl \$4,$rnds_
+ $movkey ($key),$rndkey0l # round[0]
+ $movkey 16($key,$rnds_),$rndkey1 # round[last]
+
+ movdqu ($offset_p),@offset[5] # load last offset_i
+ pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
+ pxor $rndkey1,@offset[5] # offset_i ^ round[last]
+
+ mov \$16+32,$rounds
+ lea 32($key_,$rnds_),$key
+ $movkey 16($key_),$rndkey1 # round[1]
+ sub %r10,%rax # twisted $rounds
+ mov %rax,%r10 # backup twisted $rounds
+
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ movdqu ($checksum_p),$checksum # load checksum
+
+ test \$1,$block_num # is first block number odd?
+ jnz .Locb_dec_odd
+
+ bsf $block_num,$i1
+ add \$1,$block_num
+ shl \$4,$i1
+ movdqu ($L_p,$i1),$inout5 # borrow
+ movdqu ($inp),$inout0
+ lea 16($inp),$inp
+
+ call __ocb_decrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,($out)
+ xorps $inout0,$checksum # accumulate checksum
+ lea 16($out),$out
+ sub \$1,$blocks
+ jz .Locb_dec_done
+
+.Locb_dec_odd:
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ lea 6($block_num),$block_num
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ shl \$4,$i5
+
+ sub \$6,$blocks
+ jc .Locb_dec_short
+ jmp .Locb_dec_grandloop
+
+.align 32
+.Locb_dec_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqu `16*1`($inp),$inout1
+ movdqu `16*2`($inp),$inout2
+ movdqu `16*3`($inp),$inout3
+ movdqu `16*4`($inp),$inout4
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+
+ call __ocb_decrypt6
+
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+ movups $inout4,`16*4`($out)
+ pxor $inout4,$checksum
+ movups $inout5,`16*5`($out)
+ pxor $inout5,$checksum
+ lea `16*6`($out),$out
+ sub \$6,$blocks
+ jnc .Locb_dec_grandloop
+
+.Locb_dec_short:
+ add \$6,$blocks
+ jz .Locb_dec_done
+
+ movdqu `16*0`($inp),$inout0
+ cmp \$2,$blocks
+ jb .Locb_dec_one
+ movdqu `16*1`($inp),$inout1
+ je .Locb_dec_two
+
+ movdqu `16*2`($inp),$inout2
+ cmp \$4,$blocks
+ jb .Locb_dec_three
+ movdqu `16*3`($inp),$inout3
+ je .Locb_dec_four
+
+ movdqu `16*4`($inp),$inout4
+ pxor $inout5,$inout5
+
+ call __ocb_decrypt6
+
+ movdqa @offset[4],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+ movups $inout4,`16*4`($out)
+ pxor $inout4,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_one:
+ movdqa @offset[0],$inout5 # borrow
+
+ call __ocb_decrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_two:
+ pxor $inout2,$inout2
+ pxor $inout3,$inout3
+
+ call __ocb_decrypt4
+
+ movdqa @offset[1],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ xorps $inout1,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_three:
+ pxor $inout3,$inout3
+
+ call __ocb_decrypt4
+
+ movdqa @offset[2],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ xorps $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ xorps $inout2,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_four:
+ call __ocb_decrypt4
+
+ movdqa @offset[3],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+
+.Locb_dec_done:
+ pxor $rndkey0,@offset[5] # "remove" round[last]
+ movdqu $checksum,($checksum_p) # store checksum
+ movdqu @offset[5],($offset_p) # store last offset_i
+
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+ lea 0x28(%rsp),%rax
+___
+$code.=<<___ if ($win64);
+ movaps 0x00(%rsp),%xmm6
+ movaps %xmm0,0x00(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ movaps 0x40(%rsp),%xmm10
+ movaps %xmm0,0x40(%rsp)
+ movaps 0x50(%rsp),%xmm11
+ movaps %xmm0,0x50(%rsp)
+ movaps 0x60(%rsp),%xmm12
+ movaps %xmm0,0x60(%rsp)
+ movaps 0x70(%rsp),%xmm13
+ movaps %xmm0,0x70(%rsp)
+ movaps 0x80(%rsp),%xmm14
+ movaps %xmm0,0x80(%rsp)
+ movaps 0x90(%rsp),%xmm15
+ movaps %xmm0,0x90(%rsp)
+ lea 0xa0+0x28(%rsp),%rax
+.Locb_dec_pop:
+___
+$code.=<<___;
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbp
+ mov -8(%rax),%rbx
+ lea (%rax),%rsp
+.Locb_dec_epilogue:
+ ret
+.size aesni_ocb_decrypt,.-aesni_ocb_decrypt
+
+.type __ocb_decrypt6,\@abi-omnipotent
+.align 32
+__ocb_decrypt6:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ movdqa @offset[0],@offset[4]
+ pxor @offset[5],@offset[0]
+ movdqu ($L_p,$i5),@offset[5]
+ pxor @offset[0],@offset[1]
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor @offset[2],$inout2
+ pxor @offset[3],@offset[4]
+ pxor @offset[3],$inout3
+ pxor @offset[4],@offset[5]
+ pxor @offset[4],$inout4
+ pxor @offset[5],$inout5
+ $movkey 32($key_),$rndkey0
+
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ add \$6,$block_num
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ aesdec $rndkey1,$inout4
+ pxor $rndkey0l,@offset[3]
+ pxor $rndkey0l,@offset[4]
+ aesdec $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,@offset[5]
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ jmp .Locb_dec_loop6
+
+.align 32
+.Locb_dec_loop6:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop6
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+ shl \$4,$i5
+
+ aesdeclast @offset[0],$inout0
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ mov %r10,%rax # restore twisted rounds
+ aesdeclast @offset[1],$inout1
+ aesdeclast @offset[2],$inout2
+ aesdeclast @offset[3],$inout3
+ aesdeclast @offset[4],$inout4
+ aesdeclast @offset[5],$inout5
+ ret
+.size __ocb_decrypt6,.-__ocb_decrypt6
+
+.type __ocb_decrypt4,\@abi-omnipotent
+.align 32
+__ocb_decrypt4:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ pxor @offset[5],@offset[0]
+ pxor @offset[0],@offset[1]
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor @offset[2],$inout2
+ pxor @offset[3],$inout3
+ $movkey 32($key_),$rndkey0
+
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ pxor $rndkey0l,@offset[3]
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey 48($key_),$rndkey1
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_dec_loop4
+
+.align 32
+.Locb_dec_loop4:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop4
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey 16($key_),$rndkey1
+ mov %r10,%rax # restore twisted rounds
+
+ aesdeclast @offset[0],$inout0
+ aesdeclast @offset[1],$inout1
+ aesdeclast @offset[2],$inout2
+ aesdeclast @offset[3],$inout3
+ ret
+.size __ocb_decrypt4,.-__ocb_decrypt4
+
+.type __ocb_decrypt1,\@abi-omnipotent
+.align 32
+__ocb_decrypt1:
+ pxor @offset[5],$inout5 # offset_i
+ pxor $rndkey0l,$inout5 # offset_i ^ round[0]
+ pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
+ $movkey 32($key_),$rndkey0
+
+ aesdec $rndkey1,$inout0
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,$inout5 # offset_i ^ round[last]
+
+ aesdec $rndkey0,$inout0
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_dec_loop1
+
+.align 32
+.Locb_dec_loop1:
+ aesdec $rndkey1,$inout0
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop1
+
+ aesdec $rndkey1,$inout0
+ $movkey 16($key_),$rndkey1 # redundant in tail
+ mov %r10,%rax # restore twisted rounds
+
+ aesdeclast $inout5,$inout0
+ ret
+.size __ocb_decrypt1,.-__ocb_decrypt1
+___
} }}
########################################################################
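The OCB paths above never recompute an offset from scratch: each block's offset is the previous one XORed with a table entry L[ntz(i)], selected with `bsf` (number of trailing zeros) and a shift by 4 to turn the index into a 16-byte table displacement, while the running checksum is simply the XOR of all plaintext blocks. A minimal C sketch of that bookkeeping, with the block type and table layout assumed for illustration:

    #include <stdint.h>

    typedef struct { uint8_t b[16]; } block128;   /* illustrative block type */

    static void xor_block(block128 *r, const block128 *a) {
        for (int i = 0; i < 16; i++) r->b[i] ^= a->b[i];
    }

    /* Advance the OCB offset for 1-based block number i using the
     * precomputed table L[j], and fold the plaintext block into the
     * running checksum, mirroring the bsf/shl-by-4 indexing above. */
    static void ocb_step(block128 *offset, block128 *checksum,
                         const block128 *L, uint64_t i, const block128 *pt) {
        unsigned ntz = (unsigned)__builtin_ctzll(i);   /* bsf: i is nonzero */
        xor_block(offset, &L[ntz]);   /* offset_i = offset_{i-1} ^ L[ntz(i)] */
        xor_block(checksum, pt);      /* checksum ^= plaintext block */
    }

The assembly additionally folds round[0]^round[last] into every offset up front, so the trailing `aesenclast`/`aesdeclast` applies the offset and the final round key in a single operation.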
@@ -2717,7 +3650,6 @@ ___
{
my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
-my $inp_=$key_;
$code.=<<___;
.globl ${PREFIX}_cbc_encrypt
@@ -2799,7 +3731,7 @@ $code.=<<___;
jmp .Lcbc_ret
.align 16
.Lcbc_decrypt_bulk:
- lea (%rsp),%rax
+ lea (%rsp),%r11 # frame pointer
push %rbp
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
@@ -2817,8 +3749,11 @@ $code.=<<___ if ($win64);
movaps %xmm15,0xa0(%rsp)
.Lcbc_decrypt_body:
___
+
+my $inp_=$key_="%rbp"; # reassign $key_
+
$code.=<<___;
- lea -8(%rax),%rbp
+ mov $key,$key_ # [re-]backup $key [after reassignment]
movups ($ivp),$iv
mov $rnds_,$rounds
cmp \$0x50,$len
@@ -2858,7 +3793,7 @@ $code.=<<___;
pxor $rndkey0,$inout1
$movkey 0x10-0x70($key),$rndkey1
pxor $rndkey0,$inout2
- xor $inp_,$inp_
+ mov \$-1,$inp_
cmp \$0x70,$len # is there at least 0x60 bytes ahead?
pxor $rndkey0,$inout3
pxor $rndkey0,$inout4
@@ -2874,8 +3809,8 @@ $code.=<<___;
aesdec $rndkey1,$inout4
aesdec $rndkey1,$inout5
aesdec $rndkey1,$inout6
- setnc ${inp_}b
- shl \$7,$inp_
+ adc \$0,$inp_
+ and \$128,$inp_
aesdec $rndkey1,$inout7
add $inp,$inp_
$movkey 0x30-0x70($key),$rndkey1
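The `setnc`/`shl` pair replaced above and its `adc`/`and` successor compute the same branch-free value: advance the input pointer by 0x80 only when at least 0x70 bytes remain, without a conditional jump or a partial-register write. A C rendering of both forms (names assumed):

    #include <stdint.h>

    /* Pick inp or inp+0x80 without a branch; "len >= 0x70" means a full
     * extra block batch can be prefetched safely. Names are assumed. */
    static const uint8_t *lookahead(const uint8_t *inp, uint64_t len) {
        uint64_t adv_old = (uint64_t)(len >= 0x70) << 7;         /* setnc; shl $7 */
        uint64_t adv_new = (0 - (uint64_t)(len >= 0x70)) & 128;  /* mov $-1; adc $0; and $128 */
        return inp + (adv_old & adv_new);   /* both forms yield 0 or 128 */
    }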
@@ -3239,8 +4174,8 @@ $code.=<<___ if ($win64);
movaps %xmm0,0xa0(%rsp)
___
$code.=<<___;
- lea (%rbp),%rsp
- pop %rbp
+ mov -8(%r11),%rbp
+ lea (%r11),%rsp
.Lcbc_ret:
ret
.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
@@ -3307,7 +4242,7 @@ ___
# Vinodh Gopal <vinodh.gopal@intel.com>
# Kahraman Akdemir
#
-# Agressively optimized in respect to aeskeygenassist's critical path
+# Aggressively optimized in respect to aeskeygenassist's critical path
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
#
# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
@@ -3811,14 +4746,76 @@ ctr_xts_se_handler:
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
- mov 160($context),%rax # pull context->Rbp
- lea -0xa0(%rax),%rsi # %xmm save area
+ mov 208($context),%rax # pull context->R11
+
+ lea -0xa8(%rax),%rsi # %xmm save area
lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
- jmp .Lcommon_rbp_tail
+ mov -8(%rax),%rbp # restore saved %rbp
+ mov %rbp,160($context) # restore context->Rbp
+ jmp .Lcommon_seh_tail
.size ctr_xts_se_handler,.-ctr_xts_se_handler
+
+.type ocb_se_handler,\@abi-omnipotent
+.align 16
+ocb_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+	lea	(%rsi,%r10),%r10		# prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 8(%r11),%r10d # HandlerData[2]
+ lea (%rsi,%r10),%r10
+ cmp %r10,%rbx # context->Rip>=pop label
+ jae .Locb_no_xmm
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea (%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # & context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xa0+0x28(%rax),%rax
+
+.Locb_no_xmm:
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+
+ jmp .Lcommon_seh_tail
+.size ocb_se_handler,.-ocb_se_handler
___
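ocb_se_handler picks one of three unwind strategies from the faulting RIP: before the body label nothing is saved yet, past the pop label only the GPR pushes remain, and in between the full frame (xmm save area plus pushes) must be walked. A rough C sketch of that decision, with stand-ins for the Win64 CONTEXT fields:

    #include <stdint.h>

    /* Stand-in for the Win64 CONTEXT fields the handler reads. */
    typedef struct { uint64_t Rax, Rsp, Rip; } ctx_t;

    /* Return the frame pointer (%rax in the handler) for the three RIP
     * regions bounded by the HandlerData labels. */
    static uint64_t ocb_unwind_frame(const ctx_t *ctx, uint64_t body,
                                     uint64_t epilogue, uint64_t pop) {
        if (ctx->Rip < body || ctx->Rip >= epilogue)
            return ctx->Rax;           /* nothing pushed yet / already unwound */
        if (ctx->Rip >= pop)
            return ctx->Rax;           /* xmm area freed; GPRs still pending */
        /* full frame: skip 10 xmm slots (0xa0) plus 0x28 of pushes */
        return ctx->Rsp + 0xa0 + 0x28;
    }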
$code.=<<___;
.type cbc_se_handler,\@abi-omnipotent
@@ -3842,9 +4839,13 @@ cbc_se_handler:
cmp %r10,%rbx # context->Rip<"prologue" label
jb .Lcommon_seh_tail
+ mov 120($context),%rax # pull context->Rax
+
lea .Lcbc_decrypt_body(%rip),%r10
cmp %r10,%rbx # context->Rip<cbc_decrypt_body
- jb .Lrestore_cbc_rax
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
lea .Lcbc_ret(%rip),%r10
cmp %r10,%rbx # context->Rip>="epilogue" label
@@ -3855,15 +4856,10 @@ cbc_se_handler:
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
-.Lcommon_rbp_tail:
- mov 160($context),%rax # pull context->Rbp
- mov (%rax),%rbp # restore saved %rbp
- lea 8(%rax),%rax # adjust stack pointer
- mov %rbp,160($context) # restore context->Rbp
- jmp .Lcommon_seh_tail
+ mov 208($context),%rax # pull context->R11
-.Lrestore_cbc_rax:
- mov 120($context),%rax
+ mov -8(%rax),%rbp # restore saved %rbp
+ mov %rbp,160($context) # restore context->Rbp
.Lcommon_seh_tail:
mov 8(%rax),%rdi
@@ -3932,6 +4928,14 @@ $code.=<<___ if ($PREFIX eq "aesni");
.rva .LSEH_begin_aesni_xts_decrypt
.rva .LSEH_end_aesni_xts_decrypt
.rva .LSEH_info_xts_dec
+
+ .rva .LSEH_begin_aesni_ocb_encrypt
+ .rva .LSEH_end_aesni_ocb_encrypt
+ .rva .LSEH_info_ocb_enc
+
+ .rva .LSEH_begin_aesni_ocb_decrypt
+ .rva .LSEH_end_aesni_ocb_decrypt
+ .rva .LSEH_info_ocb_dec
___
$code.=<<___;
.rva .LSEH_begin_${PREFIX}_cbc_encrypt
@@ -3973,6 +4977,18 @@ $code.=<<___ if ($PREFIX eq "aesni");
.byte 9,0,0,0
.rva ctr_xts_se_handler
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+.LSEH_info_ocb_enc:
+ .byte 9,0,0,0
+ .rva ocb_se_handler
+ .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[]
+ .rva .Locb_enc_pop
+ .long 0
+.LSEH_info_ocb_dec:
+ .byte 9,0,0,0
+ .rva ocb_se_handler
+ .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[]
+ .rva .Locb_dec_pop
+ .long 0
___
$code.=<<___;
.LSEH_info_cbc:
diff --git a/src/crypto/aes/asm/aesv8-armx.pl b/src/crypto/aes/asm/aesv8-armx.pl
index f6d0dabd..23ed77c1 100644
--- a/src/crypto/aes/asm/aesv8-armx.pl
+++ b/src/crypto/aes/asm/aesv8-armx.pl
@@ -957,21 +957,21 @@ if ($flavour =~ /64/) { ######## 64-bit code
$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
- "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
+ "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
}
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
- sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+ sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
sub unvmov32 {
my $arg=shift;
$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
- sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
+ sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
}
foreach(split("\n",$code)) {
diff --git a/src/crypto/aes/asm/bsaes-armv7.pl b/src/crypto/aes/asm/bsaes-armv7.pl
index 37613e2c..d645de4c 100644
--- a/src/crypto/aes/asm/bsaes-armv7.pl
+++ b/src/crypto/aes/asm/bsaes-armv7.pl
@@ -84,7 +84,7 @@ my @s=@_[12..15];
sub InBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
veor @b[2], @b[2], @b[1]
diff --git a/src/crypto/aes/asm/bsaes-x86_64.pl b/src/crypto/aes/asm/bsaes-x86_64.pl
index 8258f2f4..9a8055ef 100644
--- a/src/crypto/aes/asm/bsaes-x86_64.pl
+++ b/src/crypto/aes/asm/bsaes-x86_64.pl
@@ -41,6 +41,7 @@
# Nehalem(**) 7.63 6.88 +11%
# Atom 17.1 16.4 +4%
# Silvermont - 12.9
+# Goldmont - 8.85
#
# (*) Comparison is not completely fair, because "this" is ECB,
# i.e. no extra processing such as counter values calculation
@@ -80,6 +81,7 @@
# Nehalem 7.80
# Atom 17.9
# Silvermont 14.0
+# Goldmont 10.2
#
# November 2011.
#
@@ -122,7 +124,7 @@ my @s=@_[12..15];
sub InBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
pxor @b[6], @b[5]
@@ -372,7 +374,7 @@ $code.=<<___;
pxor @s[0], @t[3]
pxor @s[1], @t[2]
pxor @s[2], @t[1]
- pxor @s[3], @t[0]
+ pxor @s[3], @t[0]
#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
@@ -1325,7 +1327,7 @@ $code.=<<___;
cmp %rax, %rbp
jb .Lecb_enc_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -1338,17 +1340,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lecb_enc_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+ mov -40(%rax), %r14
+ mov -32(%rax), %r13
+ mov -24(%rax), %r12
+ mov -16(%rax), %rbx
+ mov -8(%rax), %rbp
+ lea (%rax), %rsp # restore %rsp
.Lecb_enc_epilogue:
ret
.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
@@ -1527,7 +1529,7 @@ $code.=<<___;
cmp %rax, %rbp
jb .Lecb_dec_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -1540,17 +1542,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lecb_dec_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+ mov -40(%rax), %r14
+ mov -32(%rax), %r13
+ mov -24(%rax), %r12
+ mov -16(%rax), %rbx
+ mov -8(%rax), %rbp
+ lea (%rax), %rsp # restore %rsp
.Lecb_dec_epilogue:
ret
.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
@@ -1817,7 +1819,7 @@ $code.=<<___;
cmp %rax, %rbp
ja .Lcbc_dec_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -1830,17 +1832,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lcbc_dec_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+ mov -40(%rax), %r14
+ mov -32(%rax), %r13
+ mov -24(%rax), %r12
+ mov -16(%rax), %rbx
+ mov -8(%rax), %rbp
+ lea (%rax), %rsp # restore %rsp
.Lcbc_dec_epilogue:
ret
.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
@@ -2049,7 +2051,7 @@ $code.=<<___;
cmp %rax, %rbp
ja .Lctr_enc_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -2062,17 +2064,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lctr_enc_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+ mov -40(%rax), %r14
+ mov -32(%rax), %r13
+ mov -24(%rax), %r12
+ mov -16(%rax), %rbx
+ mov -8(%rax), %rbp
+ lea (%rax), %rsp # restore %rsp
.Lctr_enc_epilogue:
ret
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
@@ -2439,7 +2441,7 @@ $code.=<<___;
cmp %rax, %rbp
ja .Lxts_enc_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -2452,17 +2454,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lxts_enc_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+ mov -40(%rax), %r14
+ mov -32(%rax), %r13
+ mov -24(%rax), %r12
+ mov -16(%rax), %rbx
+ mov -8(%rax), %rbp
+ lea (%rax), %rsp # restore %rsp
.Lxts_enc_epilogue:
ret
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
@@ -2846,7 +2848,7 @@ $code.=<<___;
cmp %rax, %rbp
ja .Lxts_dec_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
@@ -2859,17 +2861,17 @@ $code.=<<___ if ($win64);
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lxts_dec_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+ mov -40(%rax), %r14
+ mov -32(%rax), %r13
+ mov -24(%rax), %r12
+ mov -16(%rax), %rbx
+ mov -8(%rax), %rbp
+ lea (%rax), %rsp # restore %rsp
.Lxts_dec_epilogue:
ret
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
@@ -2965,31 +2967,34 @@ se_handler:
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<prologue label
- jb .Lin_prologue
-
- mov 152($context),%rax # pull context->Rsp
+ cmp %r10,%rbx # context->Rip<=prologue label
+ jbe .Lin_prologue
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
+ mov 8(%r11),%r10d # HandlerData[2]
+	lea	(%rsi,%r10),%r10	# tail label
+ cmp %r10,%rbx # context->Rip>=tail label
+ jae .Lin_tail
+
mov 160($context),%rax # pull context->Rbp
lea 0x40(%rax),%rsi # %xmm save area
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
- lea 0xa0(%rax),%rax # adjust stack pointer
-
- mov 0x70(%rax),%rbp
- mov 0x68(%rax),%rbx
- mov 0x60(%rax),%r12
- mov 0x58(%rax),%r13
- mov 0x50(%rax),%r14
- mov 0x48(%rax),%r15
- lea 0x78(%rax),%rax # adjust stack pointer
+ lea 0xa0+0x78(%rax),%rax # adjust stack pointer
+
+.Lin_tail:
+ mov -48(%rax),%rbp
+ mov -40(%rax),%rbx
+ mov -32(%rax),%r12
+ mov -24(%rax),%r13
+ mov -16(%rax),%r14
+ mov -8(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
@@ -3070,28 +3075,40 @@ $code.=<<___ if ($ecb);
.byte 9,0,0,0
.rva se_handler
.rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
+ .rva .Lecb_enc_tail
+ .long 0
.Lecb_dec_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
+ .rva .Lecb_dec_tail
+ .long 0
___
$code.=<<___;
.Lcbc_dec_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
+ .rva .Lcbc_dec_tail
+ .long 0
.Lctr_enc_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
+ .rva .Lctr_enc_tail
+ .long 0
.Lxts_enc_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
+ .rva .Lxts_enc_tail
+ .long 0
.Lxts_dec_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+ .rva .Lxts_dec_tail
+ .long 0
___
}
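The `.L*_info` records above now carry a third HandlerData entry (the tail/pop label) plus four bytes of padding; viewed as a C struct, with field names assumed:

    #include <stdint.h>

    /* Layout of one extended .LSEH_info_* record; names are assumed. */
    struct seh_handler_data {
        uint32_t body_rva;      /* HandlerData[0]: .rva .L*_body */
        uint32_t epilogue_rva;  /* HandlerData[1]: .rva .L*_epilogue */
        uint32_t tail_rva;      /* HandlerData[2]: .rva .L*_tail / .L*_pop */
        uint32_t pad;           /* .long 0 keeps the record 8-byte aligned */
    };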
diff --git a/src/crypto/aes/asm/vpaes-x86.pl b/src/crypto/aes/asm/vpaes-x86.pl
index 4fcd5615..ebf90e7e 100644
--- a/src/crypto/aes/asm/vpaes-x86.pl
+++ b/src/crypto/aes/asm/vpaes-x86.pl
@@ -438,7 +438,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
##
&set_label("schedule_192",16);
&movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
- &call ("_vpaes_schedule_transform"); # input transform
+ &call ("_vpaes_schedule_transform"); # input transform
&movdqa ("xmm6","xmm0"); # save short part
&pxor ("xmm4","xmm4"); # clear 4
&movhlps("xmm6","xmm4"); # clobber low side with zeros
@@ -469,7 +469,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
##
&set_label("schedule_256",16);
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
- &call ("_vpaes_schedule_transform"); # input transform
+ &call ("_vpaes_schedule_transform"); # input transform
&mov ($round,7);
&set_label("loop_schedule_256");
@@ -480,7 +480,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
- &call ("_vpaes_schedule_mangle");
+ &call ("_vpaes_schedule_mangle");
# low round. swap xmm7 and xmm6
&pshufd ("xmm0","xmm0",0xFF);
@@ -603,7 +603,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
# subbyte
&movdqa ("xmm4",&QWP($k_s0F,$const));
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
- &movdqa ("xmm1","xmm4");
+ &movdqa ("xmm1","xmm4");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm4"); # 0 = k
diff --git a/src/crypto/aes/asm/vpaes-x86_64.pl b/src/crypto/aes/asm/vpaes-x86_64.pl
index 3f99e368..7a24e0d6 100644
--- a/src/crypto/aes/asm/vpaes-x86_64.pl
+++ b/src/crypto/aes/asm/vpaes-x86_64.pl
@@ -31,6 +31,7 @@
# Nehalem 29.6/40.3/14.6 10.0/11.8
# Atom 57.3/74.2/32.1 60.9/77.2(***)
# Silvermont 52.7/64.0/19.5 48.8/60.8(***)
+# Goldmont 38.9/49.0/17.8 10.6/12.6
#
# (*) "Hyper-threading" in the context refers rather to cache shared
# among multiple cores, than to specifically Intel HTT. As vast
@@ -164,7 +165,7 @@ _vpaes_encrypt_core:
pshufb %xmm1, %xmm0
ret
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
-
+
##
## Decryption core
##
@@ -325,7 +326,7 @@ _vpaes_schedule_core:
##
.Lschedule_128:
mov \$10, %esi
-
+
.Loop_schedule_128:
call _vpaes_schedule_round
dec %rsi
@@ -359,7 +360,7 @@ _vpaes_schedule_core:
.Loop_schedule_192:
call _vpaes_schedule_round
- palignr \$8,%xmm6,%xmm0
+ palignr \$8,%xmm6,%xmm0
call _vpaes_schedule_mangle # save key n
call _vpaes_schedule_192_smear
call _vpaes_schedule_mangle # save key n+1
@@ -385,7 +386,7 @@ _vpaes_schedule_core:
movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
call _vpaes_schedule_transform # input transform
mov \$7, %esi
-
+
.Loop_schedule_256:
call _vpaes_schedule_mangle # output low result
movdqa %xmm0, %xmm6 # save cur_lo in xmm6
@@ -394,7 +395,7 @@ _vpaes_schedule_core:
call _vpaes_schedule_round
dec %rsi
jz .Lschedule_mangle_last
- call _vpaes_schedule_mangle
+ call _vpaes_schedule_mangle
# low round. swap xmm7 and xmm6
pshufd \$0xFF, %xmm0, %xmm0
@@ -402,10 +403,10 @@ _vpaes_schedule_core:
movdqa %xmm6, %xmm7
call _vpaes_schedule_low_round
movdqa %xmm5, %xmm7
-
+
jmp .Loop_schedule_256
-
+
##
## .aes_schedule_mangle_last
##
@@ -504,9 +505,9 @@ _vpaes_schedule_round:
# rotate
pshufd \$0xFF, %xmm0, %xmm0
palignr \$1, %xmm0, %xmm0
-
+
# fall through...
-
+
# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
# smear xmm7
@@ -545,7 +546,7 @@ _vpaes_schedule_low_round:
pxor %xmm4, %xmm0 # 0 = sbox output
# add in smeared stuff
- pxor %xmm7, %xmm0
+ pxor %xmm7, %xmm0
movdqa %xmm0, %xmm7
ret
.size _vpaes_schedule_round,.-_vpaes_schedule_round
diff --git a/src/crypto/asn1/CMakeLists.txt b/src/crypto/asn1/CMakeLists.txt
index 25d8ba22..cd1ee8c2 100644
--- a/src/crypto/asn1/CMakeLists.txt
+++ b/src/crypto/asn1/CMakeLists.txt
@@ -35,6 +35,7 @@ add_library(
tasn_new.c
tasn_typ.c
tasn_utl.c
+ time_support.c
x_bignum.c
x_long.c
)
diff --git a/src/crypto/asn1/a_gentm.c b/src/crypto/asn1/a_gentm.c
index 2f298689..d130cdf8 100644
--- a/src/crypto/asn1/a_gentm.c
+++ b/src/crypto/asn1/a_gentm.c
@@ -61,7 +61,6 @@
#include <openssl/err.h>
#include <openssl/mem.h>
-#include <openssl/time_support.h>
#include "asn1_locl.h"
diff --git a/src/crypto/asn1/a_time.c b/src/crypto/asn1/a_time.c
index a12b38ff..4b584297 100644
--- a/src/crypto/asn1/a_time.c
+++ b/src/crypto/asn1/a_time.c
@@ -63,7 +63,6 @@
#include <openssl/buf.h>
#include <openssl/err.h>
#include <openssl/mem.h>
-#include <openssl/time_support.h>
#include "asn1_locl.h"
diff --git a/src/crypto/asn1/a_utctm.c b/src/crypto/asn1/a_utctm.c
index 3b9d2570..193b83f8 100644
--- a/src/crypto/asn1/a_utctm.c
+++ b/src/crypto/asn1/a_utctm.c
@@ -61,7 +61,6 @@
#include <openssl/err.h>
#include <openssl/mem.h>
-#include <openssl/time_support.h>
#include "asn1_locl.h"
diff --git a/src/crypto/asn1/asn1_locl.h b/src/crypto/asn1/asn1_locl.h
index 982bfd60..ce8146bf 100644
--- a/src/crypto/asn1/asn1_locl.h
+++ b/src/crypto/asn1/asn1_locl.h
@@ -57,7 +57,42 @@
*
*/
+#ifndef OPENSSL_HEADER_ASN1_ASN1_LOCL_H
+#define OPENSSL_HEADER_ASN1_ASN1_LOCL_H
+
+#include <time.h>
+
+#include <openssl/asn1.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+/* Wrapper functions for time functions. */
+
+/* OPENSSL_gmtime wraps |gmtime_r|. See the manual page for that function. */
+struct tm *OPENSSL_gmtime(const time_t *timer, struct tm *result);
+
+/* OPENSSL_gmtime_adj updates |tm| by adding |offset_day| days and |offset_sec|
+ * seconds. */
+int OPENSSL_gmtime_adj(struct tm *tm, int offset_day, long offset_sec);
+
+/* OPENSSL_gmtime_diff calculates the difference between |from| and |to| and
+ * outputs the difference as a number of days and seconds in |*out_days| and
+ * |*out_secs|. */
+int OPENSSL_gmtime_diff(int *out_days, int *out_secs, const struct tm *from,
+ const struct tm *to);
+
+
/* Internal ASN1 structures and functions: not for application use */
int asn1_utctime_to_tm(struct tm *tm, const ASN1_UTCTIME *d);
int asn1_generalizedtime_to_tm(struct tm *tm, const ASN1_GENERALIZEDTIME *d);
+
+
+#if defined(__cplusplus)
+} /* extern C */
+#endif
+
+#endif /* OPENSSL_HEADER_ASN1_ASN1_LOCL_H */
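With the declarations moved into asn1_locl.h, the time wrappers are used roughly like this (a sketch, error handling reduced to early returns):

    #include <time.h>
    #include "asn1_locl.h"

    /* Sketch: day difference between two absolute times via the wrappers
     * declared above. */
    static int days_between(const time_t *a, const time_t *b) {
        struct tm tm_a, tm_b;
        int days, secs;
        if (!OPENSSL_gmtime(a, &tm_a) || !OPENSSL_gmtime(b, &tm_b))
            return -1;
        if (!OPENSSL_gmtime_diff(&days, &secs, &tm_a, &tm_b))
            return -1;
        return days;
    }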
diff --git a/src/crypto/asn1/tasn_dec.c b/src/crypto/asn1/tasn_dec.c
index 40778a84..bf008af1 100644
--- a/src/crypto/asn1/tasn_dec.c
+++ b/src/crypto/asn1/tasn_dec.c
@@ -180,6 +180,7 @@ int ASN1_item_ex_d2i(ASN1_VALUE **pval, const unsigned char **in, long len,
int ret = 0;
ASN1_VALUE **pchptr, *ptmpval;
int combine = aclass & ASN1_TFLG_COMBINE;
+ aclass &= ~ASN1_TFLG_COMBINE;
if (!pval)
return 0;
if (aux && aux->asn1_cb)
@@ -667,6 +668,7 @@ static int asn1_template_noexp_d2i(ASN1_VALUE **val,
}
len -= p - q;
if (!sk_ASN1_VALUE_push((STACK_OF(ASN1_VALUE) *)*val, skfield)) {
+ ASN1_item_ex_free(&skfield, ASN1_ITEM_ptr(tt->item));
OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE);
goto err;
}
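The tasn_dec.c change fixes a leak: `sk_ASN1_VALUE_push` takes ownership of `skfield` only on success, so a failed push must free it before bailing out. A generic C sketch of the pattern, with `push`/`elem_free` as stand-ins:

    #include <stdlib.h>

    /* push() and elem_free() are stand-ins; push() returns 0 on failure
     * and takes ownership of |e| only on success. */
    typedef struct elem elem;
    extern elem *elem_new(void);
    extern void elem_free(elem *e);
    extern int push(void *stack, elem *e);

    static int add_one(void *stack) {
        elem *e = elem_new();
        if (e == NULL) return 0;
        if (!push(stack, e)) {
            elem_free(e);   /* without this, |e| leaks when push fails */
            return 0;
        }
        return 1;
    }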
diff --git a/src/crypto/asn1/tasn_new.c b/src/crypto/asn1/tasn_new.c
index 053b732b..10cf954f 100644
--- a/src/crypto/asn1/tasn_new.c
+++ b/src/crypto/asn1/tasn_new.c
@@ -160,7 +160,7 @@ static int asn1_item_ex_combine_new(ASN1_VALUE **pval, const ASN1_ITEM *it,
}
asn1_set_choice_selector(pval, -1, it);
if (asn1_cb && !asn1_cb(ASN1_OP_NEW_POST, pval, it, NULL))
- goto auxerr;
+ goto auxerr2;
break;
case ASN1_ITYPE_NDEF_SEQUENCE:
@@ -188,10 +188,10 @@ static int asn1_item_ex_combine_new(ASN1_VALUE **pval, const ASN1_ITEM *it,
for (i = 0, tt = it->templates; i < it->tcount; tt++, i++) {
pseqval = asn1_get_field_ptr(pval, tt);
if (!ASN1_template_new(pseqval, tt))
- goto memerr;
+ goto memerr2;
}
if (asn1_cb && !asn1_cb(ASN1_OP_NEW_POST, pval, it, NULL))
- goto auxerr;
+ goto auxerr2;
break;
}
#ifdef CRYPTO_MDEBUG
@@ -200,18 +200,20 @@ static int asn1_item_ex_combine_new(ASN1_VALUE **pval, const ASN1_ITEM *it,
#endif
return 1;
+ memerr2:
+ ASN1_item_ex_free(pval, it);
memerr:
OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE);
- ASN1_item_ex_free(pval, it);
#ifdef CRYPTO_MDEBUG
if (it->sname)
CRYPTO_pop_info();
#endif
return 0;
+ auxerr2:
+ ASN1_item_ex_free(pval, it);
auxerr:
OPENSSL_PUT_ERROR(ASN1, ASN1_R_AUX_ERROR);
- ASN1_item_ex_free(pval, it);
#ifdef CRYPTO_MDEBUG
if (it->sname)
CRYPTO_pop_info();
diff --git a/src/crypto/time_support.c b/src/crypto/asn1/time_support.c
index ae0f4963..194dc3a7 100644
--- a/src/crypto/time_support.c
+++ b/src/crypto/asn1/time_support.c
@@ -59,7 +59,7 @@
#define _POSIX_C_SOURCE 201410L /* for gmtime_r */
#endif
-#include <openssl/time_support.h>
+#include "asn1_locl.h"
#include <time.h>
diff --git a/src/crypto/bn/asm/armv4-mont.pl b/src/crypto/bn/asm/armv4-mont.pl
index cad59551..d7298d2d 100644
--- a/src/crypto/bn/asm/armv4-mont.pl
+++ b/src/crypto/bn/asm/armv4-mont.pl
@@ -16,7 +16,7 @@
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
-# instructions. The latter means that this code didn't really have an
+# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
diff --git a/src/crypto/bn/asm/bn-586.pl b/src/crypto/bn/asm/bn-586.pl
index 096bb9c9..ccc94519 100644
--- a/src/crypto/bn/asm/bn-586.pl
+++ b/src/crypto/bn/asm/bn-586.pl
@@ -47,7 +47,7 @@ sub bn_mul_add_words
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry_in
&jmp(&label("maw_sse2_entry"));
-
+
&set_label("maw_sse2_unrolled",16);
&movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
&paddq("mm1","mm3"); # mm1 = carry_in + r[0]
@@ -668,20 +668,20 @@ sub bn_sub_part_words
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
-
+
&comment("");
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_neg_loop"));
-
+
&set_label("pw_neg_finish",0);
&mov($tmp2,&wparam(4)); # get dl
&mov($num,0);
&sub($num,$tmp2);
&and($num,7);
&jz(&label("pw_end"));
-
+
for ($i=0; $i<7; $i++)
{
&comment("dl<0 Tail Round $i");
@@ -698,9 +698,9 @@ sub bn_sub_part_words
}
&jmp(&label("pw_end"));
-
+
&set_label("pw_pos",0);
-
+
&and($num,0xfffffff8); # num / 8
&jz(&label("pw_pos_finish"));
@@ -715,18 +715,18 @@ sub bn_sub_part_words
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jnc(&label("pw_nc".$i));
}
-
+
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_pos_loop"));
-
+
&set_label("pw_pos_finish",0);
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_end"));
-
+
for ($i=0; $i<7; $i++)
{
&comment("dl>0 Tail Round $i");
@@ -747,17 +747,17 @@ sub bn_sub_part_words
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&set_label("pw_nc".$i,0);
}
-
+
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_nc_loop"));
-
+
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_nc_end"));
-
+
for ($i=0; $i<7; $i++)
{
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
diff --git a/src/crypto/bn/asm/co-586.pl b/src/crypto/bn/asm/co-586.pl
index ec3ea343..c63e5622 100644
--- a/src/crypto/bn/asm/co-586.pl
+++ b/src/crypto/bn/asm/co-586.pl
@@ -41,7 +41,7 @@ sub mul_add_c
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
###
&adc($c2,0);
- # is pos > 1, it means it is the last loop
+	 # if pos > 1, it means it is the last loop
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
}
@@ -70,7 +70,7 @@ sub sqr_add_c
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
###
&adc($c2,0);
- # is pos > 1, it means it is the last loop
+	 # if pos > 1, it means it is the last loop
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
}
@@ -121,7 +121,7 @@ sub bn_mul_comba
$c2="ebp";
$a="esi";
$b="edi";
-
+
$as=0;
$ae=0;
$bs=0;
@@ -136,9 +136,9 @@ sub bn_mul_comba
&push("ebx");
&xor($c0,$c0);
- &mov("eax",&DWP(0,$a,"",0)); # load the first word
+ &mov("eax",&DWP(0,$a,"",0)); # load the first word
&xor($c1,$c1);
- &mov("edx",&DWP(0,$b,"",0)); # load the first second
+ &mov("edx",&DWP(0,$b,"",0)); # load the first second
for ($i=0; $i<$tot; $i++)
{
@@ -146,7 +146,7 @@ sub bn_mul_comba
$bi=$bs;
$end=$be+1;
- &comment("################## Calculate word $i");
+ &comment("################## Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
diff --git a/src/crypto/bn/asm/rsaz-avx2.pl b/src/crypto/bn/asm/rsaz-avx2.pl
index b8e830e2..5562d691 100755
--- a/src/crypto/bn/asm/rsaz-avx2.pl
+++ b/src/crypto/bn/asm/rsaz-avx2.pl
@@ -145,13 +145,21 @@ $code.=<<___;
.type rsaz_1024_sqr_avx2,\@function,5
.align 64
rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
+.cfi_startproc
lea (%rsp), %rax
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
vzeroupper
___
$code.=<<___ if ($win64);
@@ -170,6 +178,7 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
mov %rax,%rbp
+.cfi_def_cfa_register %rbp
mov %rdx, $np # reassigned argument
sub \$$FrameSize, %rsp
mov $np, $tmp
@@ -359,7 +368,7 @@ $code.=<<___;
vpaddq $TEMP1, $ACC1, $ACC1
vpmuludq 32*7-128($aap), $B2, $ACC2
vpbroadcastq 32*5-128($tpa), $B2
- vpaddq 32*11-448($tp1), $ACC2, $ACC2
+ vpaddq 32*11-448($tp1), $ACC2, $ACC2
vmovdqu $ACC6, 32*6-192($tp0)
vmovdqu $ACC7, 32*7-192($tp0)
@@ -418,7 +427,7 @@ $code.=<<___;
vmovdqu $ACC7, 32*16-448($tp1)
lea 8($tp1), $tp1
- dec $i
+ dec $i
jnz .LOOP_SQR_1024
___
$ZERO = $ACC9;
@@ -763,7 +772,7 @@ $code.=<<___;
vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
vpaddq $TEMP3, $ACC7, $ACC7
vpaddq $TEMP4, $ACC8, $ACC8
-
+
vpsrlq \$29, $ACC4, $TEMP1
vpand $AND_MASK, $ACC4, $ACC4
vpsrlq \$29, $ACC5, $TEMP2
@@ -802,6 +811,7 @@ $code.=<<___;
vzeroall
mov %rbp, %rax
+.cfi_def_cfa_register %rax
___
$code.=<<___ if ($win64);
.Lsqr_1024_in_tail:
@@ -818,14 +828,22 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
mov -48(%rax),%r15
+.cfi_restore %r15
mov -40(%rax),%r14
+.cfi_restore %r14
mov -32(%rax),%r13
+.cfi_restore %r13
mov -24(%rax),%r12
+.cfi_restore %r12
mov -16(%rax),%rbp
+.cfi_restore %rbp
mov -8(%rax),%rbx
+.cfi_restore %rbx
lea (%rax),%rsp # restore %rsp
+.cfi_def_cfa_register %rsp
.Lsqr_1024_epilogue:
ret
+.cfi_endproc
.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
___
}
@@ -878,13 +896,21 @@ $code.=<<___;
.type rsaz_1024_mul_avx2,\@function,5
.align 64
rsaz_1024_mul_avx2:
+.cfi_startproc
lea (%rsp), %rax
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
___
$code.=<<___ if ($win64);
vzeroupper
@@ -903,6 +929,7 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
mov %rax,%rbp
+.cfi_def_cfa_register %rbp
vzeroall
mov %rdx, $bp # reassigned argument
sub \$64,%rsp
@@ -1429,13 +1456,14 @@ $code.=<<___;
vpaddq $TEMP4, $ACC8, $ACC8
vmovdqu $ACC4, 128-128($rp)
- vmovdqu $ACC5, 160-128($rp)
+ vmovdqu $ACC5, 160-128($rp)
vmovdqu $ACC6, 192-128($rp)
vmovdqu $ACC7, 224-128($rp)
vmovdqu $ACC8, 256-128($rp)
vzeroupper
mov %rbp, %rax
+.cfi_def_cfa_register %rax
___
$code.=<<___ if ($win64);
.Lmul_1024_in_tail:
@@ -1452,14 +1480,22 @@ $code.=<<___ if ($win64);
___
$code.=<<___;
mov -48(%rax),%r15
+.cfi_restore %r15
mov -40(%rax),%r14
+.cfi_restore %r14
mov -32(%rax),%r13
+.cfi_restore %r13
mov -24(%rax),%r12
+.cfi_restore %r12
mov -16(%rax),%rbp
+.cfi_restore %rbp
mov -8(%rax),%rbx
+.cfi_restore %rbx
lea (%rax),%rsp # restore %rsp
+.cfi_def_cfa_register %rsp
.Lmul_1024_epilogue:
ret
+.cfi_endproc
.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
___
}
@@ -1578,8 +1614,10 @@ rsaz_1024_scatter5_avx2:
.type rsaz_1024_gather5_avx2,\@abi-omnipotent
.align 32
rsaz_1024_gather5_avx2:
+.cfi_startproc
vzeroupper
mov %rsp,%r11
+.cfi_def_cfa_register %r11
___
$code.=<<___ if ($win64);
lea -0x88(%rsp),%rax
@@ -1717,11 +1755,13 @@ $code.=<<___ if ($win64);
movaps -0x38(%r11),%xmm13
movaps -0x28(%r11),%xmm14
movaps -0x18(%r11),%xmm15
-.LSEH_end_rsaz_1024_gather5:
___
$code.=<<___;
lea (%r11),%rsp
+.cfi_def_cfa_register %rsp
ret
+.cfi_endproc
+.LSEH_end_rsaz_1024_gather5:
.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
___
}
diff --git a/src/crypto/bn/asm/x86-mont.pl b/src/crypto/bn/asm/x86-mont.pl
index 4b5d05db..57fbf10b 100644..100755
--- a/src/crypto/bn/asm/x86-mont.pl
+++ b/src/crypto/bn/asm/x86-mont.pl
@@ -32,7 +32,7 @@ require "x86asm.pl";
$output = pop;
open STDOUT,">$output";
-
+
&asm_init($ARGV[0],$0);
$sse2=0;
@@ -66,33 +66,57 @@ $frame=32; # size of above frame rounded up to 16n
&lea ("esi",&wparam(0)); # put aside pointer to argument block
&lea ("edx",&wparam(1)); # load ap
- &mov ("ebp","esp"); # saved stack pointer!
&add ("edi",2); # extra two words on top of tp
&neg ("edi");
- &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
+ &lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2))
&neg ("edi");
 # minimize cache contention by arranging 2K window between stack
# pointer and ap argument [np is also position sensitive vector,
# but it's assumed to be near ap, as it's allocated at ~same
# time].
- &mov ("eax","esp");
+ &mov ("eax","ebp");
&sub ("eax","edx");
&and ("eax",2047);
- &sub ("esp","eax"); # this aligns sp and ap modulo 2048
+ &sub ("ebp","eax"); # this aligns sp and ap modulo 2048
- &xor ("edx","esp");
+ &xor ("edx","ebp");
&and ("edx",2048);
&xor ("edx",2048);
- &sub ("esp","edx"); # this splits them apart modulo 4096
-
- &and ("esp",-64); # align to cache line
+ &sub ("ebp","edx"); # this splits them apart modulo 4096
+
+ &and ("ebp",-64); # align to cache line
+
+ # An OS-agnostic version of __chkstk.
+ #
+ # Some OSes (Windows) insist on stack being "wired" to
+ # physical memory in strictly sequential manner, i.e. if stack
+ # allocation spans two pages, then reference to farmost one can
+ # be punishable by SEGV. But page walking can do good even on
+ # other OSes, because it guarantees that villain thread hits
+	# the guard page before it can do damage to the innocent one...
+ &mov ("eax","esp");
+ &sub ("eax","ebp");
+ &and ("eax",-4096);
+ &mov ("edx","esp"); # saved stack pointer!
+ &lea ("esp",&DWP(0,"ebp","eax"));
+ &mov ("eax",&DWP(0,"esp"));
+ &cmp ("esp","ebp");
+ &ja (&label("page_walk"));
+ &jmp (&label("page_walk_done"));
+
+&set_label("page_walk",16);
+ &lea ("esp",&DWP(-4096,"esp"));
+ &mov ("eax",&DWP(0,"esp"));
+ &cmp ("esp","ebp");
+ &ja (&label("page_walk"));
+&set_label("page_walk_done");
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
- &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+ &mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
#&mov ("edi",&DWP(5*4,"esi"));# int num
@@ -100,11 +124,11 @@ $frame=32; # size of above frame rounded up to 16n
&mov ($_rp,"eax"); # ... save a copy of argument block
&mov ($_ap,"ebx");
&mov ($_bp,"ecx");
- &mov ($_np,"edx");
+ &mov ($_np,"ebp");
&mov ($_n0,"esi");
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
#&mov ($_num,$num); # redundant as $num is not reused
- &mov ($_sp,"ebp"); # saved stack pointer!
+ &mov ($_sp,"edx"); # saved stack pointer!
if($sse2) {
$acc0="mm0"; # mmx register bank layout
@@ -270,7 +294,7 @@ if (0) {
&xor ("eax","eax"); # signal "not fast enough [yet]"
&jmp (&label("just_leave"));
# While the below code provides competitive performance for
- # all key lengthes on modern Intel cores, it's still more
+ # all key lengths on modern Intel cores, it's still more
# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
# means compared to the original integer-only assembler.
# 512-bit RSA sign is better by ~40%, but that's about all
@@ -573,15 +597,16 @@ $sbit=$num;
&jge (&label("sub"));
&sbb ("eax",0); # handle upmost overflow bit
+ &and ($tp,"eax");
+ &not ("eax");
+ &mov ($np,$rp);
+ &and ($np,"eax");
+ &or ($tp,$np); # tp=carry?tp:rp
&set_label("copy",16); # copy or in-place refresh
- &mov ("edx",&DWP(0,$tp,$num,4));
- &mov ($np,&DWP(0,$rp,$num,4));
- &xor ("edx",$np); # conditional select
- &and ("edx","eax");
- &xor ("edx",$np);
- &mov (&DWP(0,$tp,$num,4),$j) # zap temporary vector
- &mov (&DWP(0,$rp,$num,4),"edx"); # rp[i]=tp[i]
+ &mov ("eax",&DWP(0,$tp,$num,4));
+ &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
+ &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
&dec ($num);
&jge (&label("copy"));
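The rewritten copy loop trades a masked select per word for one masked pointer select followed by a plain copy; the 64-bit variants later in this change apply the same idea. A C sketch, with the borrow condition passed in and names assumed:

    #include <stdint.h>

    /* borrowed is the carry out of the conditional subtraction; when set,
     * the pre-subtraction copy in |tp| is the right result, otherwise
     * |rp| already holds it ("refresh in place"). */
    static void copy_result(uint32_t *rp, uint32_t *tp, int borrowed, int num) {
        uintptr_t m = borrowed ? ~(uintptr_t)0 : 0;          /* mask from sbb */
        const uint32_t *s =
            (const uint32_t *)(((uintptr_t)tp & m) | ((uintptr_t)rp & ~m));
        for (int i = 0; i < num; i++) {
            rp[i] = s[i];   /* straight copy instead of per-word select */
            tp[i] = 0;      /* zap the temporary vector */
        }
    }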
diff --git a/src/crypto/bn/asm/x86_64-mont.pl b/src/crypto/bn/asm/x86_64-mont.pl
index 60e0111a..5775f658 100755
--- a/src/crypto/bn/asm/x86_64-mont.pl
+++ b/src/crypto/bn/asm/x86_64-mont.pl
@@ -84,6 +84,10 @@ $code=<<___;
.type bn_mul_mont,\@function,6
.align 16
bn_mul_mont:
+.cfi_startproc
+ mov ${num}d,${num}d
+ mov %rsp,%rax
+.cfi_def_cfa_register %rax
test \$3,${num}d
jnz .Lmul_enter
cmp \$8,${num}d
@@ -102,20 +106,50 @@ $code.=<<___;
.align 16
.Lmul_enter:
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
- mov ${num}d,${num}d
- lea 2($num),%r10
+ neg $num
mov %rsp,%r11
- neg %r10
- lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2))
- and \$-1024,%rsp # minimize TLB usage
+ lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2))
+ neg $num # restore $num
+ and \$-1024,%r10 # minimize TLB usage
- mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
+ # An OS-agnostic version of __chkstk.
+ #
+ # Some OSes (Windows) insist on stack being "wired" to
+ # physical memory in strictly sequential manner, i.e. if stack
+ # allocation spans two pages, then reference to farmost one can
+ # be punishable by SEGV. But page walking can do good even on
+ # other OSes, because it guarantees that villain thread hits
+	# the guard page before it can do damage to the innocent one...
+ sub %r10,%r11
+ and \$-4096,%r11
+ lea (%r10,%r11),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul_page_walk
+ jmp .Lmul_page_walk_done
+
+.align 16
+.Lmul_page_walk:
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul_page_walk
+.Lmul_page_walk_done:
+
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8
.Lmul_body:
mov $bp,%r12 # reassign $bp
___
@@ -265,36 +299,46 @@ $code.=<<___;
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
mov 8($ap,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
- dec $j # doesn't affect CF!
+	dec	$j		# doesn't affect CF!
jnz .Lsub
sbb \$0,%rax # handle upmost overflow bit
xor $i,$i
+ and %rax,$ap
+ not %rax
+ mov $rp,$np
+ and %rax,$np
mov $num,$j # j=num
+ or $np,$ap # ap=borrow?tp:rp
.align 16
.Lcopy: # copy or in-place refresh
- mov (%rsp,$i,8),$ap
- mov ($rp,$i,8),$np
- xor $np,$ap # conditional select:
- and %rax,$ap # ((ap ^ np) & %rax) ^ np
- xor $np,$ap # ap = borrow?tp:rp
+ mov ($ap,$i,8),%rax
mov $i,(%rsp,$i,8) # zap temporary vector
- mov $ap,($rp,$i,8) # rp[i]=tp[i]
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]
lea 1($i),$i
sub \$1,$j
jnz .Lcopy
mov 8(%rsp,$num,8),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
mov \$1,%rax
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lmul_epilogue:
ret
+.cfi_endproc
.size bn_mul_mont,.-bn_mul_mont
___
{{{
@@ -304,6 +348,10 @@ $code.=<<___;
.type bn_mul4x_mont,\@function,6
.align 16
bn_mul4x_mont:
+.cfi_startproc
+ mov ${num}d,${num}d
+ mov %rsp,%rax
+.cfi_def_cfa_register %rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
@@ -313,20 +361,41 @@ $code.=<<___ if ($addx);
___
$code.=<<___;
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
- mov ${num}d,${num}d
- lea 4($num),%r10
+ neg $num
mov %rsp,%r11
- neg %r10
- lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4))
- and \$-1024,%rsp # minimize TLB usage
+ lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4))
+ neg $num # restore
+ and \$-1024,%r10 # minimize TLB usage
- mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
+ sub %r10,%r11
+ and \$-4096,%r11
+ lea (%r10,%r11),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul4x_page_walk
+ jmp .Lmul4x_page_walk_done
+
+.Lmul4x_page_walk:
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8
.Lmul4x_body:
mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
mov %rdx,%r12 # reassign $bp
@@ -633,9 +702,11 @@ ___
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
mov 16(%rsp,$num,8),$rp # restore $rp
+ lea -4($num),$j
mov 0(%rsp),@ri[0] # tp[0]
+ pxor %xmm0,%xmm0
mov 8(%rsp),@ri[1] # tp[1]
- shr \$2,$num # num/=4
+ shr \$2,$j # j=num/4-1
lea (%rsp),$ap # borrow ap for tp
xor $i,$i # i=0 and clear CF!
@@ -643,7 +714,6 @@ $code.=<<___;
mov 16($ap),@ri[2] # tp[2]
mov 24($ap),@ri[3] # tp[3]
sbb 8($np),@ri[1]
- lea -1($num),$j # j=num/4-1
jmp .Lsub4x
.align 16
.Lsub4x:
@@ -671,50 +741,58 @@ $code.=<<___;
mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
sbb \$0,@ri[0] # handle upmost overflow bit
- mov @ri[0],%xmm0
- punpcklqdq %xmm0,%xmm0 # extend mask to 128 bits
mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
xor $i,$i # i=0
-
- mov $num,$j
- pxor %xmm5,%xmm5
+ and @ri[0],$ap
+ not @ri[0]
+ mov $rp,$np
+ and @ri[0],$np
+ lea -4($num),$j
+ or $np,$ap # ap=borrow?tp:rp
+ shr \$2,$j # j=num/4-1
+
+ movdqu ($ap),%xmm1
+ movdqa %xmm0,(%rsp)
+ movdqu %xmm1,($rp)
jmp .Lcopy4x
.align 16
-.Lcopy4x: # copy or in-place refresh
- movdqu (%rsp,$i),%xmm2
- movdqu 16(%rsp,$i),%xmm4
- movdqu ($rp,$i),%xmm1
- movdqu 16($rp,$i),%xmm3
- pxor %xmm1,%xmm2 # conditional select
- pxor %xmm3,%xmm4
- pand %xmm0,%xmm2
- pand %xmm0,%xmm4
- pxor %xmm1,%xmm2
- pxor %xmm3,%xmm4
- movdqu %xmm2,($rp,$i)
- movdqu %xmm4,16($rp,$i)
- movdqa %xmm5,(%rsp,$i) # zap temporary vectors
- movdqa %xmm5,16(%rsp,$i)
-
+.Lcopy4x: # copy or in-place refresh
+ movdqu 16($ap,$i),%xmm2
+ movdqu 32($ap,$i),%xmm1
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
+ movdqa %xmm0,32(%rsp,$i)
+ movdqu %xmm1,32($rp,$i)
lea 32($i),$i
dec $j
jnz .Lcopy4x
- shl \$2,$num
+ movdqu 16($ap,$i),%xmm2
+ movdqa %xmm0,16(%rsp,$i)
+ movdqu %xmm2,16($rp,$i)
___
}
$code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
+.cfi_def_cfa %rsi, 8
mov \$1,%rax
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lmul4x_epilogue:
ret
+.cfi_endproc
.size bn_mul4x_mont,.-bn_mul4x_mont
___
}}}
@@ -742,14 +820,23 @@ $code.=<<___;
.type bn_sqr8x_mont,\@function,6
.align 32
bn_sqr8x_mont:
-.Lsqr8x_enter:
+.cfi_startproc
mov %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lsqr8x_enter:
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
+.Lsqr8x_prologue:
mov ${num}d,%r10d
shl \$3,${num}d # convert $num to bytes
@@ -762,30 +849,49 @@ bn_sqr8x_mont:
# do its job.
#
lea -64(%rsp,$num,2),%r11
+ mov %rsp,%rbp
mov ($n0),$n0 # *n0
sub $aptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lsqr8x_sp_alt
- sub %r11,%rsp # align with $aptr
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ sub %r11,%rbp # align with $aptr
+ lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
jmp .Lsqr8x_sp_done
.align 32
.Lsqr8x_sp_alt:
lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
- lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lsqr8x_sp_done:
- and \$-64,%rsp
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
+ and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lsqr8x_page_walk
+ jmp .Lsqr8x_page_walk_done
+
+.align 16
+.Lsqr8x_page_walk:
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lsqr8x_page_walk
+.Lsqr8x_page_walk_done:
+
mov $num,%r10
neg $num
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
+.cfi_cfa_expression %rsp+40,deref,+8
.Lsqr8x_body:
movq $nptr, %xmm2 # save pointer to modulus
@@ -855,6 +961,7 @@ $code.=<<___;
pxor %xmm0,%xmm0
pshufd \$0,%xmm1,%xmm1
mov 40(%rsp),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
jmp .Lsqr8x_cond_copy
.align 32
@@ -884,14 +991,22 @@ $code.=<<___;
mov \$1,%rax
mov -48(%rsi),%r15
+.cfi_restore %r15
mov -40(%rsi),%r14
+.cfi_restore %r14
mov -32(%rsi),%r13
+.cfi_restore %r13
mov -24(%rsi),%r12
+.cfi_restore %r12
mov -16(%rsi),%rbp
+.cfi_restore %rbp
mov -8(%rsi),%rbx
+.cfi_restore %rbx
lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lsqr8x_epilogue:
ret
+.cfi_endproc
.size bn_sqr8x_mont,.-bn_sqr8x_mont
___
}}}
@@ -903,23 +1018,48 @@ $code.=<<___;
.type bn_mulx4x_mont,\@function,6
.align 32
bn_mulx4x_mont:
-.Lmulx4x_enter:
+.cfi_startproc
mov %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lmulx4x_enter:
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
+.Lmulx4x_prologue:
shl \$3,${num}d # convert $num to bytes
- .byte 0x67
xor %r10,%r10
sub $num,%r10 # -$num
mov ($n0),$n0 # *n0
- lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8)
+ lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8)
+ and \$-128,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
+ and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmulx4x_page_walk
+ jmp .Lmulx4x_page_walk_done
+
+.align 16
+.Lmulx4x_page_walk:
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+
lea ($bp,$num),%r10
- and \$-128,%rsp
##############################################################
# Stack layout
# +0 num
@@ -939,6 +1079,7 @@ bn_mulx4x_mont:
mov $n0, 24(%rsp) # save *n0
mov $rp, 32(%rsp) # save $rp
mov %rax,40(%rsp) # save original %rsp
+.cfi_cfa_expression %rsp+40,deref,+8
mov $num,48(%rsp) # inner counter
jmp .Lmulx4x_body
@@ -1188,6 +1329,7 @@ $code.=<<___;
pxor %xmm0,%xmm0
pshufd \$0,%xmm1,%xmm1
mov 40(%rsp),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
jmp .Lmulx4x_cond_copy
.align 32
@@ -1217,14 +1359,22 @@ $code.=<<___;
mov \$1,%rax
mov -48(%rsi),%r15
+.cfi_restore %r15
mov -40(%rsi),%r14
+.cfi_restore %r14
mov -32(%rsi),%r13
+.cfi_restore %r13
mov -24(%rsi),%r12
+.cfi_restore %r12
mov -16(%rsi),%rbp
+.cfi_restore %rbp
mov -8(%rsi),%rbx
+.cfi_restore %rbx
lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lmulx4x_epilogue:
ret
+.cfi_endproc
.size bn_mulx4x_mont,.-bn_mulx4x_mont
___
}}}
@@ -1277,22 +1427,8 @@ mul_handler:
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
- lea 48(%rax),%rax
-
- mov -8(%rax),%rbx
- mov -16(%rax),%rbp
- mov -24(%rax),%r12
- mov -32(%rax),%r13
- mov -40(%rax),%r14
- mov -48(%rax),%r15
- mov %rbx,144($context) # restore context->Rbx
- mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore context->R12
- mov %r13,224($context) # restore context->R13
- mov %r14,232($context) # restore context->R14
- mov %r15,240($context) # restore context->R15
- jmp .Lcommon_seh_tail
+ jmp .Lcommon_pop_regs
.size mul_handler,.-mul_handler
.type sqr_handler,\@abi-omnipotent
@@ -1317,18 +1453,24 @@ sqr_handler:
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # end of prologue label
- cmp %r10,%rbx # context->Rip<.Lsqr_body
+ cmp %r10,%rbx # context->Rip<.Lsqr_prologue
jb .Lcommon_seh_tail
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # body label
+ cmp %r10,%rbx # context->Rip<.Lsqr_body
+ jb .Lcommon_pop_regs
+
mov 152($context),%rax # pull context->Rsp
- mov 4(%r11),%r10d # HandlerData[1]
+ mov 8(%r11),%r10d # HandlerData[2]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
jae .Lcommon_seh_tail
mov 40(%rax),%rax # pull saved stack pointer
+.Lcommon_pop_regs:
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
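
The reshaped handler above splits each function into three unwind regions instead of two: before the prologue label nothing has been pushed; between the prologue and body labels the callee-saved registers are on the stack but %rsp has not yet been switched to the allocated frame; past the body label the saved stack pointer must be reloaded before the registers can be popped at .Lcommon_pop_regs. A compact C model of that dispatch, with hypothetical names (the generated handler is the assembly above):

#include <stdint.h>

typedef enum { SEH_TAIL, SEH_POP_REGS, SEH_LOAD_SAVED_RSP } seh_action;

/* rip is context->Rip; the three addresses are resolved from
 * HandlerData[0..2] plus the image base, as in the diff. */
static seh_action classify(uint64_t rip, uint64_t prologue,
                           uint64_t body, uint64_t epilogue) {
  if (rip < prologue) return SEH_TAIL;      /* nothing pushed yet */
  if (rip < body) return SEH_POP_REGS;      /* regs pushed, frame not set */
  if (rip >= epilogue) return SEH_TAIL;     /* frame already torn down */
  return SEH_LOAD_SAVED_RSP;                /* reload saved %rsp, then pop */
}
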
@@ -1415,13 +1557,15 @@ $code.=<<___;
.LSEH_info_bn_sqr8x_mont:
.byte 9,0,0,0
.rva sqr_handler
- .rva .Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]
+ .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]
+.align 8
___
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
.byte 9,0,0,0
.rva sqr_handler
- .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+ .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+.align 8
___
}
diff --git a/src/crypto/bn/asm/x86_64-mont5.pl b/src/crypto/bn/asm/x86_64-mont5.pl
index 61fde2d2..bf68aadd 100755
--- a/src/crypto/bn/asm/x86_64-mont5.pl
+++ b/src/crypto/bn/asm/x86_64-mont5.pl
@@ -73,6 +73,10 @@ $code=<<___;
.type bn_mul_mont_gather5,\@function,6
.align 64
bn_mul_mont_gather5:
+.cfi_startproc
+ mov ${num}d,${num}d
+ mov %rsp,%rax
+.cfi_def_cfa_register %rax
test \$7,${num}d
jnz .Lmul_enter
___
@@ -84,24 +88,54 @@ $code.=<<___;
.align 16
.Lmul_enter:
- mov ${num}d,${num}d
- mov %rsp,%rax
movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
- lea .Linc(%rip),%r10
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
- lea 2($num),%r11
- neg %r11
- lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8)
- and \$-1024,%rsp # minimize TLB usage
+ neg $num
+ mov %rsp,%r11
+ lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8)
+ neg $num # restore $num
+ and \$-1024,%r10 # minimize TLB usage
+
+ # An OS-agnostic version of __chkstk.
+ #
+	# Some OSes (Windows) insist on the stack being "wired" to
+	# physical memory in a strictly sequential manner, i.e. if a
+	# stack allocation spans two pages, a reference to the farther
+	# one can be punished with a SEGV. But page walking does good
+	# even on other OSes, because it guarantees that a villain
+	# thread hits the guard page before it can damage an innocent
+	# one...
+ sub %r10,%r11
+ and \$-4096,%r11
+ lea (%r10,%r11),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul_page_walk
+ jmp .Lmul_page_walk_done
+
+.Lmul_page_walk:
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul_page_walk
+.Lmul_page_walk_done:
+ lea .Linc(%rip),%r10
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8
.Lmul_body:
+
lea 128($bp),%r12 # reassign $bp (+size optimization)
___
$bp="%r12";
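
The probe loop above, repeated before every large allocation in this file, is what the comment calls an OS-agnostic __chkstk: after computing the final stack pointer it walks back toward the old one a page at a time, touching one word per 4096-byte page so pages are faulted in strictly in order and a guard page is always the first thing hit. A hedged C sketch of the same idea (illustrative only; real stack probing has to be done in assembly, as above):

#include <stddef.h>

/* Touch one byte in every 4096-byte page between the current stack
 * top and the new, lower stack pointer, highest address first, so a
 * guard page is reached before any page beyond it. */
static void page_walk(volatile const unsigned char *old_sp,
                      size_t alloc_bytes) {
  for (size_t off = 4096; off <= alloc_bytes; off += 4096) {
    (void)*(old_sp - off);  /* probe; may fault into the guard page */
  }
}
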
@@ -370,32 +404,42 @@ $code.=<<___;
sbb \$0,%rax # handle upmost overflow bit
xor $i,$i
+ and %rax,$ap
+ not %rax
+ mov $rp,$np
+ and %rax,$np
mov $num,$j # j=num
+ or $np,$ap # ap=borrow?tp:rp
.align 16
.Lcopy: # copy or in-place refresh
- mov (%rsp,$i,8),$ap
- mov ($rp,$i,8),$np
- xor $np,$ap # conditional select:
- and %rax,$ap # ((ap ^ np) & %rax) ^ np
- xor $np,$ap # ap = borrow?tp:rp
+ mov ($ap,$i,8),%rax
mov $i,(%rsp,$i,8) # zap temporary vector
- mov $ap,($rp,$i,8) # rp[i]=tp[i]
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]
lea 1($i),$i
sub \$1,$j
jnz .Lcopy
mov 8(%rsp,$num,8),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
mov \$1,%rax
mov -48(%rsi),%r15
+.cfi_restore %r15
mov -40(%rsi),%r14
+.cfi_restore %r14
mov -32(%rsi),%r13
+.cfi_restore %r13
mov -24(%rsi),%r12
+.cfi_restore %r12
mov -16(%rsi),%rbp
+.cfi_restore %rbp
mov -8(%rsi),%rbx
+.cfi_restore %rbx
lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lmul_epilogue:
ret
+.cfi_endproc
.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
{{{
@@ -405,6 +449,10 @@ $code.=<<___;
.type bn_mul4x_mont_gather5,\@function,6
.align 32
bn_mul4x_mont_gather5:
+.cfi_startproc
+ .byte 0x67
+ mov %rsp,%rax
+.cfi_def_cfa_register %rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
@@ -413,14 +461,19 @@ $code.=<<___ if ($addx);
je .Lmulx4x_enter
___
$code.=<<___;
- .byte 0x67
- mov %rsp,%rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
+.Lmul4x_prologue:
.byte 0x67
shl \$3,${num}d # convert $num to bytes
@@ -437,43 +490,70 @@ $code.=<<___;
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmul4xsp_alt
- sub %r11,%rsp # align with $rp
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ sub %r11,%rbp # align with $rp
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
jmp .Lmul4xsp_done
.align 32
.Lmul4xsp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lmul4xsp_done:
- and \$-64,%rsp
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
+ and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmul4x_page_walk
+ jmp .Lmul4x_page_walk_done
+
+.Lmul4x_page_walk:
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+
neg $num
mov %rax,40(%rsp)
+.cfi_cfa_expression %rsp+40,deref,+8
.Lmul4x_body:
call mul4x_internal
mov 40(%rsp),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
mov \$1,%rax
mov -48(%rsi),%r15
+.cfi_restore %r15
mov -40(%rsi),%r14
+.cfi_restore %r14
mov -32(%rsi),%r13
+.cfi_restore %r13
mov -24(%rsi),%r12
+.cfi_restore %r12
mov -16(%rsi),%rbp
+.cfi_restore %rbp
mov -8(%rsi),%rbx
+.cfi_restore %rbx
lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lmul4x_epilogue:
ret
+.cfi_endproc
.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
.type mul4x_internal,\@abi-omnipotent
@@ -985,7 +1065,7 @@ my $bptr="%rdx"; # const void *table,
my $nptr="%rcx"; # const BN_ULONG *nptr,
my $n0 ="%r8"; # const BN_ULONG *n0);
my $num ="%r9"; # int num, has to be divisible by 8
- # int pwr
+ # int pwr
my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
@@ -997,6 +1077,9 @@ $code.=<<___;
.type bn_power5,\@function,6
.align 32
bn_power5:
+.cfi_startproc
+ mov %rsp,%rax
+.cfi_def_cfa_register %rax
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%r11d
@@ -1005,13 +1088,19 @@ $code.=<<___ if ($addx);
je .Lpowerx5_enter
___
$code.=<<___;
- mov %rsp,%rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
+.Lpower5_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10d # 3*$num
@@ -1026,25 +1115,42 @@ $code.=<<___;
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwr_sp_alt
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
jmp .Lpwr_sp_done
.align 32
.Lpwr_sp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lpwr_sp_done:
- and \$-64,%rsp
- mov $num,%r10
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
+ and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lpwr_page_walk
+ jmp .Lpwr_page_walk_done
+
+.Lpwr_page_walk:
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lpwr_page_walk
+.Lpwr_page_walk_done:
+
+ mov $num,%r10
neg $num
##############################################################
@@ -1058,6 +1164,7 @@ $code.=<<___;
#
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
+.cfi_cfa_expression %rsp+40,deref,+8
.Lpower5_body:
movq $rptr,%xmm1 # save $rptr, used in sqr8x
movq $nptr,%xmm2 # save $nptr
@@ -1084,16 +1191,25 @@ $code.=<<___;
call mul4x_internal
mov 40(%rsp),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
mov \$1,%rax
mov -48(%rsi),%r15
+.cfi_restore %r15
mov -40(%rsi),%r14
+.cfi_restore %r14
mov -32(%rsi),%r13
+.cfi_restore %r13
mov -24(%rsi),%r12
+.cfi_restore %r12
mov -16(%rsi),%rbp
+.cfi_restore %rbp
mov -8(%rsi),%rbx
+.cfi_restore %rbx
lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lpower5_epilogue:
ret
+.cfi_endproc
.size bn_power5,.-bn_power5
.globl bn_sqr8x_internal
@@ -1953,7 +2069,7 @@ __bn_post4x_internal:
jnz .Lsqr4x_sub
mov $num,%r10 # prepare for back-to-back call
- neg $num # restore $num
+ neg $num # restore $num
ret
.size __bn_post4x_internal,.-__bn_post4x_internal
___
@@ -1973,14 +2089,23 @@ bn_from_montgomery:
.type bn_from_mont8x,\@function,6
.align 32
bn_from_mont8x:
+.cfi_startproc
.byte 0x67
mov %rsp,%rax
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
+.Lfrom_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
@@ -1995,25 +2120,42 @@ bn_from_mont8x:
# last operation, we use the opportunity to cleanse it.
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lfrom_sp_alt
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lfrom_sp_done
.align 32
.Lfrom_sp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lfrom_sp_done:
- and \$-64,%rsp
- mov $num,%r10
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
+ and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lfrom_page_walk
+ jmp .Lfrom_page_walk_done
+
+.Lfrom_page_walk:
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lfrom_page_walk
+.Lfrom_page_walk_done:
+
+ mov $num,%r10
neg $num
##############################################################
@@ -2027,6 +2169,7 @@ bn_from_mont8x:
#
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
+.cfi_cfa_expression %rsp+40,deref,+8
.Lfrom_body:
mov $num,%r11
lea 48(%rsp),%rax
@@ -2070,7 +2213,6 @@ $code.=<<___ if ($addx);
pxor %xmm0,%xmm0
lea 48(%rsp),%rax
- mov 40(%rsp),%rsi # restore %rsp
jmp .Lfrom_mont_zero
.align 32
@@ -2082,11 +2224,12 @@ $code.=<<___;
pxor %xmm0,%xmm0
lea 48(%rsp),%rax
- mov 40(%rsp),%rsi # restore %rsp
jmp .Lfrom_mont_zero
.align 32
.Lfrom_mont_zero:
+ mov 40(%rsp),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
movdqa %xmm0,16*0(%rax)
movdqa %xmm0,16*1(%rax)
movdqa %xmm0,16*2(%rax)
@@ -2097,14 +2240,22 @@ $code.=<<___;
mov \$1,%rax
mov -48(%rsi),%r15
+.cfi_restore %r15
mov -40(%rsi),%r14
+.cfi_restore %r14
mov -32(%rsi),%r13
+.cfi_restore %r13
mov -24(%rsi),%r12
+.cfi_restore %r12
mov -16(%rsi),%rbp
+.cfi_restore %rbp
mov -8(%rsi),%rbx
+.cfi_restore %rbx
lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lfrom_epilogue:
ret
+.cfi_endproc
.size bn_from_mont8x,.-bn_from_mont8x
___
}
@@ -2117,14 +2268,23 @@ $code.=<<___;
.type bn_mulx4x_mont_gather5,\@function,6
.align 32
bn_mulx4x_mont_gather5:
-.Lmulx4x_enter:
+.cfi_startproc
mov %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lmulx4x_enter:
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
+.Lmulx4x_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
@@ -2141,23 +2301,40 @@ bn_mulx4x_mont_gather5:
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmulx4xsp_alt
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lmulx4xsp_done
.Lmulx4xsp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
-.Lmulx4xsp_done:
- and \$-64,%rsp # ensure alignment
+ sub %r11,%rbp
+.Lmulx4xsp_done:
+ and \$-64,%rbp # ensure alignment
+ mov %rsp,%r11
+ sub %rbp,%r11
+ and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmulx4x_page_walk
+ jmp .Lmulx4x_page_walk_done
+
+.Lmulx4x_page_walk:
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+
##############################################################
# Stack layout
# +0 -num
@@ -2172,21 +2349,31 @@ bn_mulx4x_mont_gather5:
#
mov $n0, 32(%rsp) # save *n0
mov %rax,40(%rsp) # save original %rsp
+.cfi_cfa_expression %rsp+40,deref,+8
.Lmulx4x_body:
call mulx4x_internal
mov 40(%rsp),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
mov \$1,%rax
mov -48(%rsi),%r15
+.cfi_restore %r15
mov -40(%rsi),%r14
+.cfi_restore %r14
mov -32(%rsi),%r13
+.cfi_restore %r13
mov -24(%rsi),%r12
+.cfi_restore %r12
mov -16(%rsi),%rbp
+.cfi_restore %rbp
mov -8(%rsi),%rbx
+.cfi_restore %rbx
lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lmulx4x_epilogue:
ret
+.cfi_endproc
.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
.type mulx4x_internal,\@abi-omnipotent
@@ -2564,14 +2751,23 @@ $code.=<<___;
.type bn_powerx5,\@function,6
.align 32
bn_powerx5:
-.Lpowerx5_enter:
+.cfi_startproc
mov %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lpowerx5_enter:
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
+.Lpowerx5_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
@@ -2586,25 +2782,42 @@ bn_powerx5:
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwrx_sp_alt
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lpwrx_sp_done
.align 32
.Lpwrx_sp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lpwrx_sp_done:
- and \$-64,%rsp
- mov $num,%r10
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
+ and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lpwrx_page_walk
+ jmp .Lpwrx_page_walk_done
+
+.Lpwrx_page_walk:
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lpwrx_page_walk
+.Lpwrx_page_walk_done:
+
+ mov $num,%r10
neg $num
##############################################################
@@ -2625,6 +2838,7 @@ bn_powerx5:
movq $bptr,%xmm4
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
+.cfi_cfa_expression %rsp+40,deref,+8
.Lpowerx5_body:
call __bn_sqrx8x_internal
@@ -2647,17 +2861,26 @@ bn_powerx5:
call mulx4x_internal
mov 40(%rsp),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
mov \$1,%rax
mov -48(%rsi),%r15
+.cfi_restore %r15
mov -40(%rsi),%r14
+.cfi_restore %r14
mov -32(%rsi),%r13
+.cfi_restore %r13
mov -24(%rsi),%r12
+.cfi_restore %r12
mov -16(%rsi),%rbp
+.cfi_restore %rbp
mov -8(%rsi),%rbx
+.cfi_restore %rbx
lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lpowerx5_epilogue:
ret
+.cfi_endproc
.size bn_powerx5,.-bn_powerx5
.globl bn_sqrx8x_internal
@@ -3513,9 +3736,14 @@ mul_handler:
cmp %r10,%rbx # context->Rip<end of prologue label
jb .Lcommon_seh_tail
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # beginning of body label
+ cmp %r10,%rbx # context->Rip<body label
+ jb .Lcommon_pop_regs
+
mov 152($context),%rax # pull context->Rsp
- mov 4(%r11),%r10d # HandlerData[1]
+ mov 8(%r11),%r10d # HandlerData[2]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
@@ -3527,11 +3755,11 @@ mul_handler:
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
- jmp .Lbody_proceed
+ jmp .Lcommon_pop_regs
.Lbody_40:
mov 40(%rax),%rax # pull saved stack pointer
-.Lbody_proceed:
+.Lcommon_pop_regs:
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
@@ -3622,34 +3850,34 @@ $code.=<<___;
.LSEH_info_bn_mul_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
+ .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_mul4x_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+ .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_power5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lpower5_body,.Lpower5_epilogue # HandlerData[]
+ .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_from_mont8x:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[]
+ .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[]
___
$code.=<<___ if ($addx);
.align 8
.LSEH_info_bn_mulx4x_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+ .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_powerx5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
+ .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
___
$code.=<<___;
.align 8
diff --git a/src/crypto/chacha/CMakeLists.txt b/src/crypto/chacha/CMakeLists.txt
index 39d1defb..63de0611 100644
--- a/src/crypto/chacha/CMakeLists.txt
+++ b/src/crypto/chacha/CMakeLists.txt
@@ -42,17 +42,7 @@ add_library(
${CHACHA_ARCH_SOURCES}
)
-add_executable(
- chacha_test
-
- chacha_test.cc
- $<TARGET_OBJECTS:test_support>
-)
-
-target_link_libraries(chacha_test crypto)
-add_dependencies(all_tests chacha_test)
-
perlasm(chacha-armv4.${ASM_EXT} asm/chacha-armv4.pl)
perlasm(chacha-armv8.${ASM_EXT} asm/chacha-armv8.pl)
perlasm(chacha-x86.${ASM_EXT} asm/chacha-x86.pl)
-perlasm(chacha-x86_64.${ASM_EXT} asm/chacha-x86_64.pl) \ No newline at end of file
+perlasm(chacha-x86_64.${ASM_EXT} asm/chacha-x86_64.pl)
diff --git a/src/crypto/chacha/asm/chacha-armv4.pl b/src/crypto/chacha/asm/chacha-armv4.pl
index 395b8154..13698e3a 100755
--- a/src/crypto/chacha/asm/chacha-armv4.pl
+++ b/src/crypto/chacha/asm/chacha-armv4.pl
@@ -8,7 +8,7 @@
# ====================================================================
#
# December 2014
-#
+#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of large buffer.
@@ -713,7 +713,7 @@ ChaCha20_neon:
vadd.i32 $d2,$d1,$t0 @ counter+2
str @t[3], [sp,#4*(16+15)]
mov @t[3],#10
- add @x[12],@x[12],#3 @ counter+3
+ add @x[12],@x[12],#3 @ counter+3
b .Loop_neon
.align 4
@@ -1127,7 +1127,7 @@ $code.=<<___;
ldrb @t[1],[r12],#1 @ read input
subs @t[3],@t[3],#1
eor @t[0],@t[0],@t[1]
- strb @t[0],[r14],#1 @ store ouput
+ strb @t[0],[r14],#1 @ store output
bne .Loop_tail_neon
.Ldone_neon:
diff --git a/src/crypto/chacha/asm/chacha-armv8.pl b/src/crypto/chacha/asm/chacha-armv8.pl
index 215d9657..c2d04298 100755
--- a/src/crypto/chacha/asm/chacha-armv8.pl
+++ b/src/crypto/chacha/asm/chacha-armv8.pl
@@ -8,7 +8,7 @@
# ====================================================================
#
# June 2015
-#
+#
# ChaCha20 for ARMv8.
#
# Performance in cycles per byte out of large buffer.
@@ -193,7 +193,7 @@ ChaCha20_ctr32:
mov $ctr,#10
subs $len,$len,#64
.Loop:
- sub $ctr,$ctr,#1
+ sub $ctr,$ctr,#1
___
foreach (&ROUND(0, 4, 8,12)) { eval; }
foreach (&ROUND(0, 5,10,15)) { eval; }
diff --git a/src/crypto/chacha/asm/chacha-x86.pl b/src/crypto/chacha/asm/chacha-x86.pl
index 984ce11e..f8bbb76d 100755
--- a/src/crypto/chacha/asm/chacha-x86.pl
+++ b/src/crypto/chacha/asm/chacha-x86.pl
@@ -21,7 +21,9 @@
# Westmere 9.50/+45% 3.35
# Sandy Bridge 10.5/+47% 3.20
# Haswell 8.15/+50% 2.83
+# Skylake 7.53/+22% 2.75
# Silvermont 17.4/+36% 8.35
+# Goldmont 13.4/+40% 4.36
# Sledgehammer 10.2/+54%
# Bulldozer 13.4/+50% 4.38(*)
#
@@ -38,10 +40,8 @@ open STDOUT,">$output";
&asm_init($ARGV[0],"chacha-x86.pl",$ARGV[$#ARGV] eq "386");
-$xmm=$ymm=0;
-for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
-
-$ymm=$xmm;
+$xmm=$ymm=1;
+$gasver=999; # enable everything
$a="eax";
($b,$b_)=("ebx","ebp");
@@ -438,6 +438,12 @@ my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous
&label("pic_point"),"eax"));
&movdqu ("xmm3",&QWP(0,"ebx")); # counter and nonce
+if (defined($gasver) && $gasver>=2.17) { # even though we encode
+ # pshufb manually, we
+ # handle only register
+ # operands, while this
+ # segment uses memory
+ # operand...
&cmp ($len,64*4);
&jb (&label("1x"));
@@ -619,6 +625,7 @@ my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous
&paddd ("xmm2",&QWP(16*6,"eax")); # +four
&pand ("xmm3",&QWP(16*7,"eax"));
&por ("xmm3","xmm2"); # counter value
+}
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));
diff --git a/src/crypto/chacha/asm/chacha-x86_64.pl b/src/crypto/chacha/asm/chacha-x86_64.pl
index 55b726d2..5ab6f879 100755
--- a/src/crypto/chacha/asm/chacha-x86_64.pl
+++ b/src/crypto/chacha/asm/chacha-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -11,6 +18,10 @@
#
# ChaCha20 for x86_64.
#
+# December 2016
+#
+# Add AVX512F code path.
+#
# Performance in cycles per byte out of large buffer.
#
# IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 8xAVX2
@@ -21,7 +32,9 @@
# Sandy Bridge 8.31/+42% 5.45/6.76 2.72
# Ivy Bridge 6.71/+46% 5.40/6.49 2.41
# Haswell 5.92/+43% 5.20/6.45 2.42 1.23
+# Skylake 5.87/+39% 4.70/- 2.31 1.19
# Silvermont 12.0/+33% 7.75/7.40 7.03(iii)
+# Goldmont 10.6/+17% 5.10/- 3.28
# Sledgehammer 7.28/+52% -/14.2(ii) -
# Bulldozer 9.66/+28% 9.85/11.1 3.06(iv)
# VIA Nano 10.5/+46% 6.72/8.60 6.05
@@ -82,6 +95,15 @@ $code.=<<___;
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Lsigma:
.asciz "expand 32-byte k"
+.align 64
+.Lzeroz:
+.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+.Lfourz:
+.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+.Lincz:
+.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.Lsixteen:
+.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
@@ -207,6 +229,12 @@ ChaCha20_ctr32:
cmp \$0,$len
je .Lno_data
mov OPENSSL_ia32cap_P+4(%rip),%r10
+___
+$code.=<<___ if ($avx>2);
+ bt \$48,%r10 # check for AVX512F
+ jc .LChaCha20_avx512
+___
+$code.=<<___;
test \$`1<<(41-32)`,%r10d
jnz .LChaCha20_ssse3
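
The new dispatch tests bit 48 of OPENSSL_ia32cap_P, which mirrors CPUID.(EAX=7,ECX=0):EBX bit 16, the AVX512F feature flag. An equivalent standalone check using the GCC/Clang cpuid helper (a sketch of the same test, not how BoringSSL itself populates the capability vector):

#include <cpuid.h>

/* Bit 16 of EBX from CPUID leaf 7, subleaf 0, is AVX512F; the assembly
 * above reads the cached copy of this bit out of OPENSSL_ia32cap_P. */
static int have_avx512f(void) {
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
    return 0;
  }
  return (ebx >> 16) & 1;
}
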
@@ -217,6 +245,7 @@ ChaCha20_ctr32:
push %r14
push %r15
sub \$64+24,%rsp
+.Lctr32_body:
#movdqa .Lsigma(%rip),%xmm0
movdqu ($key),%xmm1
@@ -355,13 +384,14 @@ $code.=<<___;
jnz .Loop_tail
.Ldone:
- add \$64+24,%rsp
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
+ lea 64+24+48(%rsp),%rsi
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lno_data:
ret
.size ChaCha20_ctr32,.-ChaCha20_ctr32
@@ -396,31 +426,26 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
&por ($b,$t);
}
-my $xframe = $win64 ? 32+32+8 : 24;
+my $xframe = $win64 ? 32+8 : 8;
$code.=<<___;
.type ChaCha20_ssse3,\@function,5
.align 32
ChaCha20_ssse3:
.LChaCha20_ssse3:
+ mov %rsp,%r9 # frame pointer
___
$code.=<<___;
cmp \$128,$len # we might throw away some data,
ja .LChaCha20_4x # but overall it won't be slower
.Ldo_sse3_after_all:
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-
sub \$64+$xframe,%rsp
___
$code.=<<___ if ($win64);
- movaps %xmm6,64+32(%rsp)
- movaps %xmm7,64+48(%rsp)
+ movaps %xmm6,-0x28(%r9)
+ movaps %xmm7,-0x18(%r9)
+.Lssse3_body:
___
$code.=<<___;
movdqa .Lsigma(%rip),$a
@@ -434,7 +459,7 @@ $code.=<<___;
movdqa $b,0x10(%rsp)
movdqa $c,0x20(%rsp)
movdqa $d,0x30(%rsp)
- mov \$10,%ebp
+ mov \$10,$counter # reuse $counter
jmp .Loop_ssse3
.align 32
@@ -444,7 +469,7 @@ $code.=<<___;
movdqa 0x10(%rsp),$b
movdqa 0x20(%rsp),$c
paddd 0x30(%rsp),$d
- mov \$10,%ebp
+ mov \$10,$counter
movdqa $d,0x30(%rsp)
jmp .Loop_ssse3
@@ -462,7 +487,7 @@ ___
&pshufd ($b,$b,0b10010011);
&pshufd ($d,$d,0b00111001);
- &dec ("%ebp");
+ &dec ($counter);
&jnz (".Loop_ssse3");
$code.=<<___;
@@ -501,31 +526,26 @@ $code.=<<___;
movdqa $b,0x10(%rsp)
movdqa $c,0x20(%rsp)
movdqa $d,0x30(%rsp)
- xor %rbx,%rbx
+ xor $counter,$counter
.Loop_tail_ssse3:
- movzb ($inp,%rbx),%eax
- movzb (%rsp,%rbx),%ecx
- lea 1(%rbx),%rbx
+ movzb ($inp,$counter),%eax
+ movzb (%rsp,$counter),%ecx
+ lea 1($counter),$counter
xor %ecx,%eax
- mov %al,-1($out,%rbx)
+ mov %al,-1($out,$counter)
dec $len
jnz .Loop_tail_ssse3
.Ldone_ssse3:
___
$code.=<<___ if ($win64);
- movaps 64+32(%rsp),%xmm6
- movaps 64+48(%rsp),%xmm7
+ movaps -0x28(%r9),%xmm6
+ movaps -0x18(%r9),%xmm7
___
$code.=<<___;
- add \$64+$xframe,%rsp
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
+ lea (%r9),%rsp
+.Lssse3_epilogue:
ret
.size ChaCha20_ssse3,.-ChaCha20_ssse3
___
@@ -662,13 +682,14 @@ my @x=map("\"$_\"",@xx);
);
}
-my $xframe = $win64 ? 0xa0 : 0;
+my $xframe = $win64 ? 0xa8 : 8;
$code.=<<___;
.type ChaCha20_4x,\@function,5
.align 32
ChaCha20_4x:
.LChaCha20_4x:
+ mov %rsp,%r9 # frame pointer
mov %r10,%r11
___
$code.=<<___ if ($avx>1);
@@ -685,8 +706,7 @@ $code.=<<___;
je .Ldo_sse3_after_all # to detect Atom
.Lproceed4x:
- lea -0x78(%rsp),%r11
- sub \$0x148+$xframe,%rsp
+ sub \$0x140+$xframe,%rsp
___
################ stack layout
# +0x00 SIMD equivalent of @x[8-12]
@@ -697,16 +717,17 @@ ___
# ...
# +0x140
$code.=<<___ if ($win64);
- movaps %xmm6,-0x30(%r11)
- movaps %xmm7,-0x20(%r11)
- movaps %xmm8,-0x10(%r11)
- movaps %xmm9,0x00(%r11)
- movaps %xmm10,0x10(%r11)
- movaps %xmm11,0x20(%r11)
- movaps %xmm12,0x30(%r11)
- movaps %xmm13,0x40(%r11)
- movaps %xmm14,0x50(%r11)
- movaps %xmm15,0x60(%r11)
+ movaps %xmm6,-0xa8(%r9)
+ movaps %xmm7,-0x98(%r9)
+ movaps %xmm8,-0x88(%r9)
+ movaps %xmm9,-0x78(%r9)
+ movaps %xmm10,-0x68(%r9)
+ movaps %xmm11,-0x58(%r9)
+ movaps %xmm12,-0x48(%r9)
+ movaps %xmm13,-0x38(%r9)
+ movaps %xmm14,-0x28(%r9)
+ movaps %xmm15,-0x18(%r9)
+.L4x_body:
___
$code.=<<___;
movdqa .Lsigma(%rip),$xa3 # key[0]
@@ -1095,20 +1116,20 @@ $code.=<<___;
.Ldone4x:
___
$code.=<<___ if ($win64);
- lea 0x140+0x30(%rsp),%r11
- movaps -0x30(%r11),%xmm6
- movaps -0x20(%r11),%xmm7
- movaps -0x10(%r11),%xmm8
- movaps 0x00(%r11),%xmm9
- movaps 0x10(%r11),%xmm10
- movaps 0x20(%r11),%xmm11
- movaps 0x30(%r11),%xmm12
- movaps 0x40(%r11),%xmm13
- movaps 0x50(%r11),%xmm14
- movaps 0x60(%r11),%xmm15
+ movaps -0xa8(%r9),%xmm6
+ movaps -0x98(%r9),%xmm7
+ movaps -0x88(%r9),%xmm8
+ movaps -0x78(%r9),%xmm9
+ movaps -0x68(%r9),%xmm10
+ movaps -0x58(%r9),%xmm11
+ movaps -0x48(%r9),%xmm12
+ movaps -0x38(%r9),%xmm13
+ movaps -0x28(%r9),%xmm14
+ movaps -0x18(%r9),%xmm15
___
$code.=<<___;
- add \$0x148+$xframe,%rsp
+ lea (%r9),%rsp
+.L4x_epilogue:
ret
.size ChaCha20_4x,.-ChaCha20_4x
___
@@ -1236,33 +1257,32 @@ my @x=map("\"$_\"",@xx);
);
}
-my $xframe = $win64 ? 0xb0 : 8;
+my $xframe = $win64 ? 0xa8 : 8;
$code.=<<___;
.type ChaCha20_8x,\@function,5
.align 32
ChaCha20_8x:
.LChaCha20_8x:
- mov %rsp,%r10
+ mov %rsp,%r9 # frame register
sub \$0x280+$xframe,%rsp
and \$-32,%rsp
___
$code.=<<___ if ($win64);
- lea 0x290+0x30(%rsp),%r11
- movaps %xmm6,-0x30(%r11)
- movaps %xmm7,-0x20(%r11)
- movaps %xmm8,-0x10(%r11)
- movaps %xmm9,0x00(%r11)
- movaps %xmm10,0x10(%r11)
- movaps %xmm11,0x20(%r11)
- movaps %xmm12,0x30(%r11)
- movaps %xmm13,0x40(%r11)
- movaps %xmm14,0x50(%r11)
- movaps %xmm15,0x60(%r11)
+ movaps %xmm6,-0xa8(%r9)
+ movaps %xmm7,-0x98(%r9)
+ movaps %xmm8,-0x88(%r9)
+ movaps %xmm9,-0x78(%r9)
+ movaps %xmm10,-0x68(%r9)
+ movaps %xmm11,-0x58(%r9)
+ movaps %xmm12,-0x48(%r9)
+ movaps %xmm13,-0x38(%r9)
+ movaps %xmm14,-0x28(%r9)
+ movaps %xmm15,-0x18(%r9)
+.L8x_body:
___
$code.=<<___;
vzeroupper
- mov %r10,0x280(%rsp)
################ stack layout
# +0x00 SIMD equivalent of @x[8-12]
@@ -1271,7 +1291,7 @@ $code.=<<___;
# ...
# +0x200 SIMD counters (with nonce smashed by lanes)
# ...
- # +0x280 saved %rsp
+ # +0x280
vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
vbroadcasti128 ($key),$xb3 # key[1]
@@ -1737,29 +1757,989 @@ $code.=<<___;
vzeroall
___
$code.=<<___ if ($win64);
- lea 0x290+0x30(%rsp),%r11
- movaps -0x30(%r11),%xmm6
- movaps -0x20(%r11),%xmm7
- movaps -0x10(%r11),%xmm8
- movaps 0x00(%r11),%xmm9
- movaps 0x10(%r11),%xmm10
- movaps 0x20(%r11),%xmm11
- movaps 0x30(%r11),%xmm12
- movaps 0x40(%r11),%xmm13
- movaps 0x50(%r11),%xmm14
- movaps 0x60(%r11),%xmm15
+ movaps -0xa8(%r9),%xmm6
+ movaps -0x98(%r9),%xmm7
+ movaps -0x88(%r9),%xmm8
+ movaps -0x78(%r9),%xmm9
+ movaps -0x68(%r9),%xmm10
+ movaps -0x58(%r9),%xmm11
+ movaps -0x48(%r9),%xmm12
+ movaps -0x38(%r9),%xmm13
+ movaps -0x28(%r9),%xmm14
+ movaps -0x18(%r9),%xmm15
___
$code.=<<___;
- mov 0x280(%rsp),%rsp
+ lea (%r9),%rsp
+.L8x_epilogue:
ret
.size ChaCha20_8x,.-ChaCha20_8x
___
}
+########################################################################
+# AVX512 code paths
+if ($avx>2) {
+# This one handles shorter inputs...
+
+my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
+my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
+
+sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round
+ &vpaddd ($a,$a,$b);
+ &vpxord ($d,$d,$a);
+ &vprold ($d,$d,16);
+
+ &vpaddd ($c,$c,$d);
+ &vpxord ($b,$b,$c);
+ &vprold ($b,$b,12);
+
+ &vpaddd ($a,$a,$b);
+ &vpxord ($d,$d,$a);
+ &vprold ($d,$d,8);
+
+ &vpaddd ($c,$c,$d);
+ &vpxord ($b,$b,$c);
+ &vprold ($b,$b,7);
+}
+
+my $xframe = $win64 ? 32+8 : 8;
+
+$code.=<<___;
+.type ChaCha20_avx512,\@function,5
+.align 32
+ChaCha20_avx512:
+.LChaCha20_avx512:
+ mov %rsp,%r9 # frame pointer
+ cmp \$512,$len
+ ja .LChaCha20_16x
+
+ sub \$64+$xframe,%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,-0x28(%r9)
+ movaps %xmm7,-0x18(%r9)
+.Lavx512_body:
+___
+$code.=<<___;
+ vbroadcasti32x4 .Lsigma(%rip),$a
+ vbroadcasti32x4 ($key),$b
+ vbroadcasti32x4 16($key),$c
+ vbroadcasti32x4 ($counter),$d
+
+ vmovdqa32 $a,$a_
+ vmovdqa32 $b,$b_
+ vmovdqa32 $c,$c_
+ vpaddd .Lzeroz(%rip),$d,$d
+ vmovdqa32 .Lfourz(%rip),$fourz
+ mov \$10,$counter # reuse $counter
+ vmovdqa32 $d,$d_
+ jmp .Loop_avx512
+
+.align 16
+.Loop_outer_avx512:
+ vmovdqa32 $a_,$a
+ vmovdqa32 $b_,$b
+ vmovdqa32 $c_,$c
+ vpaddd $fourz,$d_,$d
+ mov \$10,$counter
+ vmovdqa32 $d,$d_
+ jmp .Loop_avx512
+
+.align 32
+.Loop_avx512:
+___
+ &AVX512ROUND();
+ &vpshufd ($c,$c,0b01001110);
+ &vpshufd ($b,$b,0b00111001);
+ &vpshufd ($d,$d,0b10010011);
+
+ &AVX512ROUND();
+ &vpshufd ($c,$c,0b01001110);
+ &vpshufd ($b,$b,0b10010011);
+ &vpshufd ($d,$d,0b00111001);
+
+ &dec ($counter);
+ &jnz (".Loop_avx512");
+
+$code.=<<___;
+ vpaddd $a_,$a,$a
+ vpaddd $b_,$b,$b
+ vpaddd $c_,$c,$c
+ vpaddd $d_,$d,$d
+
+ sub \$64,$len
+ jb .Ltail64_avx512
+
+ vpxor 0x00($inp),%x#$a,$t0 # xor with input
+ vpxor 0x10($inp),%x#$b,$t1
+ vpxor 0x20($inp),%x#$c,$t2
+ vpxor 0x30($inp),%x#$d,$t3
+ lea 0x40($inp),$inp # inp+=64
+
+ vmovdqu $t0,0x00($out) # write output
+ vmovdqu $t1,0x10($out)
+ vmovdqu $t2,0x20($out)
+ vmovdqu $t3,0x30($out)
+ lea 0x40($out),$out # out+=64
+
+ jz .Ldone_avx512
+
+ vextracti32x4 \$1,$a,$t0
+ vextracti32x4 \$1,$b,$t1
+ vextracti32x4 \$1,$c,$t2
+ vextracti32x4 \$1,$d,$t3
+
+ sub \$64,$len
+ jb .Ltail_avx512
+
+ vpxor 0x00($inp),$t0,$t0 # xor with input
+ vpxor 0x10($inp),$t1,$t1
+ vpxor 0x20($inp),$t2,$t2
+ vpxor 0x30($inp),$t3,$t3
+ lea 0x40($inp),$inp # inp+=64
+
+ vmovdqu $t0,0x00($out) # write output
+ vmovdqu $t1,0x10($out)
+ vmovdqu $t2,0x20($out)
+ vmovdqu $t3,0x30($out)
+ lea 0x40($out),$out # out+=64
+
+ jz .Ldone_avx512
+
+ vextracti32x4 \$2,$a,$t0
+ vextracti32x4 \$2,$b,$t1
+ vextracti32x4 \$2,$c,$t2
+ vextracti32x4 \$2,$d,$t3
+
+ sub \$64,$len
+ jb .Ltail_avx512
+
+ vpxor 0x00($inp),$t0,$t0 # xor with input
+ vpxor 0x10($inp),$t1,$t1
+ vpxor 0x20($inp),$t2,$t2
+ vpxor 0x30($inp),$t3,$t3
+ lea 0x40($inp),$inp # inp+=64
+
+ vmovdqu $t0,0x00($out) # write output
+ vmovdqu $t1,0x10($out)
+ vmovdqu $t2,0x20($out)
+ vmovdqu $t3,0x30($out)
+ lea 0x40($out),$out # out+=64
+
+ jz .Ldone_avx512
+
+ vextracti32x4 \$3,$a,$t0
+ vextracti32x4 \$3,$b,$t1
+ vextracti32x4 \$3,$c,$t2
+ vextracti32x4 \$3,$d,$t3
+
+ sub \$64,$len
+ jb .Ltail_avx512
+
+ vpxor 0x00($inp),$t0,$t0 # xor with input
+ vpxor 0x10($inp),$t1,$t1
+ vpxor 0x20($inp),$t2,$t2
+ vpxor 0x30($inp),$t3,$t3
+ lea 0x40($inp),$inp # inp+=64
+
+ vmovdqu $t0,0x00($out) # write output
+ vmovdqu $t1,0x10($out)
+ vmovdqu $t2,0x20($out)
+ vmovdqu $t3,0x30($out)
+ lea 0x40($out),$out # out+=64
+
+ jnz .Loop_outer_avx512
+
+ jmp .Ldone_avx512
+
+.align 16
+.Ltail64_avx512:
+ vmovdqa %x#$a,0x00(%rsp)
+ vmovdqa %x#$b,0x10(%rsp)
+ vmovdqa %x#$c,0x20(%rsp)
+ vmovdqa %x#$d,0x30(%rsp)
+ add \$64,$len
+ jmp .Loop_tail_avx512
+
+.align 16
+.Ltail_avx512:
+ vmovdqa $t0,0x00(%rsp)
+ vmovdqa $t1,0x10(%rsp)
+ vmovdqa $t2,0x20(%rsp)
+ vmovdqa $t3,0x30(%rsp)
+ add \$64,$len
+
+.Loop_tail_avx512:
+ movzb ($inp,$counter),%eax
+ movzb (%rsp,$counter),%ecx
+ lea 1($counter),$counter
+ xor %ecx,%eax
+ mov %al,-1($out,$counter)
+ dec $len
+ jnz .Loop_tail_avx512
+
+ vmovdqa32 $a_,0x00(%rsp)
+
+.Ldone_avx512:
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movaps -0x28(%r9),%xmm6
+ movaps -0x18(%r9),%xmm7
+___
+$code.=<<___;
+ lea (%r9),%rsp
+.Lavx512_epilogue:
+ ret
+.size ChaCha20_avx512,.-ChaCha20_avx512
+___
+}
+if ($avx>2) {
+# This one handles longer inputs...
+
+my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
+ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
+my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
+ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
+my @key=map("%zmm$_",(16..31));
+my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
+
+sub AVX512_lane_ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+my @x=map("\"$_\"",@xx);
+
+ (
+ "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
+ "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
+ "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
+ "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
+ "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
+ "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
+ "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
+ "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
+ "&vprold (@x[$d0],@x[$d0],16)",
+ "&vprold (@x[$d1],@x[$d1],16)",
+ "&vprold (@x[$d2],@x[$d2],16)",
+ "&vprold (@x[$d3],@x[$d3],16)",
+
+ "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
+ "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
+ "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
+ "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
+ "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
+ "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
+ "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
+ "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
+ "&vprold (@x[$b0],@x[$b0],12)",
+ "&vprold (@x[$b1],@x[$b1],12)",
+ "&vprold (@x[$b2],@x[$b2],12)",
+ "&vprold (@x[$b3],@x[$b3],12)",
+
+ "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
+ "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
+ "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
+ "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
+ "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
+ "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
+ "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
+ "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
+ "&vprold (@x[$d0],@x[$d0],8)",
+ "&vprold (@x[$d1],@x[$d1],8)",
+ "&vprold (@x[$d2],@x[$d2],8)",
+ "&vprold (@x[$d3],@x[$d3],8)",
+
+ "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
+ "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
+ "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
+ "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
+ "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
+ "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
+ "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
+ "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
+ "&vprold (@x[$b0],@x[$b0],7)",
+ "&vprold (@x[$b1],@x[$b1],7)",
+ "&vprold (@x[$b2],@x[$b2],7)",
+ "&vprold (@x[$b3],@x[$b3],7)"
+ );
+}
+
+my $xframe = $win64 ? 0xa8 : 8;
+
+$code.=<<___;
+.type ChaCha20_16x,\@function,5
+.align 32
+ChaCha20_16x:
+.LChaCha20_16x:
+ mov %rsp,%r9 # frame register
+ sub \$64+$xframe,%rsp
+ and \$-64,%rsp
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,-0xa8(%r9)
+ movaps %xmm7,-0x98(%r9)
+ movaps %xmm8,-0x88(%r9)
+ movaps %xmm9,-0x78(%r9)
+ movaps %xmm10,-0x68(%r9)
+ movaps %xmm11,-0x58(%r9)
+ movaps %xmm12,-0x48(%r9)
+ movaps %xmm13,-0x38(%r9)
+ movaps %xmm14,-0x28(%r9)
+ movaps %xmm15,-0x18(%r9)
+.L16x_body:
+___
+$code.=<<___;
+ vzeroupper
+
+ lea .Lsigma(%rip),%r10
+ vbroadcasti32x4 (%r10),$xa3 # key[0]
+ vbroadcasti32x4 ($key),$xb3 # key[1]
+ vbroadcasti32x4 16($key),$xc3 # key[2]
+ vbroadcasti32x4 ($counter),$xd3 # key[3]
+
+ vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
+ vpshufd \$0x55,$xa3,$xa1
+ vpshufd \$0xaa,$xa3,$xa2
+ vpshufd \$0xff,$xa3,$xa3
+ vmovdqa64 $xa0,@key[0]
+ vmovdqa64 $xa1,@key[1]
+ vmovdqa64 $xa2,@key[2]
+ vmovdqa64 $xa3,@key[3]
+
+ vpshufd \$0x00,$xb3,$xb0
+ vpshufd \$0x55,$xb3,$xb1
+ vpshufd \$0xaa,$xb3,$xb2
+ vpshufd \$0xff,$xb3,$xb3
+ vmovdqa64 $xb0,@key[4]
+ vmovdqa64 $xb1,@key[5]
+ vmovdqa64 $xb2,@key[6]
+ vmovdqa64 $xb3,@key[7]
+
+ vpshufd \$0x00,$xc3,$xc0
+ vpshufd \$0x55,$xc3,$xc1
+ vpshufd \$0xaa,$xc3,$xc2
+ vpshufd \$0xff,$xc3,$xc3
+ vmovdqa64 $xc0,@key[8]
+ vmovdqa64 $xc1,@key[9]
+ vmovdqa64 $xc2,@key[10]
+ vmovdqa64 $xc3,@key[11]
+
+ vpshufd \$0x00,$xd3,$xd0
+ vpshufd \$0x55,$xd3,$xd1
+ vpshufd \$0xaa,$xd3,$xd2
+ vpshufd \$0xff,$xd3,$xd3
+ vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet
+ vmovdqa64 $xd0,@key[12]
+ vmovdqa64 $xd1,@key[13]
+ vmovdqa64 $xd2,@key[14]
+ vmovdqa64 $xd3,@key[15]
+
+ mov \$10,%eax
+ jmp .Loop16x
+
+.align 32
+.Loop_outer16x:
+ vpbroadcastd 0(%r10),$xa0 # reload key
+ vpbroadcastd 4(%r10),$xa1
+ vpbroadcastd 8(%r10),$xa2
+ vpbroadcastd 12(%r10),$xa3
+ vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters
+ vmovdqa64 @key[4],$xb0
+ vmovdqa64 @key[5],$xb1
+ vmovdqa64 @key[6],$xb2
+ vmovdqa64 @key[7],$xb3
+ vmovdqa64 @key[8],$xc0
+ vmovdqa64 @key[9],$xc1
+ vmovdqa64 @key[10],$xc2
+ vmovdqa64 @key[11],$xc3
+ vmovdqa64 @key[12],$xd0
+ vmovdqa64 @key[13],$xd1
+ vmovdqa64 @key[14],$xd2
+ vmovdqa64 @key[15],$xd3
+
+ vmovdqa64 $xa0,@key[0]
+ vmovdqa64 $xa1,@key[1]
+ vmovdqa64 $xa2,@key[2]
+ vmovdqa64 $xa3,@key[3]
+
+ mov \$10,%eax
+ jmp .Loop16x
+
+.align 32
+.Loop16x:
+___
+ foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
+ foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
+$code.=<<___;
+ dec %eax
+ jnz .Loop16x
+
+ vpaddd @key[0],$xa0,$xa0 # accumulate key
+ vpaddd @key[1],$xa1,$xa1
+ vpaddd @key[2],$xa2,$xa2
+ vpaddd @key[3],$xa3,$xa3
+
+ vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
+ vpunpckldq $xa3,$xa2,$xt3
+ vpunpckhdq $xa1,$xa0,$xa0
+ vpunpckhdq $xa3,$xa2,$xa2
+ vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
+ vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
+ vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
+ vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
+___
+ ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
+$code.=<<___;
+ vpaddd @key[4],$xb0,$xb0
+ vpaddd @key[5],$xb1,$xb1
+ vpaddd @key[6],$xb2,$xb2
+ vpaddd @key[7],$xb3,$xb3
+
+ vpunpckldq $xb1,$xb0,$xt2
+ vpunpckldq $xb3,$xb2,$xt3
+ vpunpckhdq $xb1,$xb0,$xb0
+ vpunpckhdq $xb3,$xb2,$xb2
+ vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
+ vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
+ vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
+ vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
+___
+ ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
+$code.=<<___;
+ vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further
+ vshufi32x4 \$0xee,$xb0,$xa0,$xb0
+ vshufi32x4 \$0x44,$xb1,$xa1,$xa0
+ vshufi32x4 \$0xee,$xb1,$xa1,$xb1
+ vshufi32x4 \$0x44,$xb2,$xa2,$xa1
+ vshufi32x4 \$0xee,$xb2,$xa2,$xb2
+ vshufi32x4 \$0x44,$xb3,$xa3,$xa2
+ vshufi32x4 \$0xee,$xb3,$xa3,$xb3
+___
+ ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
+$code.=<<___;
+ vpaddd @key[8],$xc0,$xc0
+ vpaddd @key[9],$xc1,$xc1
+ vpaddd @key[10],$xc2,$xc2
+ vpaddd @key[11],$xc3,$xc3
+
+ vpunpckldq $xc1,$xc0,$xt2
+ vpunpckldq $xc3,$xc2,$xt3
+ vpunpckhdq $xc1,$xc0,$xc0
+ vpunpckhdq $xc3,$xc2,$xc2
+ vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
+ vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
+ vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
+ vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
+___
+ ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
+$code.=<<___;
+ vpaddd @key[12],$xd0,$xd0
+ vpaddd @key[13],$xd1,$xd1
+ vpaddd @key[14],$xd2,$xd2
+ vpaddd @key[15],$xd3,$xd3
+
+ vpunpckldq $xd1,$xd0,$xt2
+ vpunpckldq $xd3,$xd2,$xt3
+ vpunpckhdq $xd1,$xd0,$xd0
+ vpunpckhdq $xd3,$xd2,$xd2
+ vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
+ vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
+ vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
+ vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
+___
+ ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
+$code.=<<___;
+ vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further
+ vshufi32x4 \$0xee,$xd0,$xc0,$xd0
+ vshufi32x4 \$0x44,$xd1,$xc1,$xc0
+ vshufi32x4 \$0xee,$xd1,$xc1,$xd1
+ vshufi32x4 \$0x44,$xd2,$xc2,$xc1
+ vshufi32x4 \$0xee,$xd2,$xc2,$xd2
+ vshufi32x4 \$0x44,$xd3,$xc3,$xc2
+ vshufi32x4 \$0xee,$xd3,$xc3,$xd3
+___
+ ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
+$code.=<<___;
+ vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further
+ vshufi32x4 \$0xdd,$xc0,$xa0,$xa0
+ vshufi32x4 \$0x88,$xd0,$xb0,$xc0
+ vshufi32x4 \$0xdd,$xd0,$xb0,$xd0
+ vshufi32x4 \$0x88,$xc1,$xa1,$xt1
+ vshufi32x4 \$0xdd,$xc1,$xa1,$xa1
+ vshufi32x4 \$0x88,$xd1,$xb1,$xc1
+ vshufi32x4 \$0xdd,$xd1,$xb1,$xd1
+ vshufi32x4 \$0x88,$xc2,$xa2,$xt2
+ vshufi32x4 \$0xdd,$xc2,$xa2,$xa2
+ vshufi32x4 \$0x88,$xd2,$xb2,$xc2
+ vshufi32x4 \$0xdd,$xd2,$xb2,$xd2
+ vshufi32x4 \$0x88,$xc3,$xa3,$xt3
+ vshufi32x4 \$0xdd,$xc3,$xa3,$xa3
+ vshufi32x4 \$0x88,$xd3,$xb3,$xc3
+ vshufi32x4 \$0xdd,$xd3,$xb3,$xd3
+___
+ ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
+ ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
+
+ ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
+ $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
+ ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
+ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
+$code.=<<___;
+ cmp \$64*16,$len
+ jb .Ltail16x
+
+ vpxord 0x00($inp),$xa0,$xa0 # xor with input
+ vpxord 0x40($inp),$xb0,$xb0
+ vpxord 0x80($inp),$xc0,$xc0
+ vpxord 0xc0($inp),$xd0,$xd0
+ vmovdqu32 $xa0,0x00($out)
+ vmovdqu32 $xb0,0x40($out)
+ vmovdqu32 $xc0,0x80($out)
+ vmovdqu32 $xd0,0xc0($out)
+
+ vpxord 0x100($inp),$xa1,$xa1
+ vpxord 0x140($inp),$xb1,$xb1
+ vpxord 0x180($inp),$xc1,$xc1
+ vpxord 0x1c0($inp),$xd1,$xd1
+ vmovdqu32 $xa1,0x100($out)
+ vmovdqu32 $xb1,0x140($out)
+ vmovdqu32 $xc1,0x180($out)
+ vmovdqu32 $xd1,0x1c0($out)
+
+ vpxord 0x200($inp),$xa2,$xa2
+ vpxord 0x240($inp),$xb2,$xb2
+ vpxord 0x280($inp),$xc2,$xc2
+ vpxord 0x2c0($inp),$xd2,$xd2
+ vmovdqu32 $xa2,0x200($out)
+ vmovdqu32 $xb2,0x240($out)
+ vmovdqu32 $xc2,0x280($out)
+ vmovdqu32 $xd2,0x2c0($out)
+
+ vpxord 0x300($inp),$xa3,$xa3
+ vpxord 0x340($inp),$xb3,$xb3
+ vpxord 0x380($inp),$xc3,$xc3
+ vpxord 0x3c0($inp),$xd3,$xd3
+ lea 0x400($inp),$inp
+ vmovdqu32 $xa3,0x300($out)
+ vmovdqu32 $xb3,0x340($out)
+ vmovdqu32 $xc3,0x380($out)
+ vmovdqu32 $xd3,0x3c0($out)
+ lea 0x400($out),$out
+
+ sub \$64*16,$len
+ jnz .Loop_outer16x
+
+ jmp .Ldone16x
+
+.align 32
+.Ltail16x:
+ xor %r10,%r10
+ sub $inp,$out
+ cmp \$64*1,$len
+ jb .Less_than_64_16x
+ vpxord ($inp),$xa0,$xa0 # xor with input
+ vmovdqu32 $xa0,($out,$inp)
+ je .Ldone16x
+ vmovdqa32 $xb0,$xa0
+ lea 64($inp),$inp
+
+ cmp \$64*2,$len
+ jb .Less_than_64_16x
+ vpxord ($inp),$xb0,$xb0
+ vmovdqu32 $xb0,($out,$inp)
+ je .Ldone16x
+ vmovdqa32 $xc0,$xa0
+ lea 64($inp),$inp
+
+ cmp \$64*3,$len
+ jb .Less_than_64_16x
+ vpxord ($inp),$xc0,$xc0
+ vmovdqu32 $xc0,($out,$inp)
+ je .Ldone16x
+ vmovdqa32 $xd0,$xa0
+ lea 64($inp),$inp
+
+ cmp \$64*4,$len
+ jb .Less_than_64_16x
+ vpxord ($inp),$xd0,$xd0
+ vmovdqu32 $xd0,($out,$inp)
+ je .Ldone16x
+ vmovdqa32 $xa1,$xa0
+ lea 64($inp),$inp
+
+ cmp \$64*5,$len
+ jb .Less_than_64_16x
+ vpxord ($inp),$xa1,$xa1
+ vmovdqu32 $xa1,($out,$inp)
+ je .Ldone16x
+ vmovdqa32 $xb1,$xa0
+ lea 64($inp),$inp
+
+ cmp \$64*6,$len
+ jb .Less_than_64_16x
+ vpxord ($inp),$xb1,$xb1
+ vmovdqu32 $xb1,($out,$inp)
+ je .Ldone16x
+ vmovdqa32 $xc1,$xa0
+ lea 64($inp),$inp
+
+ cmp \$64*7,$len
+ jb .Less_than_64_16x
+ vpxord ($inp),$xc1,$xc1
+ vmovdqu32 $xc1,($out,$inp)
+ je .Ldone16x
+ vmovdqa32 $xd1,$xa0
+ lea 64($inp),$inp
+
+ cmp \$64*8,$len
+ jb .Less_than_64_16x
+ vpxord ($inp),$xd1,$xd1
+ vmovdqu32 $xd1,($out,$inp)
+ je .Ldone16x
+ vmovdqa32 $xa2,$xa0
+ lea 64($inp),$inp
+
+ cmp \$64*9,$len
+ jb .Less_than_64_16x
+ vpxord ($inp),$xa2,$xa2
+ vmovdqu32 $xa2,($out,$inp)
+ je .Ldone16x
+ vmovdqa32 $xb2,$xa0
+ lea 64($inp),$inp
+
+ cmp \$64*10,$len
+ jb .Less_than_64_16x
+ vpxord ($inp),$xb2,$xb2
+ vmovdqu32 $xb2,($out,$inp)
+ je .Ldone16x
+ vmovdqa32 $xc2,$xa0
+ lea 64($inp),$inp
+
+ cmp \$64*11,$len
+ jb .Less_than_64_16x
+ vpxord ($inp),$xc2,$xc2
+ vmovdqu32 $xc2,($out,$inp)
+ je .Ldone16x
+ vmovdqa32 $xd2,$xa0
+ lea 64($inp),$inp
+
+ cmp \$64*12,$len
+ jb .Less_than_64_16x
+ vpxord ($inp),$xd2,$xd2
+ vmovdqu32 $xd2,($out,$inp)
+ je .Ldone16x
+ vmovdqa32 $xa3,$xa0
+ lea 64($inp),$inp
+
+ cmp \$64*13,$len
+ jb .Less_than_64_16x
+ vpxord ($inp),$xa3,$xa3
+ vmovdqu32 $xa3,($out,$inp)
+ je .Ldone16x
+ vmovdqa32 $xb3,$xa0
+ lea 64($inp),$inp
+
+ cmp \$64*14,$len
+ jb .Less_than_64_16x
+ vpxord ($inp),$xb3,$xb3
+ vmovdqu32 $xb3,($out,$inp)
+ je .Ldone16x
+ vmovdqa32 $xc3,$xa0
+ lea 64($inp),$inp
+
+ cmp \$64*15,$len
+ jb .Less_than_64_16x
+ vpxord ($inp),$xc3,$xc3
+ vmovdqu32 $xc3,($out,$inp)
+ je .Ldone16x
+ vmovdqa32 $xd3,$xa0
+ lea 64($inp),$inp
+
+.Less_than_64_16x:
+ vmovdqa32 $xa0,0x00(%rsp)
+ lea ($out,$inp),$out
+ and \$63,$len
+
+.Loop_tail16x:
+ movzb ($inp,%r10),%eax
+ movzb (%rsp,%r10),%ecx
+ lea 1(%r10),%r10
+ xor %ecx,%eax
+ mov %al,-1($out,%r10)
+ dec $len
+ jnz .Loop_tail16x
+
+ vpxord $xa0,$xa0,$xa0
+ vmovdqa32 $xa0,0(%rsp)
+
+.Ldone16x:
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movaps -0xa8(%r9),%xmm6
+ movaps -0x98(%r9),%xmm7
+ movaps -0x88(%r9),%xmm8
+ movaps -0x78(%r9),%xmm9
+ movaps -0x68(%r9),%xmm10
+ movaps -0x58(%r9),%xmm11
+ movaps -0x48(%r9),%xmm12
+ movaps -0x38(%r9),%xmm13
+ movaps -0x28(%r9),%xmm14
+ movaps -0x18(%r9),%xmm15
+___
+$code.=<<___;
+ lea (%r9),%rsp
+.L16x_epilogue:
+ ret
+.size ChaCha20_16x,.-ChaCha20_16x
+___
+}
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ lea .Lctr32_body(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lctr32_body
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea .Lno_data(%rip),%r10 # epilogue label
+	cmp	%r10,%rbx		# context->Rip>=.Lno_data
+ jae .Lcommon_seh_tail
+
+ lea 64+24+48(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size se_handler,.-se_handler
+
+.type ssse3_handler,\@abi-omnipotent
+.align 16
+ssse3_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 192($context),%rax # pull context->R9
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea -0x28(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$4,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+ jmp .Lcommon_seh_tail
+.size ssse3_handler,.-ssse3_handler
+
+.type full_handler,\@abi-omnipotent
+.align 16
+full_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 192($context),%rax # pull context->R9
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea -0xa8(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+ jmp .Lcommon_seh_tail
+.size full_handler,.-full_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_ChaCha20_ctr32
+ .rva .LSEH_end_ChaCha20_ctr32
+ .rva .LSEH_info_ChaCha20_ctr32
+
+ .rva .LSEH_begin_ChaCha20_ssse3
+ .rva .LSEH_end_ChaCha20_ssse3
+ .rva .LSEH_info_ChaCha20_ssse3
+
+ .rva .LSEH_begin_ChaCha20_4x
+ .rva .LSEH_end_ChaCha20_4x
+ .rva .LSEH_info_ChaCha20_4x
+___
+$code.=<<___ if ($avx>1);
+ .rva .LSEH_begin_ChaCha20_8x
+ .rva .LSEH_end_ChaCha20_8x
+ .rva .LSEH_info_ChaCha20_8x
+___
+$code.=<<___ if ($avx>2);
+ .rva .LSEH_begin_ChaCha20_avx512
+ .rva .LSEH_end_ChaCha20_avx512
+ .rva .LSEH_info_ChaCha20_avx512
+
+ .rva .LSEH_begin_ChaCha20_16x
+ .rva .LSEH_end_ChaCha20_16x
+ .rva .LSEH_info_ChaCha20_16x
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_ChaCha20_ctr32:
+ .byte 9,0,0,0
+ .rva se_handler
+
+.LSEH_info_ChaCha20_ssse3:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lssse3_body,.Lssse3_epilogue
+
+.LSEH_info_ChaCha20_4x:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .L4x_body,.L4x_epilogue
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_ChaCha20_8x:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .L8x_body,.L8x_epilogue # HandlerData[]
+___
+$code.=<<___ if ($avx>2);
+.LSEH_info_ChaCha20_avx512:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[]
+
+.LSEH_info_ChaCha20_16x:
+ .byte 9,0,0,0
+ .rva full_handler
+ .rva .L16x_body,.L16x_epilogue # HandlerData[]
+___
+}
+
foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/geo;
+ s/\`([^\`]*)\`/eval $1/ge;
- s/%x#%y/%x/go;
+ s/%x#%[yz]/%x/g; # "down-shift"
print $_,"\n";
}
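
The two substitutions in this final loop are the last stage of code
generation: backtick-quoted Perl embedded in the assembly template is
expanded by eval, and the widened pattern %x#%[yz] (previously only
%x#%y) lets AVX2/AVX512 code name the low 128-bit lane of a wider
register. A register written as %x#%ymm7 or %x#%zmm7 in the template
comes out as %xmm7, for example:

    vpxor %x#%zmm1,%x#%zmm2,%x#%zmm3   ->   vpxor %xmm1,%xmm2,%xmm3

which is the "down-shift" the new comment refers to.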
diff --git a/src/crypto/chacha/chacha_test.cc b/src/crypto/chacha/chacha_test.cc
index 6bfb03eb..a40653fa 100644
--- a/src/crypto/chacha/chacha_test.cc
+++ b/src/crypto/chacha/chacha_test.cc
@@ -18,10 +18,13 @@
#include <memory>
+#include <gtest/gtest.h>
+
#include <openssl/crypto.h>
#include <openssl/chacha.h>
#include "../internal.h"
+#include "../test/test_util.h"
static const uint8_t kKey[32] = {
@@ -216,35 +219,18 @@ static const uint8_t kOutput[] = {
static_assert(sizeof(kInput) == sizeof(kOutput),
"Input and output lengths don't match.");
-static bool TestChaCha20(size_t len) {
- std::unique_ptr<uint8_t[]> buf(new uint8_t[len]);
- CRYPTO_chacha_20(buf.get(), kInput, len, kKey, kNonce, kCounter);
- if (OPENSSL_memcmp(buf.get(), kOutput, len) != 0) {
- fprintf(stderr, "Mismatch at length %zu.\n", len);
- return false;
- }
-
- // Test in-place.
- OPENSSL_memcpy(buf.get(), kInput, len);
- CRYPTO_chacha_20(buf.get(), buf.get(), len, kKey, kNonce, kCounter);
- if (OPENSSL_memcmp(buf.get(), kOutput, len) != 0) {
- fprintf(stderr, "Mismatch at length %zu, in-place.\n", len);
- return false;
- }
-
- return true;
-}
-
-int main(int argc, char **argv) {
- CRYPTO_library_init();
-
+TEST(ChaChaTest, TestVector) {
// Run the test with the test vector at all lengths.
for (size_t len = 0; len <= sizeof(kInput); len++) {
- if (!TestChaCha20(len)) {
- return 1;
- }
- }
+ SCOPED_TRACE(len);
- printf("PASS\n");
- return 0;
+ std::unique_ptr<uint8_t[]> buf(new uint8_t[len]);
+ CRYPTO_chacha_20(buf.get(), kInput, len, kKey, kNonce, kCounter);
+ EXPECT_EQ(Bytes(kOutput, len), Bytes(buf.get(), len));
+
+ // Test the in-place version.
+ OPENSSL_memcpy(buf.get(), kInput, len);
+ CRYPTO_chacha_20(buf.get(), buf.get(), len, kKey, kNonce, kCounter);
+ EXPECT_EQ(Bytes(kOutput, len), Bytes(buf.get(), len));
+ }
}
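
The converted test relies on ChaCha20 encryption and decryption being
the same keystream XOR, so CRYPTO_chacha_20 applied twice with the same
key, nonce and counter is the identity. A minimal stand-alone sketch of
that property, assuming only the <openssl/chacha.h> prototype the test
above exercises:

    #include <stdint.h>
    #include <string.h>

    #include <openssl/chacha.h>

    int main(void) {
      static const uint8_t key[32] = {0};    // demo key, all zeros
      static const uint8_t nonce[12] = {0};  // demo nonce, all zeros
      uint8_t msg[14], ct[14], pt[14];
      memcpy(msg, "attack at dawn", sizeof(msg));

      // Encrypt, then decrypt by re-applying the same keystream.
      CRYPTO_chacha_20(ct, msg, sizeof(msg), key, nonce, /*counter=*/0);
      CRYPTO_chacha_20(pt, ct, sizeof(ct), key, nonce, /*counter=*/0);
      return memcmp(pt, msg, sizeof(msg)) != 0;  // exits 0 on success
    }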
diff --git a/src/crypto/cipher/cipher.c b/src/crypto/cipher/cipher.c
index ae045aef..e46e43ef 100644
--- a/src/crypto/cipher/cipher.c
+++ b/src/crypto/cipher/cipher.c
@@ -132,6 +132,7 @@ int EVP_CIPHER_CTX_copy(EVP_CIPHER_CTX *out, const EVP_CIPHER_CTX *in) {
if (in->cipher_data && in->cipher->ctx_size) {
out->cipher_data = OPENSSL_malloc(in->cipher->ctx_size);
if (!out->cipher_data) {
+ out->cipher = NULL;
OPENSSL_PUT_ERROR(CIPHER, ERR_R_MALLOC_FAILURE);
return 0;
}
@@ -139,7 +140,10 @@ int EVP_CIPHER_CTX_copy(EVP_CIPHER_CTX *out, const EVP_CIPHER_CTX *in) {
}
if (in->cipher->flags & EVP_CIPH_CUSTOM_COPY) {
- return in->cipher->ctrl((EVP_CIPHER_CTX *)in, EVP_CTRL_COPY, 0, out);
+ if (!in->cipher->ctrl((EVP_CIPHER_CTX *)in, EVP_CTRL_COPY, 0, out)) {
+ out->cipher = NULL;
+ return 0;
+ }
}
return 1;
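
Both new out->cipher = NULL assignments serve the same purpose: a
caller whose EVP_CIPHER_CTX_copy failed will still clean up the
destination context, and with the cipher pointer cleared that cleanup
can no longer invoke the cipher's cleanup hook on cipher_data the copy
never finished initializing. The calling pattern being protected, as a
minimal sketch against the public EVP API:

    #include <openssl/cipher.h>

    // Returns 1 on success. On failure |dst| is still safe to clean
    // up, because the failed copy left dst->cipher cleared.
    static int clone_cipher_ctx(EVP_CIPHER_CTX *dst,
                                const EVP_CIPHER_CTX *src) {
      EVP_CIPHER_CTX_init(dst);
      if (!EVP_CIPHER_CTX_copy(dst, src)) {
        EVP_CIPHER_CTX_cleanup(dst);  // cipher cleanup hook is skipped
        return 0;
      }
      return 1;
    }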
diff --git a/src/crypto/ec/CMakeLists.txt b/src/crypto/ec/CMakeLists.txt
index a54075c3..75dccec8 100644
--- a/src/crypto/ec/CMakeLists.txt
+++ b/src/crypto/ec/CMakeLists.txt
@@ -39,14 +39,6 @@ add_executable(
)
add_executable(
- ec_test
-
- ec_test.cc
-
- $<TARGET_OBJECTS:test_support>
-)
-
-add_executable(
p256-x86_64_test
p256-x86_64_test.cc
@@ -55,6 +47,5 @@ add_executable(
)
target_link_libraries(example_mul crypto)
-target_link_libraries(ec_test crypto)
target_link_libraries(p256-x86_64_test crypto)
-add_dependencies(all_tests example_mul ec_test p256-x86_64_test)
+add_dependencies(all_tests example_mul p256-x86_64_test)
diff --git a/src/crypto/ec/asm/p256-x86_64-asm.pl b/src/crypto/ec/asm/p256-x86_64-asm.pl
index 3cd7b01f..517c506d 100755
--- a/src/crypto/ec/asm/p256-x86_64-asm.pl
+++ b/src/crypto/ec/asm/p256-x86_64-asm.pl
@@ -289,7 +289,7 @@ __ecp_nistz256_mul_montq:
adc \$0, $acc0
########################################################################
- # Second reduction step
+ # Second reduction step
mov $acc1, $t1
shl \$32, $acc1
mulq $poly3
@@ -336,7 +336,7 @@ __ecp_nistz256_mul_montq:
adc \$0, $acc1
########################################################################
- # Third reduction step
+ # Third reduction step
mov $acc2, $t1
shl \$32, $acc2
mulq $poly3
@@ -383,7 +383,7 @@ __ecp_nistz256_mul_montq:
adc \$0, $acc2
########################################################################
- # Final reduction step
+ # Final reduction step
mov $acc3, $t1
shl \$32, $acc3
mulq $poly3
@@ -396,7 +396,7 @@ __ecp_nistz256_mul_montq:
mov $acc5, $t1
adc \$0, $acc2
- ########################################################################
+ ########################################################################
# Branch-less conditional subtraction of P
sub \$-1, $acc4 # .Lpoly[0]
mov $acc0, $t2
@@ -1649,7 +1649,7 @@ $code.=<<___;
movq %xmm1, $r_ptr
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
___
-{
+{
######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
# operate in 4-5-6-7 "name space" that matches squaring output
#
@@ -1738,7 +1738,7 @@ $code.=<<___;
lea $M(%rsp), $b_ptr
mov $acc4, $acc6 # harmonize sub output and mul input
xor %ecx, %ecx
- mov $acc4, $S+8*0(%rsp) # have to save:-(
+ mov $acc4, $S+8*0(%rsp) # have to save:-(
mov $acc5, $acc2
mov $acc5, $S+8*1(%rsp)
cmovz $acc0, $acc3
diff --git a/src/crypto/ec/ec_test.cc b/src/crypto/ec/ec_test.cc
index 31619b1e..02b9ef20 100644
--- a/src/crypto/ec/ec_test.cc
+++ b/src/crypto/ec/ec_test.cc
@@ -17,6 +17,8 @@
#include <vector>
+#include <gtest/gtest.h>
+
#include <openssl/bn.h>
#include <openssl/bytestring.h>
#include <openssl/crypto.h>
@@ -24,6 +26,9 @@
#include <openssl/err.h>
#include <openssl/mem.h>
#include <openssl/nid.h>
+#include <openssl/obj.h>
+
+#include "../test/test_util.h"
// kECKeyWithoutPublic is an ECPrivateKey with the optional publicKey field
@@ -123,201 +128,75 @@ static bool EncodeECPrivateKey(std::vector<uint8_t> *out, const EC_KEY *key) {
return true;
}
-static bool Testd2i_ECPrivateKey() {
- bssl::UniquePtr<EC_KEY> key = DecodeECPrivateKey(kECKeyWithoutPublic,
- sizeof(kECKeyWithoutPublic));
- if (!key) {
- fprintf(stderr, "Failed to parse private key.\n");
- ERR_print_errors_fp(stderr);
- return false;
- }
+TEST(ECTest, Encoding) {
+ bssl::UniquePtr<EC_KEY> key =
+ DecodeECPrivateKey(kECKeyWithoutPublic, sizeof(kECKeyWithoutPublic));
+ ASSERT_TRUE(key);
+ // Test that the encoding round-trips.
std::vector<uint8_t> out;
- if (!EncodeECPrivateKey(&out, key.get())) {
- fprintf(stderr, "Failed to serialize private key.\n");
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- if (std::vector<uint8_t>(kECKeyWithoutPublic,
- kECKeyWithoutPublic + sizeof(kECKeyWithoutPublic)) !=
- out) {
- fprintf(stderr, "Serialisation of key doesn't match original.\n");
- return false;
- }
+ ASSERT_TRUE(EncodeECPrivateKey(&out, key.get()));
+ EXPECT_EQ(Bytes(kECKeyWithoutPublic), Bytes(out.data(), out.size()));
const EC_POINT *pub_key = EC_KEY_get0_public_key(key.get());
- if (pub_key == NULL) {
- fprintf(stderr, "Public key missing.\n");
- return false;
- }
+ ASSERT_TRUE(pub_key) << "Public key missing";
bssl::UniquePtr<BIGNUM> x(BN_new());
bssl::UniquePtr<BIGNUM> y(BN_new());
- if (!x || !y) {
- return false;
- }
- if (!EC_POINT_get_affine_coordinates_GFp(EC_KEY_get0_group(key.get()),
- pub_key, x.get(), y.get(), NULL)) {
- fprintf(stderr, "Failed to get public key in affine coordinates.\n");
- return false;
- }
+ ASSERT_TRUE(x);
+ ASSERT_TRUE(y);
+ ASSERT_TRUE(EC_POINT_get_affine_coordinates_GFp(
+ EC_KEY_get0_group(key.get()), pub_key, x.get(), y.get(), NULL));
bssl::UniquePtr<char> x_hex(BN_bn2hex(x.get()));
bssl::UniquePtr<char> y_hex(BN_bn2hex(y.get()));
- if (!x_hex || !y_hex) {
- return false;
- }
- if (0 != strcmp(
- x_hex.get(),
- "c81561ecf2e54edefe6617db1c7a34a70744ddb261f269b83dacfcd2ade5a681") ||
- 0 != strcmp(
- y_hex.get(),
- "e0e2afa3f9b6abe4c698ef6495f1be49a3196c5056acb3763fe4507eec596e88")) {
- fprintf(stderr, "Incorrect public key: %s %s\n", x_hex.get(), y_hex.get());
- return false;
- }
-
- return true;
+ ASSERT_TRUE(x_hex);
+ ASSERT_TRUE(y_hex);
+
+ EXPECT_STREQ(
+ "c81561ecf2e54edefe6617db1c7a34a70744ddb261f269b83dacfcd2ade5a681",
+ x_hex.get());
+ EXPECT_STREQ(
+ "e0e2afa3f9b6abe4c698ef6495f1be49a3196c5056acb3763fe4507eec596e88",
+ y_hex.get());
}
-static bool TestZeroPadding() {
+TEST(ECTest, ZeroPadding) {
// Check that the correct encoding round-trips.
- bssl::UniquePtr<EC_KEY> key = DecodeECPrivateKey(kECKeyWithZeros,
- sizeof(kECKeyWithZeros));
+ bssl::UniquePtr<EC_KEY> key =
+ DecodeECPrivateKey(kECKeyWithZeros, sizeof(kECKeyWithZeros));
+ ASSERT_TRUE(key);
std::vector<uint8_t> out;
- if (!key || !EncodeECPrivateKey(&out, key.get())) {
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- if (std::vector<uint8_t>(kECKeyWithZeros,
- kECKeyWithZeros + sizeof(kECKeyWithZeros)) != out) {
- fprintf(stderr, "Serialisation of key was incorrect.\n");
- return false;
- }
+ EXPECT_TRUE(EncodeECPrivateKey(&out, key.get()));
+ EXPECT_EQ(Bytes(kECKeyWithZeros), Bytes(out.data(), out.size()));
// Keys without leading zeros also parse, but they encode correctly.
key = DecodeECPrivateKey(kECKeyMissingZeros, sizeof(kECKeyMissingZeros));
- if (!key || !EncodeECPrivateKey(&out, key.get())) {
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- if (std::vector<uint8_t>(kECKeyWithZeros,
- kECKeyWithZeros + sizeof(kECKeyWithZeros)) != out) {
- fprintf(stderr, "Serialisation of key was incorrect.\n");
- return false;
- }
-
- return true;
+ ASSERT_TRUE(key);
+ EXPECT_TRUE(EncodeECPrivateKey(&out, key.get()));
+ EXPECT_EQ(Bytes(kECKeyWithZeros), Bytes(out.data(), out.size()));
}
-static bool TestSpecifiedCurve() {
+TEST(ECTest, SpecifiedCurve) {
// Test keys with specified curves may be decoded.
bssl::UniquePtr<EC_KEY> key =
DecodeECPrivateKey(kECKeySpecifiedCurve, sizeof(kECKeySpecifiedCurve));
- if (!key) {
- ERR_print_errors_fp(stderr);
- return false;
- }
+ ASSERT_TRUE(key);
// The group should have been interpreted as P-256.
- if (EC_GROUP_get_curve_name(EC_KEY_get0_group(key.get())) !=
- NID_X9_62_prime256v1) {
- fprintf(stderr, "Curve name incorrect.\n");
- return false;
- }
+ EXPECT_EQ(NID_X9_62_prime256v1,
+ EC_GROUP_get_curve_name(EC_KEY_get0_group(key.get())));
// Encoding the key should still use named form.
std::vector<uint8_t> out;
- if (!EncodeECPrivateKey(&out, key.get())) {
- ERR_print_errors_fp(stderr);
- return false;
- }
- if (std::vector<uint8_t>(kECKeyWithoutPublic,
- kECKeyWithoutPublic + sizeof(kECKeyWithoutPublic)) !=
- out) {
- fprintf(stderr, "Serialisation of key was incorrect.\n");
- return false;
- }
-
- return true;
-}
-
-static bool TestSetAffine(const int nid) {
- bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(nid));
- if (!key) {
- return false;
- }
-
- const EC_GROUP *const group = EC_KEY_get0_group(key.get());
-
- if (!EC_KEY_generate_key(key.get())) {
- fprintf(stderr, "EC_KEY_generate_key failed with nid %d\n", nid);
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- if (!EC_POINT_is_on_curve(group, EC_KEY_get0_public_key(key.get()),
- nullptr)) {
- fprintf(stderr, "generated point is not on curve with nid %d", nid);
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- bssl::UniquePtr<BIGNUM> x(BN_new());
- bssl::UniquePtr<BIGNUM> y(BN_new());
- if (!EC_POINT_get_affine_coordinates_GFp(group,
- EC_KEY_get0_public_key(key.get()),
- x.get(), y.get(), nullptr)) {
- fprintf(stderr, "EC_POINT_get_affine_coordinates_GFp failed with nid %d\n",
- nid);
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- auto point = bssl::UniquePtr<EC_POINT>(EC_POINT_new(group));
- if (!point) {
- return false;
- }
-
- if (!EC_POINT_set_affine_coordinates_GFp(group, point.get(), x.get(), y.get(),
- nullptr)) {
- fprintf(stderr, "EC_POINT_set_affine_coordinates_GFp failed with nid %d\n",
- nid);
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- // Subtract one from |y| to make the point no longer on the curve.
- if (!BN_sub(y.get(), y.get(), BN_value_one())) {
- return false;
- }
-
- bssl::UniquePtr<EC_POINT> invalid_point(EC_POINT_new(group));
- if (!invalid_point) {
- return false;
- }
-
- if (EC_POINT_set_affine_coordinates_GFp(group, invalid_point.get(), x.get(),
- y.get(), nullptr)) {
- fprintf(stderr,
- "EC_POINT_set_affine_coordinates_GFp succeeded with invalid "
- "coordinates with nid %d\n",
- nid);
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- return true;
+ EXPECT_TRUE(EncodeECPrivateKey(&out, key.get()));
+ EXPECT_EQ(Bytes(kECKeyWithoutPublic), Bytes(out.data(), out.size()));
}
-static bool TestArbitraryCurve() {
+TEST(ECTest, ArbitraryCurve) {
// Make a P-256 key and extract the affine coordinates.
bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(NID_X9_62_prime256v1));
- if (!key || !EC_KEY_generate_key(key.get())) {
- return false;
- }
+ ASSERT_TRUE(key);
+ ASSERT_TRUE(EC_KEY_generate_key(key.get()));
// Make an arbitrary curve which is identical to P-256.
static const uint8_t kP[] = {
@@ -351,186 +230,161 @@ static bool TestArbitraryCurve() {
0x9e, 0x84, 0xf3, 0xb9, 0xca, 0xc2, 0xfc, 0x63, 0x25, 0x51,
};
bssl::UniquePtr<BN_CTX> ctx(BN_CTX_new());
+ ASSERT_TRUE(ctx);
bssl::UniquePtr<BIGNUM> p(BN_bin2bn(kP, sizeof(kP), nullptr));
+ ASSERT_TRUE(p);
bssl::UniquePtr<BIGNUM> a(BN_bin2bn(kA, sizeof(kA), nullptr));
+ ASSERT_TRUE(a);
bssl::UniquePtr<BIGNUM> b(BN_bin2bn(kB, sizeof(kB), nullptr));
+ ASSERT_TRUE(b);
bssl::UniquePtr<BIGNUM> gx(BN_bin2bn(kX, sizeof(kX), nullptr));
+ ASSERT_TRUE(gx);
bssl::UniquePtr<BIGNUM> gy(BN_bin2bn(kY, sizeof(kY), nullptr));
+ ASSERT_TRUE(gy);
bssl::UniquePtr<BIGNUM> order(BN_bin2bn(kOrder, sizeof(kOrder), nullptr));
- bssl::UniquePtr<BIGNUM> cofactor(BN_new());
- if (!ctx || !p || !a || !b || !gx || !gy || !order || !cofactor ||
- !BN_set_word(cofactor.get(), 1)) {
- return false;
- }
+ ASSERT_TRUE(order);
bssl::UniquePtr<EC_GROUP> group(
EC_GROUP_new_curve_GFp(p.get(), a.get(), b.get(), ctx.get()));
- if (!group) {
- return false;
- }
+ ASSERT_TRUE(group);
bssl::UniquePtr<EC_POINT> generator(EC_POINT_new(group.get()));
- if (!generator ||
- !EC_POINT_set_affine_coordinates_GFp(group.get(), generator.get(),
- gx.get(), gy.get(), ctx.get()) ||
- !EC_GROUP_set_generator(group.get(), generator.get(), order.get(),
- cofactor.get())) {
- return false;
- }
+ ASSERT_TRUE(generator);
+ ASSERT_TRUE(EC_POINT_set_affine_coordinates_GFp(
+ group.get(), generator.get(), gx.get(), gy.get(), ctx.get()));
+ ASSERT_TRUE(EC_GROUP_set_generator(group.get(), generator.get(), order.get(),
+ BN_value_one()));
// |group| should not have a curve name.
- if (EC_GROUP_get_curve_name(group.get()) != NID_undef) {
- return false;
- }
+ EXPECT_EQ(NID_undef, EC_GROUP_get_curve_name(group.get()));
// Copy |key| to |key2| using |group|.
bssl::UniquePtr<EC_KEY> key2(EC_KEY_new());
+ ASSERT_TRUE(key2);
bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group.get()));
+ ASSERT_TRUE(point);
bssl::UniquePtr<BIGNUM> x(BN_new()), y(BN_new());
- if (!key2 || !point || !x || !y ||
- !EC_KEY_set_group(key2.get(), group.get()) ||
- !EC_KEY_set_private_key(key2.get(), EC_KEY_get0_private_key(key.get())) ||
- !EC_POINT_get_affine_coordinates_GFp(EC_KEY_get0_group(key.get()),
- EC_KEY_get0_public_key(key.get()),
- x.get(), y.get(), nullptr) ||
- !EC_POINT_set_affine_coordinates_GFp(group.get(), point.get(), x.get(),
- y.get(), nullptr) ||
- !EC_KEY_set_public_key(key2.get(), point.get())) {
- fprintf(stderr, "Could not copy key.\n");
- return false;
- }
+ ASSERT_TRUE(x);
+ ASSERT_TRUE(EC_KEY_set_group(key2.get(), group.get()));
+ ASSERT_TRUE(
+ EC_KEY_set_private_key(key2.get(), EC_KEY_get0_private_key(key.get())));
+ ASSERT_TRUE(EC_POINT_get_affine_coordinates_GFp(
+ EC_KEY_get0_group(key.get()), EC_KEY_get0_public_key(key.get()), x.get(),
+ y.get(), nullptr));
+ ASSERT_TRUE(EC_POINT_set_affine_coordinates_GFp(group.get(), point.get(),
+ x.get(), y.get(), nullptr));
+ ASSERT_TRUE(EC_KEY_set_public_key(key2.get(), point.get()));
// The key must be valid according to the new group too.
- if (!EC_KEY_check_key(key2.get())) {
- fprintf(stderr, "Copied key is not valid.\n");
- return false;
- }
-
- return true;
+ EXPECT_TRUE(EC_KEY_check_key(key2.get()));
}
-static bool TestAddingEqualPoints(int nid) {
- bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(nid));
- if (!key) {
- return false;
- }
+class ECCurveTest : public testing::TestWithParam<EC_builtin_curve> {};
+
+TEST_P(ECCurveTest, SetAffine) {
+ // Generate an EC_KEY.
+ bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(GetParam().nid));
+ ASSERT_TRUE(key);
+ ASSERT_TRUE(EC_KEY_generate_key(key.get()));
const EC_GROUP *const group = EC_KEY_get0_group(key.get());
+ EXPECT_TRUE(
+ EC_POINT_is_on_curve(group, EC_KEY_get0_public_key(key.get()), nullptr));
- if (!EC_KEY_generate_key(key.get())) {
- fprintf(stderr, "EC_KEY_generate_key failed with nid %d\n", nid);
- ERR_print_errors_fp(stderr);
- return false;
- }
+ // Get the public key's coordinates.
+ bssl::UniquePtr<BIGNUM> x(BN_new());
+ ASSERT_TRUE(x);
+ bssl::UniquePtr<BIGNUM> y(BN_new());
+ ASSERT_TRUE(y);
+ EXPECT_TRUE(EC_POINT_get_affine_coordinates_GFp(
+ group, EC_KEY_get0_public_key(key.get()), x.get(), y.get(), nullptr));
+
+ // Points on the curve should be accepted.
+ auto point = bssl::UniquePtr<EC_POINT>(EC_POINT_new(group));
+ ASSERT_TRUE(point);
+ EXPECT_TRUE(EC_POINT_set_affine_coordinates_GFp(group, point.get(), x.get(),
+ y.get(), nullptr));
+
+ // Subtract one from |y| to make the point no longer on the curve.
+ EXPECT_TRUE(BN_sub(y.get(), y.get(), BN_value_one()));
+
+ // Points not on the curve should be rejected.
+ bssl::UniquePtr<EC_POINT> invalid_point(EC_POINT_new(group));
+ ASSERT_TRUE(invalid_point);
+ EXPECT_FALSE(EC_POINT_set_affine_coordinates_GFp(group, invalid_point.get(),
+ x.get(), y.get(), nullptr));
+}
+
+TEST_P(ECCurveTest, AddingEqualPoints) {
+ bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(GetParam().nid));
+ ASSERT_TRUE(key);
+ ASSERT_TRUE(EC_KEY_generate_key(key.get()));
+
+ const EC_GROUP *const group = EC_KEY_get0_group(key.get());
bssl::UniquePtr<EC_POINT> p1(EC_POINT_new(group));
- bssl::UniquePtr<EC_POINT> p2(EC_POINT_new(group));
- bssl::UniquePtr<EC_POINT> double_p1(EC_POINT_new(group));
- bssl::UniquePtr<EC_POINT> p1_plus_p2(EC_POINT_new(group));
- if (!p1 || !p2 || !double_p1 || !p1_plus_p2) {
- return false;
- }
+ ASSERT_TRUE(p1);
+ ASSERT_TRUE(EC_POINT_copy(p1.get(), EC_KEY_get0_public_key(key.get())));
- if (!EC_POINT_copy(p1.get(), EC_KEY_get0_public_key(key.get())) ||
- !EC_POINT_copy(p2.get(), EC_KEY_get0_public_key(key.get()))) {
- fprintf(stderr, "EC_POINT_COPY failed with nid %d\n", nid);
- ERR_print_errors_fp(stderr);
- return false;
- }
+ bssl::UniquePtr<EC_POINT> p2(EC_POINT_new(group));
+ ASSERT_TRUE(p2);
+ ASSERT_TRUE(EC_POINT_copy(p2.get(), EC_KEY_get0_public_key(key.get())));
+ bssl::UniquePtr<EC_POINT> double_p1(EC_POINT_new(group));
+ ASSERT_TRUE(double_p1);
bssl::UniquePtr<BN_CTX> ctx(BN_CTX_new());
- if (!ctx) {
- return false;
- }
-
- if (!EC_POINT_dbl(group, double_p1.get(), p1.get(), ctx.get()) ||
- !EC_POINT_add(group, p1_plus_p2.get(), p1.get(), p2.get(), ctx.get())) {
- fprintf(stderr, "Point operation failed with nid %d\n", nid);
- ERR_print_errors_fp(stderr);
- return false;
- }
+ ASSERT_TRUE(ctx);
+ ASSERT_TRUE(EC_POINT_dbl(group, double_p1.get(), p1.get(), ctx.get()));
- if (EC_POINT_cmp(group, double_p1.get(), p1_plus_p2.get(), ctx.get()) != 0) {
- fprintf(stderr, "A+A != 2A for nid %d", nid);
- return false;
- }
+ bssl::UniquePtr<EC_POINT> p1_plus_p2(EC_POINT_new(group));
+ ASSERT_TRUE(p1_plus_p2);
+ ASSERT_TRUE(
+ EC_POINT_add(group, p1_plus_p2.get(), p1.get(), p2.get(), ctx.get()));
- return true;
+ EXPECT_EQ(0,
+ EC_POINT_cmp(group, double_p1.get(), p1_plus_p2.get(), ctx.get()))
+ << "A+A != 2A";
}
-static bool TestMulZero(int nid) {
- bssl::UniquePtr<EC_GROUP> group(EC_GROUP_new_by_curve_name(nid));
- if (!group) {
- return false;
- }
+TEST_P(ECCurveTest, MulZero) {
+ bssl::UniquePtr<EC_GROUP> group(EC_GROUP_new_by_curve_name(GetParam().nid));
+ ASSERT_TRUE(group);
bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group.get()));
+ ASSERT_TRUE(point);
bssl::UniquePtr<BIGNUM> zero(BN_new());
- if (!point || !zero) {
- return false;
- }
-
+ ASSERT_TRUE(zero);
BN_zero(zero.get());
- if (!EC_POINT_mul(group.get(), point.get(), zero.get(), nullptr, nullptr,
- nullptr)) {
- return false;
- }
+ ASSERT_TRUE(EC_POINT_mul(group.get(), point.get(), zero.get(), nullptr,
+ nullptr, nullptr));
- if (!EC_POINT_is_at_infinity(group.get(), point.get())) {
- fprintf(stderr, "g * 0 did not return point at infinity.\n");
- return false;
- }
+ EXPECT_TRUE(EC_POINT_is_at_infinity(group.get(), point.get()))
+ << "g * 0 did not return point at infinity.";
// Test that zero times an arbitrary point is also infinity. The generator is
// used as the arbitrary point.
bssl::UniquePtr<EC_POINT> generator(EC_POINT_new(group.get()));
- bssl::UniquePtr<BIGNUM> one(BN_new());
- if (!generator ||
- !one ||
- !BN_one(one.get()) ||
- !EC_POINT_mul(group.get(), generator.get(), one.get(), nullptr, nullptr,
- nullptr) ||
- !EC_POINT_mul(group.get(), point.get(), nullptr, generator.get(),
- zero.get(), nullptr)) {
- return false;
- }
-
- if (!EC_POINT_is_at_infinity(group.get(), point.get())) {
- fprintf(stderr, "p * 0 did not return point at infinity.\n");
- return false;
- }
-
- return true;
+ ASSERT_TRUE(generator);
+ ASSERT_TRUE(EC_POINT_mul(group.get(), generator.get(), BN_value_one(),
+ nullptr, nullptr, nullptr));
+ ASSERT_TRUE(EC_POINT_mul(group.get(), point.get(), nullptr, generator.get(),
+ zero.get(), nullptr));
+
+ EXPECT_TRUE(EC_POINT_is_at_infinity(group.get(), point.get()))
+ << "p * 0 did not return point at infinity.";
}
-static bool ForEachCurve(bool (*test_func)(int nid)) {
+static std::vector<EC_builtin_curve> AllCurves() {
const size_t num_curves = EC_get_builtin_curves(nullptr, 0);
std::vector<EC_builtin_curve> curves(num_curves);
EC_get_builtin_curves(curves.data(), num_curves);
-
- for (const auto& curve : curves) {
- if (!test_func(curve.nid)) {
- fprintf(stderr, "Test failed for %s\n", curve.comment);
- return false;
- }
- }
-
- return true;
+ return curves;
}
-int main() {
- CRYPTO_library_init();
-
- if (!Testd2i_ECPrivateKey() ||
- !TestZeroPadding() ||
- !TestSpecifiedCurve() ||
- !ForEachCurve(TestSetAffine) ||
- !ForEachCurve(TestAddingEqualPoints) ||
- !ForEachCurve(TestMulZero) ||
- !TestArbitraryCurve()) {
- fprintf(stderr, "failed\n");
- return 1;
- }
-
- printf("PASS\n");
- return 0;
+static std::string CurveToString(
+ const testing::TestParamInfo<EC_builtin_curve> &params) {
+ // The comment field contains characters GTest rejects, so use the OBJ name.
+ return OBJ_nid2sn(params.param.nid);
}
+
+INSTANTIATE_TEST_CASE_P(, ECCurveTest, testing::ValuesIn(AllCurves()),
+ CurveToString);
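
The TestWithParam / INSTANTIATE_TEST_CASE_P pattern above fans each
test body out over every built-in curve, and the explicit name
generator exists because GTest rejects non-alphanumeric characters in
parameterized test names, which the human-readable comment strings
contain. The skeleton of the pattern, with hypothetical stand-in
values:

    #include <string>
    #include <vector>

    #include <gtest/gtest.h>

    struct Curve { int nid; const char *name; };  // stand-in parameter

    class CurveTest : public testing::TestWithParam<Curve> {};

    TEST_P(CurveTest, HasValidNid) { EXPECT_GT(GetParam().nid, 0); }

    static std::vector<Curve> AllParams() {
      return {{415, "P256"}, {715, "P384"}};  // illustrative NIDs only
    }

    static std::string ParamName(
        const testing::TestParamInfo<Curve> &info) {
      return info.param.name;  // must stay alphanumeric
    }

    INSTANTIATE_TEST_CASE_P(, CurveTest, testing::ValuesIn(AllParams()),
                            ParamName);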
diff --git a/src/crypto/ecdsa/ecdsa.c b/src/crypto/ecdsa/ecdsa.c
index 34320819..e1a0525f 100644
--- a/src/crypto/ecdsa/ecdsa.c
+++ b/src/crypto/ecdsa/ecdsa.c
@@ -66,9 +66,10 @@
int ECDSA_sign(int type, const uint8_t *digest, size_t digest_len, uint8_t *sig,
- unsigned int *sig_len, EC_KEY *eckey) {
+ unsigned int *sig_len, const EC_KEY *eckey) {
if (eckey->ecdsa_meth && eckey->ecdsa_meth->sign) {
- return eckey->ecdsa_meth->sign(digest, digest_len, sig, sig_len, eckey);
+ return eckey->ecdsa_meth->sign(digest, digest_len, sig, sig_len,
+ (EC_KEY*) eckey /* cast away const */);
}
return ECDSA_sign_ex(type, digest, digest_len, sig, sig_len, NULL, NULL,
@@ -76,7 +77,7 @@ int ECDSA_sign(int type, const uint8_t *digest, size_t digest_len, uint8_t *sig,
}
int ECDSA_verify(int type, const uint8_t *digest, size_t digest_len,
- const uint8_t *sig, size_t sig_len, EC_KEY *eckey) {
+ const uint8_t *sig, size_t sig_len, const EC_KEY *eckey) {
ECDSA_SIG *s;
int ret = 0;
uint8_t *der = NULL;
@@ -133,12 +134,12 @@ static int digest_to_bn(BIGNUM *out, const uint8_t *digest, size_t digest_len,
}
ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, size_t digest_len,
- EC_KEY *key) {
+ const EC_KEY *key) {
return ECDSA_do_sign_ex(digest, digest_len, NULL, NULL, key);
}
int ECDSA_do_verify(const uint8_t *digest, size_t digest_len,
- const ECDSA_SIG *sig, EC_KEY *eckey) {
+ const ECDSA_SIG *sig, const EC_KEY *eckey) {
int ret = 0;
BN_CTX *ctx;
BIGNUM *u1, *u2, *m, *X;
@@ -224,7 +225,7 @@ err:
return ret;
}
-static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
+static int ecdsa_sign_setup(const EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
BIGNUM **rp, const uint8_t *digest,
size_t digest_len) {
BN_CTX *ctx = NULL;
@@ -338,13 +339,14 @@ err:
return ret;
}
-int ECDSA_sign_setup(EC_KEY *eckey, BN_CTX *ctx, BIGNUM **kinv, BIGNUM **rp) {
+int ECDSA_sign_setup(const EC_KEY *eckey, BN_CTX *ctx, BIGNUM **kinv,
+ BIGNUM **rp) {
return ecdsa_sign_setup(eckey, ctx, kinv, rp, NULL, 0);
}
ECDSA_SIG *ECDSA_do_sign_ex(const uint8_t *digest, size_t digest_len,
const BIGNUM *in_kinv, const BIGNUM *in_r,
- EC_KEY *eckey) {
+ const EC_KEY *eckey) {
int ok = 0;
BIGNUM *kinv = NULL, *s, *m = NULL, *tmp = NULL;
const BIGNUM *ckinv;
@@ -441,7 +443,7 @@ err:
int ECDSA_sign_ex(int type, const uint8_t *digest, size_t digest_len,
uint8_t *sig, unsigned int *sig_len, const BIGNUM *kinv,
- const BIGNUM *r, EC_KEY *eckey) {
+ const BIGNUM *r, const EC_KEY *eckey) {
int ret = 0;
ECDSA_SIG *s = NULL;
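
The const-ified signatures are source-compatible for existing callers,
but the one mutable interface that cannot change, the ecdsa_meth->sign
hook, now has const cast away at exactly that boundary. The idiom in
miniature, with hypothetical names throughout:

    // Legacy callback type that predates the const-ification.
    struct Key;
    typedef int (*legacy_sign_fn)(Key *key);

    struct Key {
      legacy_sign_fn legacy_sign;
    };

    int Sign(const Key *key) {
      if (key->legacy_sign != nullptr) {
        // Cast away const at the legacy boundary only; the callback
        // is trusted not to mutate the key.
        return key->legacy_sign(const_cast<Key *>(key));
      }
      return 0;  // the const-correct default path would go here
    }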
diff --git a/src/crypto/evp/evp_ctx.c b/src/crypto/evp/evp_ctx.c
index 905aae91..a17a8ccc 100644
--- a/src/crypto/evp/evp_ctx.c
+++ b/src/crypto/evp/evp_ctx.c
@@ -148,48 +148,40 @@ void EVP_PKEY_CTX_free(EVP_PKEY_CTX *ctx) {
OPENSSL_free(ctx);
}
-EVP_PKEY_CTX *EVP_PKEY_CTX_dup(EVP_PKEY_CTX *pctx) {
- EVP_PKEY_CTX *rctx;
-
- if (!pctx->pmeth || !pctx->pmeth->copy) {
+EVP_PKEY_CTX *EVP_PKEY_CTX_dup(EVP_PKEY_CTX *ctx) {
+ if (!ctx->pmeth || !ctx->pmeth->copy) {
return NULL;
}
- rctx = OPENSSL_malloc(sizeof(EVP_PKEY_CTX));
- if (!rctx) {
+ EVP_PKEY_CTX *ret = OPENSSL_malloc(sizeof(EVP_PKEY_CTX));
+ if (!ret) {
return NULL;
}
- OPENSSL_memset(rctx, 0, sizeof(EVP_PKEY_CTX));
+ OPENSSL_memset(ret, 0, sizeof(EVP_PKEY_CTX));
- rctx->pmeth = pctx->pmeth;
- rctx->engine = pctx->engine;
- rctx->operation = pctx->operation;
+ ret->pmeth = ctx->pmeth;
+ ret->engine = ctx->engine;
+ ret->operation = ctx->operation;
- if (pctx->pkey) {
- EVP_PKEY_up_ref(pctx->pkey);
- rctx->pkey = pctx->pkey;
- if (rctx->pkey == NULL) {
- goto err;
- }
+ if (ctx->pkey != NULL) {
+ EVP_PKEY_up_ref(ctx->pkey);
+ ret->pkey = ctx->pkey;
}
- if (pctx->peerkey) {
- EVP_PKEY_up_ref(pctx->peerkey);
- rctx->peerkey = pctx->peerkey;
- if (rctx->peerkey == NULL) {
- goto err;
- }
+ if (ctx->peerkey != NULL) {
+ EVP_PKEY_up_ref(ctx->peerkey);
+ ret->peerkey = ctx->peerkey;
}
- if (pctx->pmeth->copy(rctx, pctx) > 0) {
- return rctx;
+ if (ctx->pmeth->copy(ret, ctx) <= 0) {
+ ret->pmeth = NULL;
+ EVP_PKEY_CTX_free(ret);
+ OPENSSL_PUT_ERROR(EVP, ERR_LIB_EVP);
+ return NULL;
}
-err:
- EVP_PKEY_CTX_free(rctx);
- OPENSSL_PUT_ERROR(EVP, ERR_LIB_EVP);
- return NULL;
+ return ret;
}
EVP_PKEY *EVP_PKEY_CTX_get0_pkey(EVP_PKEY_CTX *ctx) { return ctx->pkey; }
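
The restructured EVP_PKEY_CTX_dup drops the unreachable error paths
(EVP_PKEY_up_ref cannot fail here) and, more importantly, clears
ret->pmeth before freeing a context whose pmeth->copy failed, so the
free path cannot hand half-copied state to the method's cleanup hook.
The same defensive idiom in a generic sketch, hypothetical types
throughout:

    #include <stdlib.h>

    struct Ctx;
    // Hypothetical method table whose cleanup assumes a fully-built
    // context, mirroring the role of EVP_PKEY_METHOD here.
    struct Method {
      void (*cleanup)(Ctx *ctx);
    };

    struct Ctx {
      const Method *pmeth;
      void *data;
    };

    static void Ctx_free(Ctx *ctx) {
      if (ctx == nullptr) {
        return;
      }
      if (ctx->pmeth != nullptr && ctx->pmeth->cleanup != nullptr) {
        ctx->pmeth->cleanup(ctx);  // only safe on fully-built contexts
      }
      free(ctx->data);
      free(ctx);
    }

    static Ctx *dup_failure(Ctx *ret) {
      ret->pmeth = nullptr;  // detach first: cleanup must not run
      Ctx_free(ret);
      return nullptr;
    }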
diff --git a/src/crypto/md5/asm/md5-586.pl b/src/crypto/md5/asm/md5-586.pl
index a237b0cd..a032d9ba 100644
--- a/src/crypto/md5/asm/md5-586.pl
+++ b/src/crypto/md5/asm/md5-586.pl
@@ -50,7 +50,7 @@ sub R0
local($pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
&mov($tmp1,$C) if $pos < 0;
- &mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one
+ &mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one
# body proper
diff --git a/src/crypto/modes/asm/aesni-gcm-x86_64.pl b/src/crypto/modes/asm/aesni-gcm-x86_64.pl
index e329741c..139014fa 100644
--- a/src/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/src/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -22,10 +22,11 @@
# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp. who verified that it reduces shuffles
# pressure with notable relative improvement, achieving 1.0 cycle per
-# byte processed with 128-bit key on Haswell processor, and 0.74 -
-# on Broadwell. [Mentioned results are raw profiled measurements for
-# favourable packet size, one divisible by 96. Applications using the
-# EVP interface will observe a few percent worse performance.]
+# byte processed with 128-bit key on Haswell processor, 0.74 - on
+# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
+# measurements for favourable packet size, one divisible by 96.
+# Applications using the EVP interface will observe a few percent
+# worse performance.]
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
diff --git a/src/crypto/modes/asm/ghash-armv4.pl b/src/crypto/modes/asm/ghash-armv4.pl
index 299eedcb..1a03251e 100644
--- a/src/crypto/modes/asm/ghash-armv4.pl
+++ b/src/crypto/modes/asm/ghash-armv4.pl
@@ -47,7 +47,7 @@
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
-#
+#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
# ====================================================================
@@ -486,7 +486,7 @@ $code.=<<___;
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
- sub $Xi,#16
+ sub $Xi,#16
vst1.64 $Xl#hi,[$Xi]! @ write out Xi
vst1.64 $Xl#lo,[$Xi]
diff --git a/src/crypto/modes/asm/ghash-x86.pl b/src/crypto/modes/asm/ghash-x86.pl
index 182c29a3..d3a79e14 100644
--- a/src/crypto/modes/asm/ghash-x86.pl
+++ b/src/crypto/modes/asm/ghash-x86.pl
@@ -88,7 +88,7 @@
# where Tproc is time required for Karatsuba pre- and post-processing,
# is more realistic estimate. In this case it gives ... 1.91 cycles.
# Or in other words, depending on how well we can interleave reduction
-# and one of the two multiplications the performance should be betwen
+# and one of the two multiplications the performance should be between
# 1.91 and 2.16. As already mentioned, this implementation processes
# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
# - in 2.02. x86_64 performance is better, because larger register
@@ -487,7 +487,7 @@ sub mmx_loop() {
&pxor ($red[1],$red[1]);
&pxor ($red[2],$red[2]);
- # Just like in "May" verson modulo-schedule for critical path in
+ # Just like in "May" version modulo-schedule for critical path in
# 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
# is scheduled so late that rem_8bit[] has to be shifted *right*
# by 16, which is why last argument to pinsrw is 2, which
@@ -576,7 +576,7 @@ sub mmx_loop() {
&bswap ($dat);
&pshufw ($Zhi,$Zhi,0b00011011); # 76543210
&bswap ("ebx");
-
+
&cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
&jne (&label("outer"));
}
@@ -680,7 +680,7 @@ my ($Xhi,$Xi) = @_;
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
- &psrldq ($T1,8); #
+ &psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
@@ -850,7 +850,7 @@ my ($Xhi,$Xi) = @_;
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
- &psrldq ($T1,8); #
+ &psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
&pshufd ($T1,$Xhn,0b01001110);
@@ -913,7 +913,7 @@ my ($Xhi,$Xi) = @_;
&movdqu (&QWP(0,$Xip),$Xi);
&function_end("gcm_ghash_clmul");
-} else { # Algorith 5. Kept for reference purposes.
+} else { # Algorithm 5. Kept for reference purposes.
sub reduction_alg5 { # 19/16 times faster than Intel version
my ($Xhi,$Xi)=@_;
diff --git a/src/crypto/modes/asm/ghash-x86_64.pl b/src/crypto/modes/asm/ghash-x86_64.pl
index d7471e27..0e6e3489 100644
--- a/src/crypto/modes/asm/ghash-x86_64.pl
+++ b/src/crypto/modes/asm/ghash-x86_64.pl
@@ -64,8 +64,10 @@
# Ivy Bridge 1.80(+7%)
# Haswell 0.55(+93%) (if system doesn't support AVX)
# Broadwell 0.45(+110%)(if system doesn't support AVX)
+# Skylake 0.44(+110%)(if system doesn't support AVX)
# Bulldozer 1.49(+27%)
# Silvermont 2.88(+13%)
+# Goldmont 1.08(+24%)
# March 2013
#
@@ -74,8 +76,8 @@
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
# sub-optimally in comparison to above mentioned version. But thanks
# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
-# it performs in 0.41 cycles per byte on Haswell processor, and in
-# 0.29 on Broadwell.
+# it performs in 0.41 cycles per byte on Haswell processor, in
+# 0.29 on Broadwell, and in 0.36 on Skylake.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
@@ -217,8 +219,12 @@ $code=<<___;
.align 16
gcm_gmult_4bit:
push %rbx
- push %rbp # %rbp and %r12 are pushed exclusively in
+ push %rbp # %rbp and others are pushed exclusively in
push %r12 # order to reuse Win64 exception handler...
+ push %r13
+ push %r14
+ push %r15
+ sub \$280,%rsp
.Lgmult_prologue:
movzb 15($Xi),$Zlo
@@ -229,8 +235,9 @@ $code.=<<___;
mov $Zlo,8($Xi)
mov $Zhi,($Xi)
- mov 16(%rsp),%rbx
- lea 24(%rsp),%rsp
+ lea 280+48(%rsp),%rsi
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lgmult_epilogue:
ret
.size gcm_gmult_4bit,.-gcm_gmult_4bit
@@ -380,14 +387,14 @@ $code.=<<___;
mov $Zlo,8($Xi)
mov $Zhi,($Xi)
- lea 280(%rsp),%rsi
- mov 0(%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ lea 280+48(%rsp),%rsi
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea 0(%rsi),%rsp
.Lghash_epilogue:
ret
.size gcm_ghash_4bit,.-gcm_ghash_4bit
@@ -449,7 +456,7 @@ $code.=<<___;
psllq \$57,$Xi #
movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
pxor $T1,$Xhi #
@@ -563,7 +570,7 @@ ___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
# experimental alternative. special thing about is that there
- # no dependency between the two multiplications...
+ # no dependency between the two multiplications...
mov \$`0xE1<<1`,%eax
mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
mov \$0x07,%r11d
@@ -738,7 +745,7 @@ $code.=<<___;
movdqa $T2,$T1 #
pslldq \$8,$T2
pclmulqdq \$0x00,$Hkey2,$Xln
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
pxor $T1,$Xhi #
movdqu 0($inp),$T1
@@ -874,7 +881,7 @@ $code.=<<___;
psllq \$57,$Xi #
movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
pshufd \$0b01001110,$Xhn,$Xmn
pxor $T1,$Xhi #
@@ -1628,14 +1635,20 @@ se_handler:
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
- lea 24(%rax),%rax # adjust "rsp"
+ lea 48+280(%rax),%rax # adjust "rsp"
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
.Lin_prologue:
mov 8(%rax),%rdi
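
Every offset change in this file follows from a single frame-layout
change: gcm_gmult_4bit now uses the same prologue as gcm_ghash_4bit,
pushing six registers (6 x 8 = 48 bytes) and reserving 280 bytes of
scratch, so that both can share one Win64 exception handler. The
pre-push stack pointer is therefore recovered as %rsp + 280 + 48, the
saved registers sit at -48(%rsi) through -8(%rsi) below it, and the
handler's lea 48+280(%rax),%rax applies the identical arithmetic to
the unwound stack pointer, which is why it must now restore %r13-%r15
as well.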
diff --git a/src/crypto/perlasm/ppc-xlate.pl b/src/crypto/perlasm/ppc-xlate.pl
index 55b02bca..de796d73 100644
--- a/src/crypto/perlasm/ppc-xlate.pl
+++ b/src/crypto/perlasm/ppc-xlate.pl
@@ -36,7 +36,7 @@ my $globl = sub {
my $ret;
$name =~ s|^\.||;
-
+
SWITCH: for ($flavour) {
/aix/ && do { if (!$$type) {
$$type = "\@function";
diff --git a/src/crypto/perlasm/readme b/src/crypto/perlasm/readme
index 648537b9..57d2083c 100644
--- a/src/crypto/perlasm/readme
+++ b/src/crypto/perlasm/readme
@@ -7,7 +7,7 @@ and then include it.
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";
-The first thing we do is setup the file and type of assember
+The first thing we do is setup the file and type of assembler
&asm_init($ARGV[0],$0);
@@ -18,7 +18,7 @@ Argument 2 is the file name.
The reciprocal function is
&asm_finish() which should be called at the end.
-There are 2 main 'packages'. x86ms.pl, which is the microsoft assembler,
+There are 2 main 'packages'. x86ms.pl, which is the Microsoft assembler,
and x86unix.pl which is the unix (gas) version.
Functions of interest are:
@@ -32,7 +32,7 @@ Functions of interest are:
&function_begin(name,extra) Start a function with pushing of
edi, esi, ebx and ebp. extra is extra win32
external info that may be required.
-&function_begin_B(name,extra) Same as norma function_begin but no pushing.
+&function_begin_B(name,extra) Same as normal function_begin but no pushing.
&function_end(name) Call at end of function.
&function_end_A(name) Standard pop and ret, for use inside functions
&function_end_B(name) Call at end but with popping or 'ret'.
diff --git a/src/crypto/perlasm/x86_64-xlate.pl b/src/crypto/perlasm/x86_64-xlate.pl
index 16553f2a..6e487b8e 100755
--- a/src/crypto/perlasm/x86_64-xlate.pl
+++ b/src/crypto/perlasm/x86_64-xlate.pl
@@ -141,7 +141,7 @@ my %globals;
if ($gas) {
if ($self->{op} eq "movz") { # movz is pain...
sprintf "%s%s%s",$self->{op},$self->{sz},shift;
- } elsif ($self->{op} =~ /^set/) {
+ } elsif ($self->{op} =~ /^set/) {
"$self->{op}";
} elsif ($self->{op} eq "ret") {
my $epilogue = "";
@@ -168,7 +168,7 @@ my %globals;
$self->{op} .= $self->{sz};
} elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") {
$self->{op} = "\tDQ";
- }
+ }
$self->{op};
}
}
@@ -274,7 +274,7 @@ my %globals;
}
# if base register is %rbp or %r13, see if it's possible to
- # flip base and ingex registers [for better performance]
+ # flip base and index registers [for better performance]
if (!$self->{label} && $self->{index} && $self->{scale}==1 &&
$self->{base} =~ /(rbp|r13)/) {
$self->{base} = $self->{index}; $self->{index} = $1;
@@ -432,7 +432,7 @@ my %globals;
}
}
}
-{ package expr; # pick up expressioins
+{ package expr; # pick up expressions
sub re {
my ($class, $line, $opcode) = @_;
my $self = {};
@@ -460,6 +460,242 @@ my %globals;
}
}
}
+{ package cfi_directive;
+ # CFI directives annotate instructions that are significant for
+ # the stack-unwinding procedure described by the DWARF specification,
+ # see http://dwarfstd.org/. Besides the platform-specific filtering
+ # naturally expected of this script, this module adds three auxiliary
+ # synthetic directives not recognized by the [GNU] assembler:
+ #
+ # - .cfi_push to annotate push instructions in prologue, which
+ # translates to .cfi_adjust_cfa_offset (if needed) and
+ # .cfi_offset;
+ # - .cfi_pop to annotate pop instructions in epilogue, which
+ # translates to .cfi_adjust_cfa_offset (if needed) and
+ # .cfi_restore;
+ # - [and most notably] .cfi_cfa_expression which encodes
+ # DW_CFA_def_cfa_expression and passes it to .cfi_escape as
+ # byte vector;
+ #
+ # CFA expressions were introduced in DWARF specification version
+ # 3 and describe how to deduce CFA, Canonical Frame Address. This
+ # becomes handy if your stack frame is variable and you can't
+ # spare register for [previous] frame pointer. Suggested directive
+ # syntax is made-up mix of DWARF operator suffixes [subset of]
+ # and references to registers with optional bias. Following example
+ # describes offloaded *original* stack pointer at specific offset
+ # from *current* stack pointer:
+ #
+ # .cfi_cfa_expression %rsp+40,deref,+8
+ #
+ # The final +8 has everything to do with the fact that CFA is defined
+ # as a reference to the top of the caller's stack, and that on x86_64
+ # a call to a subroutine pushes an 8-byte return address. In other
+ # words, the original stack pointer upon entry to a subroutine is 8
+ # bytes off from CFA.
+
+ # Below constants are taken from "DWARF Expressions" section of the
+ # DWARF specification, section is numbered 7.7 in versions 3 and 4.
+ my %DW_OP_simple = ( # no-arg operators, mapped directly
+ deref => 0x06, dup => 0x12,
+ drop => 0x13, over => 0x14,
+ pick => 0x15, swap => 0x16,
+ rot => 0x17, xderef => 0x18,
+
+ abs => 0x19, and => 0x1a,
+ div => 0x1b, minus => 0x1c,
+ mod => 0x1d, mul => 0x1e,
+ neg => 0x1f, not => 0x20,
+ or => 0x21, plus => 0x22,
+ shl => 0x24, shr => 0x25,
+ shra => 0x26, xor => 0x27,
+ );
+
+ my %DW_OP_complex = ( # used in specific subroutines
+ constu => 0x10, # uleb128
+ consts => 0x11, # sleb128
+ plus_uconst => 0x23, # uleb128
+ lit0 => 0x30, # add 0-31 to opcode
+ reg0 => 0x50, # add 0-31 to opcode
+ breg0 => 0x70, # add 0-31 to opcode, sleb128
+ regx => 0x90, # uleb128
+ fbreg => 0x91, # sleb128
+ bregx => 0x92, # uleb128, sleb128
+ piece => 0x93, # uleb128
+ );
+
+ # Following constants are defined in x86_64 ABI supplement, for
+ # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf,
+ # see section 3.7 "Stack Unwind Algorithm".
+ my %DW_reg_idx = (
+ "%rax"=>0, "%rdx"=>1, "%rcx"=>2, "%rbx"=>3,
+ "%rsi"=>4, "%rdi"=>5, "%rbp"=>6, "%rsp"=>7,
+ "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11,
+ "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15
+ );
+
+ my ($cfa_reg, $cfa_rsp);
+
+ # [us]leb128 format is a variable-length integer representation with
+ # base-128 digits, the most significant bit of each byte being 0 to
+ # denote the *last* most significant digit. See "Variable Length Data"
+ # in the DWARF specification, numbered 7.6 at least in versions 3 and 4.
+ sub sleb128 {
+ use integer; # get right shift extend sign
+
+ my $val = shift;
+ my $sign = ($val < 0) ? -1 : 0;
+ my @ret = ();
+
+ while(1) {
+ push @ret, $val&0x7f;
+
+ # see if remaining bits are same and equal to most
+ # significant bit of the current digit, if so, it's
+ # last digit...
+ last if (($val>>6) == $sign);
+
+ @ret[-1] |= 0x80;
+ $val >>= 7;
+ }
+
+ return @ret;
+ }
+ sub uleb128 {
+ my $val = shift;
+ my @ret = ();
+
+ while(1) {
+ push @ret, $val&0x7f;
+
+ # see if it's last significant digit...
+ last if (($val >>= 7) == 0);
+
+ @ret[-1] |= 0x80;
+ }
+
+ return @ret;
+ }
+ sub const {
+ my $val = shift;
+
+ if ($val >= 0 && $val < 32) {
+ return ($DW_OP_complex{lit0}+$val);
+ }
+ return ($DW_OP_complex{consts}, sleb128($val));
+ }
+ sub reg {
+ my $val = shift;
+
+ return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/);
+
+ my $reg = $DW_reg_idx{$1};
+ my $off = eval ("0 $2 $3");
+
+ return (($DW_OP_complex{breg0} + $reg), sleb128($off));
+ # Yes, we use DW_OP_bregX+0 to push the register value and not
+ # DW_OP_regX, because the latter would additionally require
+ # DW_OP_piece, which would be a waste under the circumstances. If
+ # you have to use DW_OP_regX, use "regx:N"...
+ }
+ sub cfa_expression {
+ my $line = shift;
+ my @ret;
+
+ foreach my $token (split(/,\s*/,$line)) {
+ if ($token =~ /^%r/) {
+ push @ret,reg($token);
+ } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) {
+ push @ret,reg("$2+$1");
+ } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) {
+ my $i = 1*eval($2);
+ push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i));
+ } elsif (my $i = 1*eval($token) or $token eq "0") {
+ if ($token =~ /^\+/) {
+ push @ret,$DW_OP_complex{plus_uconst},uleb128($i);
+ } else {
+ push @ret,const($i);
+ }
+ } else {
+ push @ret,$DW_OP_simple{$token};
+ }
+ }
+
+ # Finally we return DW_CFA_def_cfa_expression, 15, followed by
+ # length of the expression and of course the expression itself.
+ return (15,scalar(@ret),@ret);
+ }
+ sub re {
+ my ($class, $line) = @_;
+ my $self = {};
+ my $ret;
+
+ if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) {
+ bless $self,$class;
+ $ret = $self;
+ undef $self->{value};
+ my $dir = $1;
+
+ SWITCH: for ($dir) {
+ # What is $cfa_rsp? Effectively it's difference between %rsp
+ # value and current CFA, Canonical Frame Address, which is
+ # why it starts with -8. Recall that CFA is top of caller's
+ # stack...
+ /startproc/ && do { ($cfa_reg, $cfa_rsp) = ("%rsp", -8); last; };
+ /endproc/ && do { ($cfa_reg, $cfa_rsp) = ("%rsp", 0); last; };
+ /def_cfa_register/
+ && do { $cfa_reg = $$line; last; };
+ /def_cfa_offset/
+ && do { $cfa_rsp = -1*eval($$line) if ($cfa_reg eq "%rsp");
+ last;
+ };
+ /adjust_cfa_offset/
+ && do { $cfa_rsp -= 1*eval($$line) if ($cfa_reg eq "%rsp");
+ last;
+ };
+ /def_cfa/ && do { if ($$line =~ /(%r\w+)\s*,\s*(.+)/) {
+ $cfa_reg = $1;
+ $cfa_rsp = -1*eval($2) if ($cfa_reg eq "%rsp");
+ }
+ last;
+ };
+ /push/ && do { $dir = undef;
+ $cfa_rsp -= 8;
+ if ($cfa_reg eq "%rsp") {
+ $self->{value} = ".cfi_adjust_cfa_offset\t8\n";
+ }
+ $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp";
+ last;
+ };
+ /pop/ && do { $dir = undef;
+ $cfa_rsp += 8;
+ if ($cfa_reg eq "%rsp") {
+ $self->{value} = ".cfi_adjust_cfa_offset\t-8\n";
+ }
+ $self->{value} .= ".cfi_restore\t$$line";
+ last;
+ };
+ /cfa_expression/
+ && do { $dir = undef;
+ $self->{value} = ".cfi_escape\t" .
+ join(",", map(sprintf("0x%02x", $_),
+ cfa_expression($$line)));
+ last;
+ };
+ }
+
+ $self->{value} = ".cfi_$dir\t$$line" if ($dir);
+
+ $$line = "";
+ }
+
+ return $ret;
+ }
+ sub out {
+ my $self = shift;
+ return ($elf ? $self->{value} : undef);
+ }
+}
{ package directive; # pick up directives, which start with .
sub re {
my ($class, $line) = @_;
@@ -467,6 +703,9 @@ my %globals;
my $ret;
my $dir;
+ # chain-call to cfi_directive
+ $ret = cfi_directive->re($line) and return $ret;
+
if ($$line =~ /^\s*(\.\w+)/) {
bless $self,$class;
$dir = $1;
@@ -644,7 +883,7 @@ my %globals;
if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva"))
{ $var=~s/([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; }
$var;
- };
+ };
$sz =~ tr/bvlrq/BWDDQ/;
$self->{value} = "\tD$sz\t";
@@ -654,7 +893,7 @@ my %globals;
};
/\.byte/ && do { my @str=split(/,\s*/,$$line);
map(s/(0b[0-1]+)/oct($1)/eig,@str);
- map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm);
+ map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm);
while ($#str>15) {
$self->{value}.="DB\t"
.join(",",@str[0..15])."\n";
@@ -810,7 +1049,7 @@ my $rdrand = sub {
my @opcode=();
my $dst=$1;
if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
- rex(\@opcode,0,$1,8);
+ rex(\@opcode,0,$dst,8);
push @opcode,0x0f,0xc7,0xf0|($dst&7);
@opcode;
} else {
@@ -823,7 +1062,7 @@ my $rdseed = sub {
my @opcode=();
my $dst=$1;
if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
- rex(\@opcode,0,$1,8);
+ rex(\@opcode,0,$dst,8);
push @opcode,0x0f,0xc7,0xf8|($dst&7);
@opcode;
} else {
@@ -912,7 +1151,7 @@ while(defined(my $line=<>)) {
printf "%s",$directive->out();
} elsif (my $opcode=opcode->re(\$line)) {
my $asm = eval("\$".$opcode->mnemonic());
-
+
if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) {
print $gas?".byte\t":"DB\t",join(',',@bytes),"\n";
next;
@@ -998,7 +1237,7 @@ close STDOUT;
# %r13 - -
# %r14 - -
# %r15 - -
-#
+#
# (*) volatile register
# (-) preserved by callee
# (#) Nth argument, volatile
@@ -1021,7 +1260,7 @@ close STDOUT;
# the area above user stack pointer in true asynchronous manner...
#
# All the above means that if assembler programmer adheres to Unix
-# register and stack layout, but disregards the "red zone" existense,
+# register and stack layout, but disregards the "red zone" existence,
# it's possible to use following prologue and epilogue to "gear" from
# Unix to Win64 ABI in leaf functions with not more than 6 arguments.
#
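
A worked instance of the synthetic directive ties the pieces of the new
cfi_directive module together. The annotation suggested in its header
comment,

    .cfi_cfa_expression %rsp+40,deref,+8

encodes %rsp+40 as DW_OP_breg7 plus sleb128(40) (0x77,0x28), deref as
0x06, and +8 as DW_OP_plus_uconst with uleb128(8) (0x23,0x08); prefixed
by DW_CFA_def_cfa_expression (15, i.e. 0x0f) and the expression length
(5), it is emitted as

    .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08

telling the unwinder to compute CFA = *(%rsp + 40) + 8 at run time.
(Separately from the CFI work, the rdrand/rdseed hunks fix a real bug:
rex() was handed the raw regex capture $1 instead of the remapped $dst,
so the REX prefix could be computed from a register name rather than
its number.)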
diff --git a/src/crypto/perlasm/x86nasm.pl b/src/crypto/perlasm/x86nasm.pl
index d159514e..d3773b68 100644
--- a/src/crypto/perlasm/x86nasm.pl
+++ b/src/crypto/perlasm/x86nasm.pl
@@ -140,7 +140,7 @@ ___
grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out;
push (@out,$comm)
}
- push (@out,$initseg) if ($initseg);
+ push (@out,$initseg) if ($initseg);
}
sub ::comment { foreach (@_) { push(@out,"\t; $_\n"); } }
diff --git a/src/crypto/pkcs8/pkcs8.c b/src/crypto/pkcs8/pkcs8.c
index efad81d4..64a2d021 100644
--- a/src/crypto/pkcs8/pkcs8.c
+++ b/src/crypto/pkcs8/pkcs8.c
@@ -426,26 +426,9 @@ err:
return ret;
}
-PKCS8_PRIV_KEY_INFO *PKCS8_decrypt(X509_SIG *pkcs8, const char *pass,
- int pass_len) {
- uint8_t *pass_raw = NULL;
- size_t pass_raw_len = 0;
- if (!pass_to_pass_raw(OBJ_obj2nid(pkcs8->algor->algorithm), pass, pass_len,
- &pass_raw, &pass_raw_len)) {
- return NULL;
- }
-
- PKCS8_PRIV_KEY_INFO *ret = PKCS8_decrypt_pbe(pkcs8, pass_raw, pass_raw_len);
-
- if (pass_raw) {
- OPENSSL_cleanse(pass_raw, pass_raw_len);
- OPENSSL_free(pass_raw);
- }
- return ret;
-}
-
-PKCS8_PRIV_KEY_INFO *PKCS8_decrypt_pbe(X509_SIG *pkcs8, const uint8_t *pass_raw,
- size_t pass_raw_len) {
+static PKCS8_PRIV_KEY_INFO *pkcs8_decrypt_raw(X509_SIG *pkcs8,
+ const uint8_t *pass_raw,
+ size_t pass_raw_len) {
PKCS8_PRIV_KEY_INFO *ret = NULL;
uint8_t *in = NULL, *out = NULL;
size_t out_len = 0;
@@ -495,17 +478,16 @@ err:
return ret;
}
-X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher, const char *pass,
- int pass_len, const uint8_t *salt, size_t salt_len,
- int iterations, PKCS8_PRIV_KEY_INFO *p8inf) {
+PKCS8_PRIV_KEY_INFO *PKCS8_decrypt(X509_SIG *pkcs8, const char *pass,
+ int pass_len) {
uint8_t *pass_raw = NULL;
size_t pass_raw_len = 0;
- if (!pass_to_pass_raw(pbe_nid, pass, pass_len, &pass_raw, &pass_raw_len)) {
+ if (!pass_to_pass_raw(OBJ_obj2nid(pkcs8->algor->algorithm), pass, pass_len,
+ &pass_raw, &pass_raw_len)) {
return NULL;
}
- X509_SIG *ret = PKCS8_encrypt_pbe(pbe_nid, cipher, pass_raw, pass_raw_len,
- salt, salt_len, iterations, p8inf);
+ PKCS8_PRIV_KEY_INFO *ret = pkcs8_decrypt_raw(pkcs8, pass_raw, pass_raw_len);
if (pass_raw) {
OPENSSL_cleanse(pass_raw, pass_raw_len);
@@ -514,10 +496,10 @@ X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher, const char *pass,
return ret;
}
-X509_SIG *PKCS8_encrypt_pbe(int pbe_nid, const EVP_CIPHER *cipher,
- const uint8_t *pass_raw, size_t pass_raw_len,
- const uint8_t *salt, size_t salt_len,
- int iterations, PKCS8_PRIV_KEY_INFO *p8inf) {
+static X509_SIG *pkcs8_encrypt_raw(int pbe_nid, const EVP_CIPHER *cipher,
+ const uint8_t *pass_raw, size_t pass_raw_len,
+ const uint8_t *salt, size_t salt_len,
+ int iterations, PKCS8_PRIV_KEY_INFO *p8inf) {
X509_SIG *ret = NULL;
uint8_t *plaintext = NULL, *salt_buf = NULL, *der = NULL;
int plaintext_len = -1;
@@ -609,6 +591,25 @@ err:
return ret;
}
+X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher, const char *pass,
+ int pass_len, const uint8_t *salt, size_t salt_len,
+ int iterations, PKCS8_PRIV_KEY_INFO *p8inf) {
+ uint8_t *pass_raw = NULL;
+ size_t pass_raw_len = 0;
+ if (!pass_to_pass_raw(pbe_nid, pass, pass_len, &pass_raw, &pass_raw_len)) {
+ return NULL;
+ }
+
+ X509_SIG *ret = pkcs8_encrypt_raw(pbe_nid, cipher, pass_raw, pass_raw_len,
+ salt, salt_len, iterations, p8inf);
+
+ if (pass_raw) {
+ OPENSSL_cleanse(pass_raw, pass_raw_len);
+ OPENSSL_free(pass_raw);
+ }
+ return ret;
+}
+
EVP_PKEY *EVP_PKCS82PKEY(PKCS8_PRIV_KEY_INFO *p8) {
uint8_t *der = NULL;
int der_len = i2d_PKCS8_PRIV_KEY_INFO(p8, &der);
@@ -758,7 +759,7 @@ static int PKCS12_handle_safe_bag(CBS *safe_bag, struct pkcs12_context *ctx) {
}
PKCS8_PRIV_KEY_INFO *pki =
- PKCS8_decrypt_pbe(encrypted, ctx->password, ctx->password_len);
+ pkcs8_decrypt_raw(encrypted, ctx->password, ctx->password_len);
X509_SIG_free(encrypted);
if (pki == NULL) {
return 0;
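
The reshuffle here removes PKCS8_decrypt_pbe and PKCS8_encrypt_pbe from
the public API: the raw-passphrase variants become static helpers, and
only the char-passphrase entry points stay exported (the PKCS#12
parser, which already holds the raw passphrase bytes, now calls
pkcs8_decrypt_raw directly). External code keeps using the documented
pair, roughly as in this minimal sketch (error handling elided;
PKCS8_decrypt as declared in <openssl/pkcs8.h>):

    #include <string.h>

    #include <openssl/pkcs8.h>
    #include <openssl/x509.h>

    // Decrypts an encrypted PKCS#8 structure using a NUL-terminated
    // passphrase; returns NULL on failure.
    static PKCS8_PRIV_KEY_INFO *decrypt_blob(X509_SIG *encrypted,
                                             const char *pass) {
      return PKCS8_decrypt(encrypted, pass, (int)strlen(pass));
    }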
diff --git a/src/crypto/rsa/CMakeLists.txt b/src/crypto/rsa/CMakeLists.txt
index 969b753e..76937c1e 100644
--- a/src/crypto/rsa/CMakeLists.txt
+++ b/src/crypto/rsa/CMakeLists.txt
@@ -11,14 +11,3 @@ add_library(
padding.c
rsa_asn1.c
)
-
-add_executable(
- rsa_test
-
- rsa_test.cc
-
- $<TARGET_OBJECTS:test_support>
-)
-
-target_link_libraries(rsa_test crypto)
-add_dependencies(all_tests rsa_test) \ No newline at end of file
diff --git a/src/crypto/rsa/rsa_test.cc b/src/crypto/rsa/rsa_test.cc
index 306df7e3..401efdf4 100644
--- a/src/crypto/rsa/rsa_test.cc
+++ b/src/crypto/rsa/rsa_test.cc
@@ -59,6 +59,8 @@
#include <stdlib.h>
#include <string.h>
+#include <gtest/gtest.h>
+
#include <openssl/bn.h>
#include <openssl/bytestring.h>
#include <openssl/crypto.h>
@@ -66,6 +68,7 @@
#include <openssl/nid.h>
#include "../internal.h"
+#include "../test/test_util.h"
// kPlaintext is a sample plaintext.
@@ -523,191 +526,172 @@ static const uint8_t kExponent1RSAKey[] = {
0xdd, 0x02, 0x01, 0x01,
};
-static bool TestRSA(const uint8_t *der, size_t der_len,
- const uint8_t *oaep_ciphertext,
- size_t oaep_ciphertext_len) {
- bssl::UniquePtr<RSA> key(RSA_private_key_from_bytes(der, der_len));
- if (!key) {
- return false;
- }
+struct RSAEncryptParam {
+ const uint8_t *der;
+ size_t der_len;
+ const uint8_t *oaep_ciphertext;
+ size_t oaep_ciphertext_len;
+} kRSAEncryptParams[] = {
+ {kKey1, sizeof(kKey1) - 1, kOAEPCiphertext1, sizeof(kOAEPCiphertext1) - 1},
+ {kKey2, sizeof(kKey2) - 1, kOAEPCiphertext2, sizeof(kOAEPCiphertext2) - 1},
+ {kKey3, sizeof(kKey3) - 1, kOAEPCiphertext3, sizeof(kOAEPCiphertext3) - 1},
+};
- if (!RSA_check_key(key.get())) {
- fprintf(stderr, "RSA_check_key failed\n");
- return false;
- }
+class RSAEncryptTest : public testing::TestWithParam<RSAEncryptParam> {};
+
+TEST_P(RSAEncryptTest, TestKey) {
+ const auto &param = GetParam();
+ bssl::UniquePtr<RSA> key(
+ RSA_private_key_from_bytes(param.der, param.der_len));
+ ASSERT_TRUE(key);
+
+ EXPECT_TRUE(RSA_check_key(key.get()));
uint8_t ciphertext[256];
+ // Test that PKCS#1 v1.5 encryption round-trips.
size_t ciphertext_len = 0;
- if (!RSA_encrypt(key.get(), &ciphertext_len, ciphertext, sizeof(ciphertext),
- kPlaintext, kPlaintextLen, RSA_PKCS1_PADDING) ||
- ciphertext_len != RSA_size(key.get())) {
- fprintf(stderr, "PKCS#1 v1.5 encryption failed!\n");
- return false;
- }
+ ASSERT_TRUE(RSA_encrypt(key.get(), &ciphertext_len, ciphertext,
+ sizeof(ciphertext), kPlaintext, kPlaintextLen,
+ RSA_PKCS1_PADDING));
+ EXPECT_EQ(RSA_size(key.get()), ciphertext_len);
uint8_t plaintext[256];
size_t plaintext_len = 0;
- if (!RSA_decrypt(key.get(), &plaintext_len, plaintext, sizeof(plaintext),
- ciphertext, ciphertext_len, RSA_PKCS1_PADDING) ||
- plaintext_len != kPlaintextLen ||
- OPENSSL_memcmp(plaintext, kPlaintext, plaintext_len) != 0) {
- fprintf(stderr, "PKCS#1 v1.5 decryption failed!\n");
- return false;
- }
+ ASSERT_TRUE(RSA_decrypt(key.get(), &plaintext_len, plaintext,
+ sizeof(plaintext), ciphertext, ciphertext_len,
+ RSA_PKCS1_PADDING));
+ EXPECT_EQ(Bytes(kPlaintext, kPlaintextLen), Bytes(plaintext, plaintext_len));
+ // Test that OAEP encryption round-trips.
ciphertext_len = 0;
- if (!RSA_encrypt(key.get(), &ciphertext_len, ciphertext, sizeof(ciphertext),
- kPlaintext, kPlaintextLen, RSA_PKCS1_OAEP_PADDING) ||
- ciphertext_len != RSA_size(key.get())) {
- fprintf(stderr, "OAEP encryption failed!\n");
- return false;
- }
+ ASSERT_TRUE(RSA_encrypt(key.get(), &ciphertext_len, ciphertext,
+ sizeof(ciphertext), kPlaintext, kPlaintextLen,
+ RSA_PKCS1_OAEP_PADDING));
+ EXPECT_EQ(RSA_size(key.get()), ciphertext_len);
plaintext_len = 0;
- if (!RSA_decrypt(key.get(), &plaintext_len, plaintext, sizeof(plaintext),
- ciphertext, ciphertext_len, RSA_PKCS1_OAEP_PADDING) ||
- plaintext_len != kPlaintextLen ||
- OPENSSL_memcmp(plaintext, kPlaintext, plaintext_len) != 0) {
- fprintf(stderr, "OAEP decryption (encrypted data) failed!\n");
- return false;
- }
+ ASSERT_TRUE(RSA_decrypt(key.get(), &plaintext_len, plaintext,
+ sizeof(plaintext), ciphertext, ciphertext_len,
+ RSA_PKCS1_OAEP_PADDING));
+ EXPECT_EQ(Bytes(kPlaintext, kPlaintextLen), Bytes(plaintext, plaintext_len));
// |oaep_ciphertext| should decrypt to |kPlaintext|.
plaintext_len = 0;
- if (!RSA_decrypt(key.get(), &plaintext_len, plaintext, sizeof(plaintext),
- oaep_ciphertext, oaep_ciphertext_len,
- RSA_PKCS1_OAEP_PADDING) ||
- plaintext_len != kPlaintextLen ||
- OPENSSL_memcmp(plaintext, kPlaintext, plaintext_len) != 0) {
- fprintf(stderr, "OAEP decryption (test vector data) failed!\n");
- return false;
- }
+ ASSERT_TRUE(RSA_decrypt(key.get(), &plaintext_len, plaintext,
+ sizeof(plaintext), param.oaep_ciphertext,
+ param.oaep_ciphertext_len, RSA_PKCS1_OAEP_PADDING));
+ EXPECT_EQ(Bytes(kPlaintext, kPlaintextLen), Bytes(plaintext, plaintext_len));
// Try decrypting corrupted ciphertexts.
- OPENSSL_memcpy(ciphertext, oaep_ciphertext, oaep_ciphertext_len);
- for (size_t i = 0; i < oaep_ciphertext_len; i++) {
+ OPENSSL_memcpy(ciphertext, param.oaep_ciphertext, param.oaep_ciphertext_len);
+ for (size_t i = 0; i < param.oaep_ciphertext_len; i++) {
+ SCOPED_TRACE(i);
ciphertext[i] ^= 1;
- if (RSA_decrypt(key.get(), &plaintext_len, plaintext, sizeof(plaintext),
- ciphertext, oaep_ciphertext_len, RSA_PKCS1_OAEP_PADDING)) {
- fprintf(stderr, "Corrupt data decrypted!\n");
- return false;
- }
+ EXPECT_FALSE(RSA_decrypt(
+ key.get(), &plaintext_len, plaintext, sizeof(plaintext), ciphertext,
+ param.oaep_ciphertext_len, RSA_PKCS1_OAEP_PADDING));
ERR_clear_error();
ciphertext[i] ^= 1;
}
// Test truncated ciphertexts.
- for (size_t len = 0; len < oaep_ciphertext_len; len++) {
- if (RSA_decrypt(key.get(), &plaintext_len, plaintext, sizeof(plaintext),
- ciphertext, len, RSA_PKCS1_OAEP_PADDING)) {
- fprintf(stderr, "Corrupt data decrypted!\n");
- return false;
- }
+ for (size_t len = 0; len < param.oaep_ciphertext_len; len++) {
+ SCOPED_TRACE(len);
+ EXPECT_FALSE(RSA_decrypt(key.get(), &plaintext_len, plaintext,
+ sizeof(plaintext), ciphertext, len,
+ RSA_PKCS1_OAEP_PADDING));
ERR_clear_error();
}
-
- return true;
}
-static bool TestMultiPrimeKey(int nprimes, const uint8_t *der, size_t der_size,
- const uint8_t *enc, size_t enc_size) {
- bssl::UniquePtr<RSA> rsa(d2i_RSAPrivateKey(nullptr, &der, der_size));
- if (!rsa) {
- fprintf(stderr, "%d-prime key failed to parse.\n", nprimes);
- ERR_print_errors_fp(stderr);
- return false;
- }
+INSTANTIATE_TEST_CASE_P(, RSAEncryptTest, testing::ValuesIn(kRSAEncryptParams));
+
+struct RSAMultiPrimeParam {
+ const uint8_t *der;
+ size_t der_size;
+ const uint8_t *enc;
+ size_t enc_size;
+} kRSAMultiPrimeParams[] = {
+ {kTwoPrimeKey, sizeof(kTwoPrimeKey) - 1, kTwoPrimeEncryptedMessage,
+ sizeof(kTwoPrimeEncryptedMessage)},
+ {kThreePrimeKey, sizeof(kThreePrimeKey) - 1, kThreePrimeEncryptedMessage,
+ sizeof(kThreePrimeEncryptedMessage)},
+ {kSixPrimeKey, sizeof(kSixPrimeKey) - 1, kSixPrimeEncryptedMessage,
+ sizeof(kSixPrimeEncryptedMessage)},
+};
- if (!RSA_check_key(rsa.get())) {
- fprintf(stderr, "RSA_check_key failed for %d-prime key.\n", nprimes);
- ERR_print_errors_fp(stderr);
- return false;
- }
+class RSAMultiPrimeTest : public testing::TestWithParam<RSAMultiPrimeParam> {};
+
+TEST_P(RSAMultiPrimeTest, TestDecrypt) {
+ const auto &param = GetParam();
+ bssl::UniquePtr<RSA> rsa(
+ RSA_private_key_from_bytes(param.der, param.der_size));
+ ASSERT_TRUE(rsa);
+
+ EXPECT_TRUE(RSA_check_key(rsa.get()));
uint8_t out[256];
size_t out_len;
- if (!RSA_decrypt(rsa.get(), &out_len, out, sizeof(out), enc, enc_size,
- RSA_PKCS1_PADDING) ||
- out_len != 11 ||
- OPENSSL_memcmp(out, "hello world", 11) != 0) {
- fprintf(stderr, "%d-prime key failed to decrypt.\n", nprimes);
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- return true;
+ ASSERT_TRUE(RSA_decrypt(rsa.get(), &out_len, out, sizeof(out), param.enc,
+ param.enc_size, RSA_PKCS1_PADDING));
+ EXPECT_EQ(Bytes("hello world"), Bytes(out, out_len));
}
-static bool TestMultiPrimeKeygen() {
- static const char kMessage[] = "Hello world.";
- static const size_t kBits = 1024;
- uint8_t encrypted[kBits / 8], decrypted[kBits / 8];
- size_t encrypted_len, decrypted_len;
+INSTANTIATE_TEST_CASE_P(, RSAMultiPrimeTest,
+ testing::ValuesIn(kRSAMultiPrimeParams));
+TEST(RSATest, MultiPrimeKeygen) {
bssl::UniquePtr<RSA> rsa(RSA_new());
bssl::UniquePtr<BIGNUM> e(BN_new());
- if (!rsa || !e ||
- !BN_set_word(e.get(), RSA_F4) ||
- !RSA_generate_multi_prime_key(rsa.get(), kBits, 3, e.get(), nullptr) ||
- !RSA_check_key(rsa.get()) ||
- !RSA_encrypt(rsa.get(), &encrypted_len, encrypted, sizeof(encrypted),
- (const uint8_t *)kMessage, sizeof(kMessage),
- RSA_PKCS1_PADDING) ||
- !RSA_decrypt(rsa.get(), &decrypted_len, decrypted, sizeof(decrypted),
- encrypted, encrypted_len, RSA_PKCS1_PADDING) ||
- decrypted_len != sizeof(kMessage) ||
- OPENSSL_memcmp(decrypted, kMessage, sizeof(kMessage)) != 0) {
- ERR_print_errors_fp(stderr);
- return false;
- }
+ ASSERT_TRUE(rsa);
+ ASSERT_TRUE(e);
+ ASSERT_TRUE(BN_set_word(e.get(), RSA_F4));
+
+ // Test key generation.
+ static const size_t kBits = 1024;
+ ASSERT_TRUE(
+ RSA_generate_multi_prime_key(rsa.get(), kBits, 3, e.get(), nullptr));
+ ASSERT_TRUE(RSA_check_key(rsa.get()));
- return true;
+ // Test that the generated key round-trips a message.
+ static const char kMessage[] = "Hello world.";
+ uint8_t encrypted[kBits / 8], decrypted[kBits / 8];
+ size_t encrypted_len, decrypted_len;
+ ASSERT_TRUE(RSA_encrypt(rsa.get(), &encrypted_len, encrypted,
+ sizeof(encrypted), (const uint8_t *)kMessage,
+ sizeof(kMessage), RSA_PKCS1_PADDING));
+ ASSERT_TRUE(RSA_decrypt(rsa.get(), &decrypted_len, decrypted,
+ sizeof(decrypted), encrypted, encrypted_len,
+ RSA_PKCS1_PADDING));
+ EXPECT_EQ(Bytes((const uint8_t *)kMessage, sizeof(kMessage)),
+ Bytes(decrypted, decrypted_len));
}
-static bool TestBadKey() {
+TEST(RSATest, BadKey) {
bssl::UniquePtr<RSA> key(RSA_new());
bssl::UniquePtr<BIGNUM> e(BN_new());
+ ASSERT_TRUE(key);
+ ASSERT_TRUE(e);
+ ASSERT_TRUE(BN_set_word(e.get(), RSA_F4));
- if (!key || !e || !BN_set_word(e.get(), RSA_F4)) {
- return false;
- }
+ // Generate a bad key.
+ ASSERT_TRUE(RSA_generate_key_ex(key.get(), 512, e.get(), nullptr));
+ ASSERT_TRUE(BN_add(key->p, key->p, BN_value_one()));
- if (!RSA_generate_key_ex(key.get(), 512, e.get(), nullptr)) {
- fprintf(stderr, "RSA_generate_key_ex failed.\n");
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- if (!BN_add(key->p, key->p, BN_value_one())) {
- fprintf(stderr, "BN error.\n");
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- if (RSA_check_key(key.get())) {
- fprintf(stderr, "RSA_check_key passed with invalid key!\n");
- return false;
- }
+ // Bad keys are detected.
+ EXPECT_FALSE(RSA_check_key(key.get()));
+ // Bad keys must not be parsed.
uint8_t *der;
size_t der_len;
- if (!RSA_private_key_to_bytes(&der, &der_len, key.get())) {
- fprintf(stderr, "RSA_private_key_to_bytes failed to serialize bad key\n.");
- return false;
- }
+ ASSERT_TRUE(RSA_private_key_to_bytes(&der, &der_len, key.get()));
bssl::UniquePtr<uint8_t> delete_der(der);
-
key.reset(RSA_private_key_from_bytes(der, der_len));
- if (key) {
- fprintf(stderr, "RSA_private_key_from_bytes accepted bad key\n.");
- }
-
- ERR_clear_error();
- return true;
+ EXPECT_FALSE(key);
}
-static bool TestOnlyDGiven() {
+TEST(RSATest, OnlyDGiven) {
static const char kN[] =
"00e77bbf3889d4ef36a9a25d4d69f3f632eb4362214c74517da6d6aeaa9bd09ac42b2662"
"1cd88f3a6eb013772fc3bf9f83914b6467231c630202c35b3e5808c659";
@@ -716,253 +700,134 @@ static bool TestOnlyDGiven() {
"0365db9eb6d73b53b015c40cd8db4de7dd7035c68b5ac1bf786d7a4ee2cea316eaeca21a"
"73ac365e58713195f2ae9849348525ca855386b6d028e437a9495a01";
- uint8_t buf[64];
- unsigned buf_len = sizeof(buf);
bssl::UniquePtr<RSA> key(RSA_new());
- if (!key ||
- !BN_hex2bn(&key->n, kN) ||
- !BN_hex2bn(&key->e, kE) ||
- !BN_hex2bn(&key->d, kD) ||
- RSA_size(key.get()) > sizeof(buf)) {
- return false;
- }
+ ASSERT_TRUE(key);
+ ASSERT_TRUE(BN_hex2bn(&key->n, kN));
+ ASSERT_TRUE(BN_hex2bn(&key->e, kE));
+ ASSERT_TRUE(BN_hex2bn(&key->d, kD));
- if (!RSA_check_key(key.get())) {
- fprintf(stderr, "RSA_check_key failed with only n, d, and e given.\n");
- ERR_print_errors_fp(stderr);
- return false;
- }
+ // Keys with only n, e, and d are functional.
+ EXPECT_TRUE(RSA_check_key(key.get()));
const uint8_t kDummyHash[16] = {0};
-
- if (!RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, &buf_len,
- key.get())) {
- fprintf(stderr, "RSA_sign failed with only n, d, and e given.\n");
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- if (!RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, buf_len,
- key.get())) {
- fprintf(stderr, "RSA_verify failed with only n, d, and e given.\n");
- ERR_print_errors_fp(stderr);
- return false;
- }
+ uint8_t buf[64];
+ unsigned buf_len = sizeof(buf);
+ ASSERT_LE(RSA_size(key.get()), sizeof(buf));
+ EXPECT_TRUE(RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf,
+ &buf_len, key.get()));
+ EXPECT_TRUE(RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf,
+ buf_len, key.get()));
// Keys without the public exponent must continue to work when blinding is
// disabled to support Java's RSAPrivateKeySpec API. See
// https://bugs.chromium.org/p/boringssl/issues/detail?id=12.
bssl::UniquePtr<RSA> key2(RSA_new());
- if (!key2 ||
- !BN_hex2bn(&key2->n, kN) ||
- !BN_hex2bn(&key2->d, kD)) {
- return false;
- }
+ ASSERT_TRUE(key2);
+ ASSERT_TRUE(BN_hex2bn(&key2->n, kN));
+ ASSERT_TRUE(BN_hex2bn(&key2->d, kD));
key2->flags |= RSA_FLAG_NO_BLINDING;
- if (RSA_size(key2.get()) > sizeof(buf)) {
- return false;
- }
-
- if (!RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, &buf_len,
- key2.get())) {
- fprintf(stderr, "RSA_sign failed with only n and d given.\n");
- ERR_print_errors_fp(stderr);
- return false;
- }
+ ASSERT_LE(RSA_size(key2.get()), sizeof(buf));
+ EXPECT_TRUE(RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf,
+ &buf_len, key2.get()));
// Verify the signature with |key|. |key2| has no public exponent.
- if (!RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, buf_len,
- key.get())) {
- fprintf(stderr,
- "Could not verify signature produced from key with only n and d "
- "given.\n");
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- return true;
+ EXPECT_TRUE(RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf,
+ buf_len, key.get()));
}
-static bool TestRecoverCRTParams() {
+TEST(RSATest, RecoverCRTParams) {
bssl::UniquePtr<BIGNUM> e(BN_new());
- if (!e || !BN_set_word(e.get(), RSA_F4)) {
- return false;
- }
+ ASSERT_TRUE(e);
+ ASSERT_TRUE(BN_set_word(e.get(), RSA_F4));
- ERR_clear_error();
+ bssl::UniquePtr<RSA> key1(RSA_new());
+ ASSERT_TRUE(key1);
+ ASSERT_TRUE(RSA_generate_key_ex(key1.get(), 512, e.get(), nullptr));
- for (unsigned i = 0; i < 1; i++) {
- bssl::UniquePtr<RSA> key1(RSA_new());
- if (!key1 ||
- !RSA_generate_key_ex(key1.get(), 512, e.get(), nullptr)) {
- fprintf(stderr, "RSA_generate_key_ex failed.\n");
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- if (!RSA_check_key(key1.get())) {
- fprintf(stderr, "RSA_check_key failed with original key.\n");
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- bssl::UniquePtr<RSA> key2(RSA_new());
- if (!key2) {
- return false;
- }
- key2->n = BN_dup(key1->n);
- key2->e = BN_dup(key1->e);
- key2->d = BN_dup(key1->d);
- if (key2->n == nullptr || key2->e == nullptr || key2->d == nullptr) {
- return false;
- }
-
- if (!RSA_recover_crt_params(key2.get())) {
- fprintf(stderr, "RSA_recover_crt_params failed.\n");
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- uint8_t buf[128];
- unsigned buf_len = sizeof(buf);
- if (RSA_size(key2.get()) > buf_len) {
- return false;
- }
-
- if (!RSA_check_key(key2.get())) {
- fprintf(stderr, "RSA_check_key failed with recovered key.\n");
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- const uint8_t kDummyHash[16] = {0};
- if (!RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, &buf_len,
- key2.get())) {
- fprintf(stderr, "RSA_sign failed with recovered key.\n");
- ERR_print_errors_fp(stderr);
- return false;
- }
-
- if (!RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, buf_len,
- key2.get())) {
- fprintf(stderr, "RSA_verify failed with recovered key.\n");
- ERR_print_errors_fp(stderr);
- return false;
- }
- }
+ EXPECT_TRUE(RSA_check_key(key1.get()));
+
+ // Create a copy of the key without CRT parameters.
+ bssl::UniquePtr<RSA> key2(RSA_new());
+ ASSERT_TRUE(key2);
+ key2->n = BN_dup(key1->n);
+ key2->e = BN_dup(key1->e);
+ key2->d = BN_dup(key1->d);
+ ASSERT_TRUE(key2->n);
+ ASSERT_TRUE(key2->e);
+ ASSERT_TRUE(key2->d);
+
+ ASSERT_TRUE(RSA_recover_crt_params(key2.get()));
- return true;
+ // The recovered RSA parameters should work.
+ EXPECT_TRUE(RSA_check_key(key2.get()));
+
+ uint8_t buf[128];
+ unsigned buf_len = sizeof(buf);
+ ASSERT_LE(RSA_size(key2.get()), buf_len);
+
+ const uint8_t kDummyHash[16] = {0};
+ EXPECT_TRUE(RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf,
+ &buf_len, key2.get()));
+ EXPECT_TRUE(RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf,
+ buf_len, key2.get()));
}
-static bool TestASN1() {
+TEST(RSATest, ASN1) {
// Test that private keys may be decoded.
- bssl::UniquePtr<RSA> rsa(RSA_private_key_from_bytes(kKey1, sizeof(kKey1) - 1));
- if (!rsa) {
- return false;
- }
+ bssl::UniquePtr<RSA> rsa(
+ RSA_private_key_from_bytes(kKey1, sizeof(kKey1) - 1));
+ ASSERT_TRUE(rsa);
// Test that the serialization round-trips.
uint8_t *der;
size_t der_len;
- if (!RSA_private_key_to_bytes(&der, &der_len, rsa.get())) {
- return false;
- }
+ ASSERT_TRUE(RSA_private_key_to_bytes(&der, &der_len, rsa.get()));
bssl::UniquePtr<uint8_t> delete_der(der);
- if (der_len != sizeof(kKey1) - 1 ||
- OPENSSL_memcmp(der, kKey1, der_len) != 0) {
- return false;
- }
+ EXPECT_EQ(Bytes(kKey1, sizeof(kKey1) - 1), Bytes(der, der_len));
// Test that serializing public keys works.
- if (!RSA_public_key_to_bytes(&der, &der_len, rsa.get())) {
- return false;
- }
+ ASSERT_TRUE(RSA_public_key_to_bytes(&der, &der_len, rsa.get()));
delete_der.reset(der);
// Public keys may be parsed back out.
rsa.reset(RSA_public_key_from_bytes(der, der_len));
- if (!rsa || rsa->p != NULL || rsa->q != NULL) {
- return false;
- }
+ ASSERT_TRUE(rsa);
+ EXPECT_FALSE(rsa->p);
+ EXPECT_FALSE(rsa->q);
// Serializing the result round-trips.
uint8_t *der2;
size_t der2_len;
- if (!RSA_public_key_to_bytes(&der2, &der2_len, rsa.get())) {
- return false;
- }
+ ASSERT_TRUE(RSA_public_key_to_bytes(&der2, &der2_len, rsa.get()));
bssl::UniquePtr<uint8_t> delete_der2(der2);
- if (der_len != der2_len || OPENSSL_memcmp(der, der2, der_len) != 0) {
- return false;
- }
+ EXPECT_EQ(Bytes(der, der_len), Bytes(der2, der2_len));
// Public keys cannot be serialized as private keys.
- if (RSA_private_key_to_bytes(&der, &der_len, rsa.get())) {
+ int ok = RSA_private_key_to_bytes(&der, &der_len, rsa.get());
+ if (ok) {
OPENSSL_free(der);
- return false;
}
+ EXPECT_FALSE(ok);
ERR_clear_error();
// Public keys with negative moduli are invalid.
rsa.reset(RSA_public_key_from_bytes(kEstonianRSAKey,
sizeof(kEstonianRSAKey)));
- if (rsa) {
- return false;
- }
+ EXPECT_FALSE(rsa);
ERR_clear_error();
// But |RSA_parse_public_key_buggy| will accept it.
CBS cbs;
CBS_init(&cbs, kEstonianRSAKey, sizeof(kEstonianRSAKey));
rsa.reset(RSA_parse_public_key_buggy(&cbs));
- if (!rsa || CBS_len(&cbs) != 0) {
- return false;
- }
-
- return true;
+ EXPECT_TRUE(rsa);
+ EXPECT_EQ(0u, CBS_len(&cbs));
}
-static bool TestBadExponent() {
- bssl::UniquePtr<RSA> rsa(RSA_public_key_from_bytes(kExponent1RSAKey,
- sizeof(kExponent1RSAKey)));
-
- if (rsa) {
- fprintf(stderr, "kExponent1RSAKey parsed but should have failed.\n");
- return false;
- }
-
+TEST(RSATest, BadExponent) {
+ bssl::UniquePtr<RSA> rsa(
+ RSA_public_key_from_bytes(kExponent1RSAKey, sizeof(kExponent1RSAKey)));
+ EXPECT_FALSE(rsa);
ERR_clear_error();
- return true;
-}
-
-int main(int argc, char *argv[]) {
- CRYPTO_library_init();
-
- if (!TestRSA(kKey1, sizeof(kKey1) - 1, kOAEPCiphertext1,
- sizeof(kOAEPCiphertext1) - 1) ||
- !TestRSA(kKey2, sizeof(kKey2) - 1, kOAEPCiphertext2,
- sizeof(kOAEPCiphertext2) - 1) ||
- !TestRSA(kKey3, sizeof(kKey3) - 1, kOAEPCiphertext3,
- sizeof(kOAEPCiphertext3) - 1) ||
- !TestOnlyDGiven() ||
- !TestRecoverCRTParams() ||
- !TestBadKey() ||
- !TestMultiPrimeKey(2, kTwoPrimeKey, sizeof(kTwoPrimeKey) - 1,
- kTwoPrimeEncryptedMessage,
- sizeof(kTwoPrimeEncryptedMessage)) ||
- !TestMultiPrimeKey(3, kThreePrimeKey, sizeof(kThreePrimeKey) - 1,
- kThreePrimeEncryptedMessage,
- sizeof(kThreePrimeEncryptedMessage)) ||
- !TestMultiPrimeKey(6, kSixPrimeKey, sizeof(kSixPrimeKey) - 1,
- kSixPrimeEncryptedMessage,
- sizeof(kSixPrimeEncryptedMessage)) ||
- !TestMultiPrimeKeygen() ||
- !TestASN1() ||
- !TestBadExponent()) {
- return 1;
- }
-
- printf("PASS\n");
- return 0;
}
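
The rsa_test.cc rewrite above is a mechanical conversion from bool-returning test functions to GTest: fatal setup steps use ASSERT_*, non-fatal checks use EXPECT_*, byte comparisons go through |Bytes|, and the per-key and per-prime-count loops become value-parameterized suites. A minimal sketch of that parameterization pattern follows; the struct and values are illustrative, not from this commit (in the commit the same role is played by kRSAEncryptParams and kRSAMultiPrimeParams):

    #include <gtest/gtest.h>

    // Hypothetical parameter table, standing in for kRSAEncryptParams.
    struct ExampleParam {
      int input;
      int expected_square;
    };

    static const ExampleParam kExampleParams[] = {
        {2, 4},
        {3, 9},
        {-4, 16},
    };

    class ExampleTest : public testing::TestWithParam<ExampleParam> {};

    TEST_P(ExampleTest, Squares) {
      const ExampleParam &param = GetParam();
      // EXPECT_* records a failure and continues; ASSERT_* aborts the test,
      // which is why the setup calls in the tests above use ASSERT_TRUE.
      EXPECT_EQ(param.expected_square, param.input * param.input);
    }

    // The empty first argument matches the convention used above. (Current
    // GTest spells this INSTANTIATE_TEST_SUITE_P.)
    INSTANTIATE_TEST_CASE_P(, ExampleTest, testing::ValuesIn(kExampleParams));
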
diff --git a/src/crypto/sha/asm/sha1-586.pl b/src/crypto/sha/asm/sha1-586.pl
index e815e2b5..acf383d4 100644
--- a/src/crypto/sha/asm/sha1-586.pl
+++ b/src/crypto/sha/asm/sha1-586.pl
@@ -97,10 +97,12 @@
# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73%
# Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53%
# Haswell 6.5 4.3/+51% 4.1(**)/+58%
+# Skylake 6.4 4.1/+55% 4.1(**)/+55%
# Bulldozer 11.6 6.0/+92%
# VIA Nano 10.6 7.5/+41%
# Atom 12.5 9.3(*)/+35%
# Silvermont 14.5 9.9(*)/+46%
+# Goldmont 8.8 6.7/+30% 1.7(***)/+415%
#
# (*) Loop is 1056 instructions long and expected result is ~8.25.
# The discrepancy is because of front-end limitations, so
@@ -108,6 +110,8 @@
# limited parallelism.
#
# (**) As per above comment, the result is for AVX *plus* sh[rl]d.
+#
+# (***) SHAEXT result
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
diff --git a/src/crypto/sha/asm/sha1-x86_64.pl b/src/crypto/sha/asm/sha1-x86_64.pl
index ff960bb9..9a13f6c5 100644..100755
--- a/src/crypto/sha/asm/sha1-x86_64.pl
+++ b/src/crypto/sha/asm/sha1-x86_64.pl
@@ -73,13 +73,16 @@
# Sandy Bridge 7.70 6.10/+26% 4.99/+54%
# Ivy Bridge 6.06 4.67/+30% 4.60/+32%
# Haswell 5.45 4.15/+31% 3.57/+53%
+# Skylake 5.18 4.06/+28% 3.54/+46%
# Bulldozer 9.11 5.95/+53%
# VIA Nano 9.32 7.15/+30%
# Atom 10.3 9.17/+12%
# Silvermont 13.1(*) 9.37/+40%
+# Goldmont 8.13 6.42/+27% 1.70/+380%(**)
#
# (*) obviously suboptimal result, nothing was done about it,
# because SSSE3 code is compiled unconditionally;
+# (**) SHAEXT result
$flavour = shift;
$output = shift;
@@ -246,7 +249,7 @@ sha1_block_data_order:
jz .Lialu
___
$code.=<<___ if ($shaext);
- test \$`1<<29`,%r10d # check SHA bit
+ test \$`1<<29`,%r10d # check SHA bit
jnz _shaext_shortcut
___
$code.=<<___ if ($avx>1);
@@ -444,7 +447,8 @@ my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
my @T=("%esi","%edi");
my $j=0;
my $rx=0;
-my $K_XX_XX="%r11";
+my $K_XX_XX="%r14";
+my $fp="%r11";
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };
@@ -465,7 +469,7 @@ $code.=<<___;
.align 16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
- mov %rsp,%rax
+ mov %rsp,$fp # frame pointer
push %rbx
push %rbp
push %r12
@@ -474,16 +478,15 @@ _ssse3_shortcut:
lea `-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
- movaps %xmm6,-40-6*16(%rax)
- movaps %xmm7,-40-5*16(%rax)
- movaps %xmm8,-40-4*16(%rax)
- movaps %xmm9,-40-3*16(%rax)
- movaps %xmm10,-40-2*16(%rax)
- movaps %xmm11,-40-1*16(%rax)
+ movaps %xmm6,-40-6*16($fp)
+ movaps %xmm7,-40-5*16($fp)
+ movaps %xmm8,-40-4*16($fp)
+ movaps %xmm9,-40-3*16($fp)
+ movaps %xmm10,-40-2*16($fp)
+ movaps %xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
$code.=<<___;
- mov %rax,%r14 # original %rsp
and \$-64,%rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
@@ -890,21 +893,20 @@ $code.=<<___;
mov $E,16($ctx)
___
$code.=<<___ if ($win64);
- movaps -40-6*16(%r14),%xmm6
- movaps -40-5*16(%r14),%xmm7
- movaps -40-4*16(%r14),%xmm8
- movaps -40-3*16(%r14),%xmm9
- movaps -40-2*16(%r14),%xmm10
- movaps -40-1*16(%r14),%xmm11
+ movaps -40-6*16($fp),%xmm6
+ movaps -40-5*16($fp),%xmm7
+ movaps -40-4*16($fp),%xmm8
+ movaps -40-3*16($fp),%xmm9
+ movaps -40-2*16($fp),%xmm10
+ movaps -40-1*16($fp),%xmm11
___
$code.=<<___;
- lea (%r14),%rsi
- mov -40(%rsi),%r14
- mov -32(%rsi),%r13
- mov -24(%rsi),%r12
- mov -16(%rsi),%rbp
- mov -8(%rsi),%rbx
- lea (%rsi),%rsp
+ mov -40($fp),%r14
+ mov -32($fp),%r13
+ mov -24($fp),%r12
+ mov -16($fp),%rbp
+ mov -8($fp),%rbx
+ lea ($fp),%rsp
.Lepilogue_ssse3:
ret
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
@@ -927,7 +929,7 @@ $code.=<<___;
.align 16
sha1_block_data_order_avx:
_avx_shortcut:
- mov %rsp,%rax
+ mov %rsp,$fp
push %rbx
push %rbp
push %r12
@@ -937,16 +939,15 @@ _avx_shortcut:
vzeroupper
___
$code.=<<___ if ($win64);
- vmovaps %xmm6,-40-6*16(%rax)
- vmovaps %xmm7,-40-5*16(%rax)
- vmovaps %xmm8,-40-4*16(%rax)
- vmovaps %xmm9,-40-3*16(%rax)
- vmovaps %xmm10,-40-2*16(%rax)
- vmovaps %xmm11,-40-1*16(%rax)
+ vmovaps %xmm6,-40-6*16($fp)
+ vmovaps %xmm7,-40-5*16($fp)
+ vmovaps %xmm8,-40-4*16($fp)
+ vmovaps %xmm9,-40-3*16($fp)
+ vmovaps %xmm10,-40-2*16($fp)
+ vmovaps %xmm11,-40-1*16($fp)
.Lprologue_avx:
___
$code.=<<___;
- mov %rax,%r14 # original %rsp
and \$-64,%rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
@@ -1254,21 +1255,20 @@ $code.=<<___;
mov $E,16($ctx)
___
$code.=<<___ if ($win64);
- movaps -40-6*16(%r14),%xmm6
- movaps -40-5*16(%r14),%xmm7
- movaps -40-4*16(%r14),%xmm8
- movaps -40-3*16(%r14),%xmm9
- movaps -40-2*16(%r14),%xmm10
- movaps -40-1*16(%r14),%xmm11
+ movaps -40-6*16($fp),%xmm6
+ movaps -40-5*16($fp),%xmm7
+ movaps -40-4*16($fp),%xmm8
+ movaps -40-3*16($fp),%xmm9
+ movaps -40-2*16($fp),%xmm10
+ movaps -40-1*16($fp),%xmm11
___
$code.=<<___;
- lea (%r14),%rsi
- mov -40(%rsi),%r14
- mov -32(%rsi),%r13
- mov -24(%rsi),%r12
- mov -16(%rsi),%rbp
- mov -8(%rsi),%rbx
- lea (%rsi),%rsp
+ mov -40($fp),%r14
+ mov -32($fp),%r13
+ mov -24($fp),%r12
+ mov -16($fp),%rbp
+ mov -8($fp),%rbx
+ lea ($fp),%rsp
.Lepilogue_avx:
ret
.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
@@ -1294,7 +1294,7 @@ $code.=<<___;
.align 16
sha1_block_data_order_avx2:
_avx2_shortcut:
- mov %rsp,%rax
+ mov %rsp,$fp
push %rbx
push %rbp
push %r12
@@ -1304,16 +1304,15 @@ _avx2_shortcut:
___
$code.=<<___ if ($win64);
lea -6*16(%rsp),%rsp
- vmovaps %xmm6,-40-6*16(%rax)
- vmovaps %xmm7,-40-5*16(%rax)
- vmovaps %xmm8,-40-4*16(%rax)
- vmovaps %xmm9,-40-3*16(%rax)
- vmovaps %xmm10,-40-2*16(%rax)
- vmovaps %xmm11,-40-1*16(%rax)
+ vmovaps %xmm6,-40-6*16($fp)
+ vmovaps %xmm7,-40-5*16($fp)
+ vmovaps %xmm8,-40-4*16($fp)
+ vmovaps %xmm9,-40-3*16($fp)
+ vmovaps %xmm10,-40-2*16($fp)
+ vmovaps %xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
$code.=<<___;
- mov %rax,%r14 # original %rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
mov %rdx,$num # reassigned argument
@@ -1733,21 +1732,20 @@ $code.=<<___;
vzeroupper
___
$code.=<<___ if ($win64);
- movaps -40-6*16(%r14),%xmm6
- movaps -40-5*16(%r14),%xmm7
- movaps -40-4*16(%r14),%xmm8
- movaps -40-3*16(%r14),%xmm9
- movaps -40-2*16(%r14),%xmm10
- movaps -40-1*16(%r14),%xmm11
+ movaps -40-6*16($fp),%xmm6
+ movaps -40-5*16($fp),%xmm7
+ movaps -40-4*16($fp),%xmm8
+ movaps -40-3*16($fp),%xmm9
+ movaps -40-2*16($fp),%xmm10
+ movaps -40-1*16($fp),%xmm11
___
$code.=<<___;
- lea (%r14),%rsi
- mov -40(%rsi),%r14
- mov -32(%rsi),%r13
- mov -24(%rsi),%r12
- mov -16(%rsi),%rbp
- mov -8(%rsi),%rbx
- lea (%rsi),%rsp
+ mov -40($fp),%r14
+ mov -32($fp),%r13
+ mov -24($fp),%r12
+ mov -16($fp),%rbp
+ mov -8($fp),%rbx
+ lea ($fp),%rsp
.Lepilogue_avx2:
ret
.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
@@ -1890,15 +1888,13 @@ ssse3_handler:
cmp %r10,%rbx # context->Rip<prologue label
jb .Lcommon_seh_tail
- mov 152($context),%rax # pull context->Rsp
+ mov 208($context),%rax # pull context->R11
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
- mov 232($context),%rax # pull context->R14
-
lea -40-6*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$12,%ecx
diff --git a/src/crypto/sha/asm/sha256-586.pl b/src/crypto/sha/asm/sha256-586.pl
index 8f4311b6..d85004c8 100644
--- a/src/crypto/sha/asm/sha256-586.pl
+++ b/src/crypto/sha/asm/sha256-586.pl
@@ -40,7 +40,7 @@
#
# Performance in clock cycles per processed byte (less is better):
#
-# gcc icc x86 asm(*) SIMD x86_64 asm(**)
+# gcc icc x86 asm(*) SIMD x86_64 asm(**)
# Pentium 46 57 40/38 - -
# PIII 36 33 27/24 - -
# P4 41 38 28 - 17.3
@@ -50,14 +50,17 @@
# Sandy Bridge 25 - 15.9 12.4 11.6
# Ivy Bridge 24 - 15.0 11.4 10.3
# Haswell 22 - 13.9 9.46 7.80
+# Skylake 20 - 14.9 9.50 7.70
# Bulldozer 36 - 27/22 17.0 13.6
# VIA Nano 36 - 25/22 16.8 16.5
# Atom 50 - 30/25 21.9 18.9
# Silvermont 40 - 34/31 22.9 20.6
+# Goldmont 29 - 20 16.3(***)
#
# (*) numbers after slash are for unrolled loop, where applicable;
# (**) x86_64 assembly performance is presented for reference
# purposes, results are best-available;
+# (***) SHAEXT result is 4.1, strangely enough better than 64-bit one;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
@@ -263,7 +266,7 @@ my $suffix=shift;
&mov ($Coff,"ecx");
&mov ($Doff,"edi");
&mov (&DWP(0,"esp"),"ebx"); # magic
- &mov ($E,&DWP(16,"esi"));
+ &mov ($E,&DWP(16,"esi"));
&mov ("ebx",&DWP(20,"esi"));
&mov ("ecx",&DWP(24,"esi"));
&mov ("edi",&DWP(28,"esi"));
@@ -372,7 +375,7 @@ my @AH=($A,$K256);
&xor ($AH[1],"ecx"); # magic
&mov (&DWP(8,"esp"),"ecx");
&mov (&DWP(12,"esp"),"ebx");
- &mov ($E,&DWP(16,"esi"));
+ &mov ($E,&DWP(16,"esi"));
&mov ("ebx",&DWP(20,"esi"));
&mov ("ecx",&DWP(24,"esi"));
&mov ("esi",&DWP(28,"esi"));
diff --git a/src/crypto/sha/asm/sha512-586.pl b/src/crypto/sha/asm/sha512-586.pl
index d0f91010..6d909eda 100644
--- a/src/crypto/sha/asm/sha512-586.pl
+++ b/src/crypto/sha/asm/sha512-586.pl
@@ -25,10 +25,12 @@
# Sandy Bridge 58 - 35 11.9 11.2
# Ivy Bridge 50 - 33 11.5 8.17
# Haswell 46 - 29 11.3 7.66
+# Skylake 40 - 26 13.3 7.25
# Bulldozer 121 - 50 14.0 13.5
# VIA Nano 91 - 52 33 14.7
# Atom 126 - 68 48(***) 14.7
# Silvermont 97 - 58 42(***) 17.5
+# Goldmont 80 - 48 19.5 12.0
#
# (*) whichever best applicable.
# (**) x86_64 assembler performance is presented for reference
@@ -376,7 +378,7 @@ if ($sse2) {
&set_label("16_79_sse2",16);
for ($j=0;$j<2;$j++) { # 2x unroll
- #&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
+ #&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
&movq ("mm5",&QWP(8*(9+16-14),"esp"));
&movq ("mm1","mm7");
&psrlq ("mm7",1);
diff --git a/src/crypto/sha/asm/sha512-armv8.pl b/src/crypto/sha/asm/sha512-armv8.pl
index 75d40431..494e6335 100644
--- a/src/crypto/sha/asm/sha512-armv8.pl
+++ b/src/crypto/sha/asm/sha512-armv8.pl
@@ -18,7 +18,7 @@
# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
# Denver 2.01 10.5 (+26%) 6.70 (+8%)
# X-Gene 20.0 (+100%) 12.8 (+300%(***))
-#
+#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
# (**) The result is a trade-off: it's possible to improve it by
diff --git a/src/crypto/sha/asm/sha512-x86_64.pl b/src/crypto/sha/asm/sha512-x86_64.pl
index 186aa9aa..5716791d 100644..100755
--- a/src/crypto/sha/asm/sha512-x86_64.pl
+++ b/src/crypto/sha/asm/sha512-x86_64.pl
@@ -34,7 +34,7 @@
# level parallelism, on a given CPU implementation in this case.
#
# Special note on Intel EM64T. While Opteron CPU exhibits perfect
-# perfromance ratio of 1.5 between 64- and 32-bit flavors [see above],
+# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
# [currently available] EM64T CPUs apparently are far from it. On the
# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
# sha256_block:-( This is presumably because 64-bit shifts/rotates
@@ -86,12 +86,14 @@
# Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**))
# Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%)
# Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%)
+# Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%)
# Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%)
# VIA Nano 23.0 16.5(+39%) - 14.7 -
# Atom 23.0 18.9(+22%) - 14.7 -
# Silvermont 27.4 20.6(+33%) - 17.5 -
+# Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 -
#
-# (*) whichever best applicable;
+# (*) whichever best applicable, including SHAEXT;
# (**) switch from ror to shrd stands for fair share of improvement;
# (***) execution time is fully determined by remaining integer-only
# part, body_00_15; reducing the amount of SIMD instructions
@@ -284,13 +286,13 @@ $code.=<<___ if ($SZ==4);
jnz .Lssse3_shortcut
___
$code.=<<___;
+ mov %rsp,%rax # copy %rsp
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
shl \$4,%rdx # num*16
sub \$$framesz,%rsp
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -298,7 +300,7 @@ $code.=<<___;
mov $ctx,$_ctx # save ctx, 1st arg
 mov $inp,$_inp # save inp, 2nd arg
mov %rdx,$_end # save end pointer, "3rd" arg
- mov %r11,$_rsp # save copy of %rsp
+ mov %rax,$_rsp # save copy of %rsp
.Lprologue:
mov $SZ*0($ctx),$A
@@ -365,13 +367,13 @@ $code.=<<___;
jb .Lloop
mov $_rsp,%rsi
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue:
ret
.size $func,.-$func
@@ -744,13 +746,13 @@ $code.=<<___;
.align 64
${func}_ssse3:
.Lssse3_shortcut:
+ mov %rsp,%rax # copy %rsp
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
shl \$4,%rdx # num*16
sub \$`$framesz+$win64*16*4`,%rsp
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -758,7 +760,7 @@ ${func}_ssse3:
mov $ctx,$_ctx # save ctx, 1st arg
 mov $inp,$_inp # save inp, 2nd arg
mov %rdx,$_end # save end pointer, "3rd" arg
- mov %r11,$_rsp # save copy of %rsp
+ mov %rax,$_rsp # save copy of %rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
@@ -1065,13 +1067,13 @@ $code.=<<___ if ($win64);
movaps 16*$SZ+80(%rsp),%xmm9
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue_ssse3:
ret
.size ${func}_ssse3,.-${func}_ssse3
@@ -1088,13 +1090,13 @@ $code.=<<___;
.align 64
${func}_xop:
.Lxop_shortcut:
+ mov %rsp,%rax # copy %rsp
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
shl \$4,%rdx # num*16
sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -1102,7 +1104,7 @@ ${func}_xop:
mov $ctx,$_ctx # save ctx, 1st arg
 mov $inp,$_inp # save inp, 2nd arg
mov %rdx,$_end # save end pointer, "3rd" arg
- mov %r11,$_rsp # save copy of %rsp
+ mov %rax,$_rsp # save copy of %rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
@@ -1442,13 +1444,13 @@ $code.=<<___ if ($win64 && $SZ>4);
movaps 16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue_xop:
ret
.size ${func}_xop,.-${func}_xop
@@ -1464,13 +1466,13 @@ $code.=<<___;
.align 64
${func}_avx:
.Lavx_shortcut:
+ mov %rsp,%rax # copy %rsp
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
shl \$4,%rdx # num*16
sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
@@ -1478,7 +1480,7 @@ ${func}_avx:
mov $ctx,$_ctx # save ctx, 1st arg
 mov $inp,$_inp # save inp, 2nd arg
mov %rdx,$_end # save end pointer, "3rd" arg
- mov %r11,$_rsp # save copy of %rsp
+ mov %rax,$_rsp # save copy of %rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
@@ -1750,13 +1752,13 @@ $code.=<<___ if ($win64 && $SZ>4);
movaps 16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue_avx:
ret
.size ${func}_avx,.-${func}_avx
@@ -1766,7 +1768,7 @@ if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
-my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
+my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
my $PUSH8=8*2*$SZ;
use integer;
@@ -1815,13 +1817,13 @@ $code.=<<___;
.align 64
${func}_avx2:
.Lavx2_shortcut:
+ mov %rsp,%rax # copy %rsp
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
shl \$4,%rdx # num*16
and \$-256*$SZ,%rsp # align stack frame
@@ -1830,7 +1832,7 @@ ${func}_avx2:
mov $ctx,$_ctx # save ctx, 1st arg
 mov $inp,$_inp # save inp, 2nd arg
mov %rdx,$_end # save end pointer, "3rd" arg
- mov %r11,$_rsp # save copy of %rsp
+ mov %rax,$_rsp # save copy of %rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
@@ -2124,13 +2126,13 @@ $code.=<<___ if ($win64 && $SZ>4);
movaps 16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue_avx2:
ret
.size ${func}_avx2,.-${func}_avx2
@@ -2192,7 +2194,6 @@ ___
$code.=<<___;
mov %rax,%rsi # put aside Rsp
mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
- lea 48(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
diff --git a/src/crypto/test/test_util.h b/src/crypto/test/test_util.h
index d834973e..1447bf69 100644
--- a/src/crypto/test/test_util.h
+++ b/src/crypto/test/test_util.h
@@ -18,6 +18,7 @@
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
+#include <string.h>
#include <iosfwd>
@@ -34,6 +35,9 @@ struct Bytes {
Bytes(const uint8_t *data_arg, size_t len_arg)
: data(data_arg), len(len_arg) {}
+ Bytes(const char *str)
+ : data(reinterpret_cast<const uint8_t *>(str)), len(strlen(str)) {}
+
template <size_t N>
Bytes(const uint8_t (&array)[N]) : data(array), len(N) {}
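
The new constructor measures the string with strlen, so a literal like Bytes("hello world") covers exactly the characters before the terminating NUL; this is what lets the multi-prime decrypt test above compare its output against a string literal directly. A small sketch of the resulting behavior (it assumes the |Bytes| helper and its operator== from src/crypto/test/test_util.h are in scope):

    #include <stdint.h>

    #include <gtest/gtest.h>

    TEST(BytesTest, StringLiteralExcludesNul) {
      static const uint8_t kData[] = {'h', 'i'};
      // Bytes("hi") has len == strlen("hi") == 2, so the NUL is not compared.
      EXPECT_EQ(Bytes("hi"), Bytes(kData, sizeof(kData)));
    }
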
diff --git a/src/crypto/x509/x_name.c b/src/crypto/x509/x_name.c
index f97081dc..4abdc916 100644
--- a/src/crypto/x509/x_name.c
+++ b/src/crypto/x509/x_name.c
@@ -229,12 +229,11 @@ static int x509_name_ex_d2i(ASN1_VALUE **val,
if (*val)
x509_name_ex_free(val, NULL);
+ if (!x509_name_ex_new(&nm.a, NULL))
+ goto err;
/* We've decoded it: now cache encoding */
- if (!x509_name_ex_new(&nm.a, NULL) || !BUF_MEM_grow(nm.x->bytes, p - q)) {
- sk_STACK_OF_X509_NAME_ENTRY_pop_free(intname.s,
- local_sk_X509_NAME_ENTRY_pop_free);
+ if (!BUF_MEM_grow(nm.x->bytes, p - q))
goto err;
- }
OPENSSL_memcpy(nm.x->bytes->data, q, p - q);
/* Convert internal representation to X509_NAME structure */
@@ -245,13 +244,14 @@ static int x509_name_ex_d2i(ASN1_VALUE **val,
entry->set = i;
if (!sk_X509_NAME_ENTRY_push(nm.x->entries, entry))
goto err;
+ sk_X509_NAME_ENTRY_set(entries, j, NULL);
}
- sk_X509_NAME_ENTRY_free(entries);
}
- sk_STACK_OF_X509_NAME_ENTRY_free(intname.s);
ret = x509_name_canon(nm.x);
if (!ret)
goto err;
+ sk_STACK_OF_X509_NAME_ENTRY_pop_free(intname.s,
+ local_sk_X509_NAME_ENTRY_free);
nm.x->modified = 0;
*val = nm.a;
*in = p;
@@ -259,6 +259,8 @@ static int x509_name_ex_d2i(ASN1_VALUE **val,
err:
if (nm.x != NULL)
X509_NAME_free(nm.x);
+ sk_STACK_OF_X509_NAME_ENTRY_pop_free(intname.s,
+ local_sk_X509_NAME_ENTRY_pop_free);
OPENSSL_PUT_ERROR(X509, ERR_R_ASN1_LIB);
return 0;
}
@@ -307,8 +309,10 @@ static int x509_name_encode(X509_NAME *a)
entries = sk_X509_NAME_ENTRY_new_null();
if (!entries)
goto memerr;
- if (!sk_STACK_OF_X509_NAME_ENTRY_push(intname.s, entries))
+ if (!sk_STACK_OF_X509_NAME_ENTRY_push(intname.s, entries)) {
+ sk_X509_NAME_ENTRY_free(entries);
goto memerr;
+ }
set = entry->set;
}
if (!sk_X509_NAME_ENTRY_push(entries, entry))
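
Both x_name.c hunks fix the same class of leak: an OpenSSL stack only takes ownership of an element when the push succeeds, and elements handed off to another structure must be detached (set to NULL) before the old stack is freed with pop_free. A sketch of the rule, using a hypothetical helper rather than the actual x509_name_encode code:

    #include <openssl/x509.h>

    // On push failure the caller still owns |entry|, exactly as in the
    // x509_name_encode fix above (which frees the inner |entries| stack).
    static int PushEntryOrFree(STACK_OF(X509_NAME_ENTRY) *entries,
                               X509_NAME_ENTRY *entry) {
      if (!sk_X509_NAME_ENTRY_push(entries, entry)) {
        X509_NAME_ENTRY_free(entry);  // push failed; ownership stayed here
        return 0;
      }
      return 1;  // the stack now owns |entry|
    }
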
diff --git a/src/include/openssl/ecdsa.h b/src/include/openssl/ecdsa.h
index 38907447..8a158b87 100644
--- a/src/include/openssl/ecdsa.h
+++ b/src/include/openssl/ecdsa.h
@@ -75,7 +75,7 @@ extern "C" {
* zero otherwise. */
OPENSSL_EXPORT int ECDSA_sign(int type, const uint8_t *digest,
size_t digest_len, uint8_t *sig,
- unsigned int *sig_len, EC_KEY *key);
+ unsigned int *sig_len, const EC_KEY *key);
/* ECDSA_verify verifies that |sig_len| bytes from |sig| constitute a valid
* signature by |key| of |digest|. (The |type| argument should be zero.) It
@@ -83,7 +83,7 @@ OPENSSL_EXPORT int ECDSA_sign(int type, const uint8_t *digest,
* occurred. */
OPENSSL_EXPORT int ECDSA_verify(int type, const uint8_t *digest,
size_t digest_len, const uint8_t *sig,
- size_t sig_len, EC_KEY *key);
+ size_t sig_len, const EC_KEY *key);
/* ECDSA_size returns the maximum size of an ECDSA signature using |key|. It
* returns zero on error. */
@@ -109,13 +109,13 @@ OPENSSL_EXPORT void ECDSA_SIG_free(ECDSA_SIG *sig);
/* ECDSA_do_sign signs |digest_len| bytes from |digest| with |key| and returns
* the resulting signature structure, or NULL on error. */
OPENSSL_EXPORT ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest,
- size_t digest_len, EC_KEY *key);
+ size_t digest_len, const EC_KEY *key);
/* ECDSA_do_verify verifies that |sig| constitutes a valid signature by |key|
* of |digest|. It returns one on success or zero if the signature is invalid
* or on error. */
OPENSSL_EXPORT int ECDSA_do_verify(const uint8_t *digest, size_t digest_len,
- const ECDSA_SIG *sig, EC_KEY *key);
+ const ECDSA_SIG *sig, const EC_KEY *key);
/* Signing with precomputation.
@@ -128,22 +128,22 @@ OPENSSL_EXPORT int ECDSA_do_verify(const uint8_t *digest, size_t digest_len,
/* ECDSA_sign_setup precomputes parts of an ECDSA signing operation. It sets
* |*kinv| and |*rp| to the precomputed values and uses the |ctx| argument, if
* not NULL. It returns one on success and zero otherwise. */
-OPENSSL_EXPORT int ECDSA_sign_setup(EC_KEY *eckey, BN_CTX *ctx, BIGNUM **kinv,
- BIGNUM **rp);
+OPENSSL_EXPORT int ECDSA_sign_setup(const EC_KEY *eckey, BN_CTX *ctx,
+ BIGNUM **kinv, BIGNUM **rp);
/* ECDSA_do_sign_ex is the same as |ECDSA_do_sign| but takes precomputed values
* as generated by |ECDSA_sign_setup|. */
OPENSSL_EXPORT ECDSA_SIG *ECDSA_do_sign_ex(const uint8_t *digest,
size_t digest_len,
const BIGNUM *kinv, const BIGNUM *rp,
- EC_KEY *eckey);
+ const EC_KEY *eckey);
/* ECDSA_sign_ex is the same as |ECDSA_sign| but takes precomputed values as
* generated by |ECDSA_sign_setup|. */
OPENSSL_EXPORT int ECDSA_sign_ex(int type, const uint8_t *digest,
size_t digest_len, uint8_t *sig,
unsigned int *sig_len, const BIGNUM *kinv,
- const BIGNUM *rp, EC_KEY *eckey);
+ const BIGNUM *rp, const EC_KEY *eckey);
/* ASN.1 functions. */
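
Const-qualifying the EC_KEY arguments is purely an API tightening: signing and verification never mutate the key, so callers may now pass shared, read-only keys without a cast. A round-trip sketch against the updated prototypes (error handling trimmed to the essentials; the buffer size is checked against ECDSA_size as the header above requires):

    #include <stdint.h>

    #include <openssl/ec_key.h>
    #include <openssl/ecdsa.h>

    static bool SignAndVerify(const EC_KEY *key, const uint8_t digest[32]) {
      uint8_t sig[512];
      unsigned sig_len = sizeof(sig);
      if (ECDSA_size(key) > sizeof(sig) ||
          !ECDSA_sign(0, digest, 32, sig, &sig_len, key)) {
        return false;
      }
      // With the const-qualified prototypes, |key| needs no cast here.
      return ECDSA_verify(0, digest, 32, sig, sig_len, key) == 1;
    }
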
diff --git a/src/include/openssl/pkcs8.h b/src/include/openssl/pkcs8.h
index 141ed8d0..70d6f495 100644
--- a/src/include/openssl/pkcs8.h
+++ b/src/include/openssl/pkcs8.h
@@ -66,45 +66,42 @@ extern "C" {
#endif
-/* PKCS8_encrypt_pbe serializes and encrypts a PKCS8_PRIV_KEY_INFO with PBES1 or
+/* PKCS8_encrypt serializes and encrypts a PKCS8_PRIV_KEY_INFO with PBES1 or
* PBES2 as defined in PKCS #5. Only pbeWithSHAAnd128BitRC4,
* pbeWithSHAAnd3-KeyTripleDES-CBC and pbeWithSHA1And40BitRC2, defined in PKCS
* #12, and PBES2, are supported. PBES2 is selected by setting |cipher| and
* passing -1 for |pbe_nid|. Otherwise, PBES1 is used and |cipher| is ignored.
*
- * The |pass_raw_len| bytes pointed to by |pass_raw| are used as the password.
- * Note that any conversions from the password as supplied in a text string
- * (such as those specified in B.1 of PKCS #12) must be performed by the caller.
+ * |pass| is used as the password. If a PBES1 scheme from PKCS #12 is used, this
+ * will be converted to a raw byte string as specified in B.1 of PKCS #12. If
+ * |pass| is NULL, it will be encoded as the empty byte string rather than two
+ * zero bytes, the PKCS #12 encoding of the empty string.
*
* If |salt| is NULL, a random salt of |salt_len| bytes is generated. If
* |salt_len| is zero, a default salt length is used instead.
*
- * The resulting structure is stored in an X509_SIG which must be freed by the
- * caller.
- *
- * TODO(davidben): Really? An X509_SIG? OpenSSL probably did that because it has
- * the same structure as EncryptedPrivateKeyInfo. */
-OPENSSL_EXPORT X509_SIG *PKCS8_encrypt_pbe(int pbe_nid,
- const EVP_CIPHER *cipher,
- const uint8_t *pass_raw,
- size_t pass_raw_len,
- const uint8_t *salt, size_t salt_len,
- int iterations,
- PKCS8_PRIV_KEY_INFO *p8inf);
+ * The resulting structure is stored in an |X509_SIG| which must be freed by the
+ * caller. */
+OPENSSL_EXPORT X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher,
+ const char *pass, int pass_len,
+ const uint8_t *salt, size_t salt_len,
+ int iterations,
+ PKCS8_PRIV_KEY_INFO *p8inf);
-/* PKCS8_decrypt_pbe decrypts and decodes a PKCS8_PRIV_KEY_INFO with PBES1 or
- * PBES2 as defined in PKCS #5. Only pbeWithSHAAnd128BitRC4,
+/* PKCS8_decrypt decrypts and decodes a PKCS8_PRIV_KEY_INFO with PBES1 or PBES2
+ * as defined in PKCS #5. Only pbeWithSHAAnd128BitRC4,
 * pbeWithSHAAnd3-KeyTripleDES-CBC and pbeWithSHA1And40BitRC2, defined in PKCS
 * #12, and PBES2, are supported.
*
- * The |pass_raw_len| bytes pointed to by |pass_raw| are used as the password.
- * Note that any conversions from the password as supplied in a text string
- * (such as those specified in B.1 of PKCS #12) must be performed by the caller.
+ * |pass| is used as the password. If a PBES1 scheme from PKCS #12 is used, this
+ * will be converted to a raw byte string as specified in B.1 of PKCS #12. If
+ * |pass| is NULL, it will be encoded as the empty byte string rather than two
+ * zero bytes, the PKCS #12 encoding of the empty string.
*
* The resulting structure must be freed by the caller. */
-OPENSSL_EXPORT PKCS8_PRIV_KEY_INFO *PKCS8_decrypt_pbe(X509_SIG *pkcs8,
- const uint8_t *pass_raw,
- size_t pass_raw_len);
+OPENSSL_EXPORT PKCS8_PRIV_KEY_INFO *PKCS8_decrypt(X509_SIG *pkcs8,
+ const char *pass,
+ int pass_len);
/* PKCS12_get_key_and_certs parses a PKCS#12 structure from |in|, authenticates
* and decrypts it using |password|, sets |*out_key| to the included private
@@ -117,24 +114,6 @@ OPENSSL_EXPORT int PKCS12_get_key_and_certs(EVP_PKEY **out_key,
/* Deprecated functions. */
-/* PKCS8_encrypt calls |PKCS8_encrypt_pbe| after (in the PKCS#12 case) treating
- * |pass| as an ASCII string, appending U+0000, and converting to UCS-2. (So the
- * empty password encodes as two NUL bytes.) In the PBES2 case, the password is
- * unchanged. */
-OPENSSL_EXPORT X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher,
- const char *pass, int pass_len,
- const uint8_t *salt, size_t salt_len,
- int iterations,
- PKCS8_PRIV_KEY_INFO *p8inf);
-
-/* PKCS8_decrypt calls PKCS8_decrypt_pbe after (in the PKCS#12 case) treating
- * |pass| as an ASCII string, appending U+0000, and converting to UCS-2. (So the
- * empty password encodes as two NUL bytes.) In the PBES2 case, the password is
- * unchanged. */
-OPENSSL_EXPORT PKCS8_PRIV_KEY_INFO *PKCS8_decrypt(X509_SIG *pkcs8,
- const char *pass,
- int pass_len);
-
/* PKCS12_PBE_add does nothing. It exists for compatibility with OpenSSL. */
OPENSSL_EXPORT void PKCS12_PBE_add(void);
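
With the raw-password variants removed, PKCS8_encrypt and PKCS8_decrypt become the primary API and perform the PKCS #12 B.1 password conversion internally when a PBES1 scheme is selected. A PBES2 encryption sketch against the new prototype (the cipher choice and iteration count are illustrative, not mandated by the header):

    #include <string.h>

    #include <openssl/cipher.h>
    #include <openssl/pkcs8.h>

    static X509_SIG *EncryptKey(PKCS8_PRIV_KEY_INFO *p8inf, const char *pass) {
      // A pbe_nid of -1 selects PBES2, so |cipher| is honored; salt == NULL
      // with salt_len == 0 requests a random salt of the default length.
      return PKCS8_encrypt(-1, EVP_aes_128_cbc(), pass, (int)strlen(pass),
                           /*salt=*/NULL, /*salt_len=*/0, /*iterations=*/4096,
                           p8inf);
    }
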
diff --git a/src/include/openssl/ssl.h b/src/include/openssl/ssl.h
index 497093db..23e5e9b5 100644
--- a/src/include/openssl/ssl.h
+++ b/src/include/openssl/ssl.h
@@ -2241,11 +2241,11 @@ OPENSSL_EXPORT void SSL_CTX_set_cert_verify_callback(
/* SSL_enable_signed_cert_timestamps causes |ssl| (which must be the client end
* of a connection) to request SCTs from the server. See
- * https://tools.ietf.org/html/rfc6962. It returns one.
+ * https://tools.ietf.org/html/rfc6962.
*
* Call |SSL_get0_signed_cert_timestamp_list| to recover the SCT after the
* handshake. */
-OPENSSL_EXPORT int SSL_enable_signed_cert_timestamps(SSL *ssl);
+OPENSSL_EXPORT void SSL_enable_signed_cert_timestamps(SSL *ssl);
/* SSL_CTX_enable_signed_cert_timestamps enables SCT requests on all client SSL
* objects created from |ctx|.
@@ -2255,12 +2255,11 @@ OPENSSL_EXPORT int SSL_enable_signed_cert_timestamps(SSL *ssl);
OPENSSL_EXPORT void SSL_CTX_enable_signed_cert_timestamps(SSL_CTX *ctx);
/* SSL_enable_ocsp_stapling causes |ssl| (which must be the client end of a
- * connection) to request a stapled OCSP response from the server. It returns
- * one.
+ * connection) to request a stapled OCSP response from the server.
*
* Call |SSL_get0_ocsp_response| to recover the OCSP response after the
* handshake. */
-OPENSSL_EXPORT int SSL_enable_ocsp_stapling(SSL *ssl);
+OPENSSL_EXPORT void SSL_enable_ocsp_stapling(SSL *ssl);
/* SSL_CTX_enable_ocsp_stapling enables OCSP stapling on all client SSL objects
* created from |ctx|.
@@ -3043,7 +3042,6 @@ OPENSSL_EXPORT void SSL_CTX_set_dos_protection_cb(
#define SSL_ST_OK 0x03
#define SSL_ST_RENEGOTIATE (0x04 | SSL_ST_INIT)
#define SSL_ST_TLS13 (0x05 | SSL_ST_INIT)
-#define SSL_ST_ERROR (0x06| SSL_ST_INIT)
/* SSL_CB_* are possible values for the |type| parameter in the info
* callback and the bitmasks that make them up. */
@@ -3086,8 +3084,7 @@ OPENSSL_EXPORT void SSL_CTX_set_dos_protection_cb(
*
* |SSL_CB_ACCEPT_LOOP| (respectively, |SSL_CB_CONNECT_LOOP|) is signaled when
* a server (respectively, client) handshake progresses. The |value| argument
- * is always one. For the duration of the callback, |SSL_state| will return the
- * previous state.
+ * is always one.
*
* |SSL_CB_ACCEPT_EXIT| (respectively, |SSL_CB_CONNECT_EXIT|) is signaled when
* a server (respectively, client) handshake completes, fails, or is paused.
@@ -3589,7 +3586,10 @@ OPENSSL_EXPORT const char *SSL_alert_desc_string(int value);
typedef struct ssl_conf_ctx_st SSL_CONF_CTX;
-/* SSL_state returns the current state of the handshake state machine. */
+/* SSL_state returns |SSL_ST_INIT| if a handshake is in progress and |SSL_ST_OK|
+ * otherwise.
+ *
+ * Use |SSL_is_init| instead. */
OPENSSL_EXPORT int SSL_state(const SSL *ssl);
#define SSL_get_state(ssl) SSL_state(ssl)
@@ -3805,6 +3805,12 @@ struct ssl_session_st {
* early data. If zero, 0-RTT is disallowed. */
uint32_t ticket_max_early_data;
+ /* early_alpn is the ALPN protocol from the initial handshake. This is only
+ * stored for TLS 1.3 and above in order to enforce ALPN matching for 0-RTT
+ * resumptions. */
+ uint8_t *early_alpn;
+ size_t early_alpn_len;
+
/* extended_master_secret is true if the master secret in this session was
* generated using EMS and thus isn't vulnerable to the Triple Handshake
* attack. */
@@ -3965,8 +3971,6 @@ struct ssl_ctx_st {
void *msg_callback_arg;
int verify_mode;
- uint8_t sid_ctx_length;
- uint8_t sid_ctx[SSL_MAX_SID_CTX_LENGTH];
int (*default_verify_callback)(
int ok, X509_STORE_CTX *ctx); /* called 'verify_callback' in the SSL */
@@ -4061,12 +4065,6 @@ struct ssl_ctx_st {
/* The client's Channel ID private key. */
EVP_PKEY *tlsext_channel_id_private;
- /* Signed certificate timestamp list to be sent to the client, if requested */
- CRYPTO_BUFFER *signed_cert_timestamp_list;
-
- /* OCSP response to be sent to the client, if requested. */
- CRYPTO_BUFFER *ocsp_response;
-
/* keylog_callback, if not NULL, is the key logging callback. See
* |SSL_CTX_set_keylog_callback|. */
void (*keylog_callback)(const SSL *ssl, const char *line);
@@ -4107,9 +4105,6 @@ struct ssl_ctx_st {
/* short_header_enabled is one if a short record header in TLS 1.3 may
* be negotiated and zero otherwise. */
unsigned short_header_enabled:1;
-
- /* TODO(agl): remove once node.js no longer references this. */
- int freelist_max_len;
};
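
The SCT and OCSP request functions now return void because they cannot fail, matching their SSL_CTX_* counterparts, and the server-side response buffers leave the public ssl_ctx_st definition in the same cleanup. A client-side sketch of requesting both values and reading them back after the handshake:

    #include <openssl/ssl.h>

    // Before the handshake: request an SCT list and a stapled OCSP response.
    static void RequestCTAndOCSP(SSL *ssl) {
      SSL_enable_signed_cert_timestamps(ssl);
      SSL_enable_ocsp_stapling(ssl);
    }

    // After the handshake: either output is empty if the server sent nothing.
    static void ReadResponses(const SSL *ssl) {
      const uint8_t *sct_list, *ocsp;
      size_t sct_list_len, ocsp_len;
      SSL_get0_signed_cert_timestamp_list(ssl, &sct_list, &sct_list_len);
      SSL_get0_ocsp_response(ssl, &ocsp, &ocsp_len);
    }
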
diff --git a/src/include/openssl/ssl3.h b/src/include/openssl/ssl3.h
index 6a03d1be..fcaeb2df 100644
--- a/src/include/openssl/ssl3.h
+++ b/src/include/openssl/ssl3.h
@@ -307,6 +307,7 @@ OPENSSL_COMPILE_ASSERT(
#define SSL3_ST_CW_FLUSH (0x100 | SSL_ST_CONNECT)
#define SSL3_ST_FALSE_START (0x101 | SSL_ST_CONNECT)
#define SSL3_ST_VERIFY_SERVER_CERT (0x102 | SSL_ST_CONNECT)
+#define SSL3_ST_FINISH_CLIENT_HANDSHAKE (0x103 | SSL_ST_CONNECT)
/* write to server */
#define SSL3_ST_CW_CLNT_HELLO_A (0x110 | SSL_ST_CONNECT)
/* read from server */
diff --git a/src/include/openssl/time_support.h b/src/include/openssl/time_support.h
deleted file mode 100644
index 274b17d1..00000000
--- a/src/include/openssl/time_support.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Written by Richard Levitte (richard@levitte.org) for the OpenSSL
- * project 2001.
- * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
- * project 2008.
- */
-/* ====================================================================
- * Copyright (c) 2001 The OpenSSL Project. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- * software must display the following acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- * endorse or promote products derived from this software without
- * prior written permission. For written permission, please contact
- * licensing@OpenSSL.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- * nor may "OpenSSL" appear in their names without prior written
- * permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- * acknowledgment:
- * "This product includes software developed by the OpenSSL Project
- * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- * This product includes cryptographic software written by Eric Young
- * (eay@cryptsoft.com). This product includes software written by Tim
- * Hudson (tjh@cryptsoft.com). */
-
-#ifndef OPENSSL_HEADER_TIME_SUPPORT_H
-#define OPENSSL_HEADER_TIME_SUPPORT_H
-
-#include <openssl/base.h>
-
-#include <time.h>
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-
-/* Wrapper functions for time functions. */
-
-
-/* OPENSSL_gmtime wraps |gmtime_r|. See the manual page for that function. */
-struct tm *OPENSSL_gmtime(const time_t *timer, struct tm *result);
-
-/* OPENSSL_gmtime_adj updates |tm| by adding |offset_day| days and |offset_sec|
- * seconds. */
-int OPENSSL_gmtime_adj(struct tm *tm, int offset_day, long offset_sec);
-
-/* OPENSSL_gmtime_diff calculates the difference between |from| and |to| and
- * outputs the difference as a number of days and seconds in |*out_days| and
- * |*out_secs|. */
-int OPENSSL_gmtime_diff(int *out_days, int *out_secs, const struct tm *from,
- const struct tm *to);
-
-
-#if defined(__cplusplus)
-} /* extern C */
-#endif
-
-#endif /* OPENSSL_HEADER_TIME_SUPPORT_H */
diff --git a/src/ssl/handshake_client.c b/src/ssl/handshake_client.c
index 427213c2..c4f5e8e9 100644
--- a/src/ssl/handshake_client.c
+++ b/src/ssl/handshake_client.c
@@ -190,21 +190,15 @@ static int ssl3_get_new_session_ticket(SSL_HANDSHAKE *hs);
int ssl3_connect(SSL_HANDSHAKE *hs) {
SSL *const ssl = hs->ssl;
int ret = -1;
- int state, skip = 0;
assert(ssl->handshake_func == ssl3_connect);
assert(!ssl->server);
for (;;) {
- state = hs->state;
+ int state = hs->state;
switch (hs->state) {
case SSL_ST_INIT:
- hs->state = SSL_ST_CONNECT;
- skip = 1;
- break;
-
- case SSL_ST_CONNECT:
ssl_do_info_callback(ssl, SSL_CB_HANDSHAKE_START, 1);
hs->state = SSL3_ST_CW_CLNT_HELLO_A;
break;
@@ -254,13 +248,11 @@ int ssl3_connect(SSL_HANDSHAKE *hs) {
break;
case SSL3_ST_CR_CERT_A:
- if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+ if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
ret = ssl3_get_server_certificate(hs);
if (ret <= 0) {
goto end;
}
- } else {
- skip = 1;
}
hs->state = SSL3_ST_CR_CERT_STATUS_A;
break;
@@ -271,20 +263,16 @@ int ssl3_connect(SSL_HANDSHAKE *hs) {
if (ret <= 0) {
goto end;
}
- } else {
- skip = 1;
}
hs->state = SSL3_ST_VERIFY_SERVER_CERT;
break;
case SSL3_ST_VERIFY_SERVER_CERT:
- if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+ if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
ret = ssl3_verify_server_cert(hs);
if (ret <= 0) {
goto end;
}
- } else {
- skip = 1;
}
hs->state = SSL3_ST_CR_KEY_EXCH_A;
break;
@@ -298,13 +286,11 @@ int ssl3_connect(SSL_HANDSHAKE *hs) {
break;
case SSL3_ST_CR_CERT_REQ_A:
- if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+ if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
ret = ssl3_get_certificate_request(hs);
if (ret <= 0) {
goto end;
}
- } else {
- skip = 1;
}
hs->state = SSL3_ST_CR_SRVR_DONE_A;
break;
@@ -324,8 +310,6 @@ int ssl3_connect(SSL_HANDSHAKE *hs) {
if (ret <= 0) {
goto end;
}
- } else {
- skip = 1;
}
hs->state = SSL3_ST_CW_KEY_EXCH_A;
break;
@@ -345,8 +329,6 @@ int ssl3_connect(SSL_HANDSHAKE *hs) {
if (ret <= 0) {
goto end;
}
- } else {
- skip = 1;
}
hs->state = SSL3_ST_CW_CHANGE;
break;
@@ -367,8 +349,6 @@ int ssl3_connect(SSL_HANDSHAKE *hs) {
if (ret <= 0) {
goto end;
}
- } else {
- skip = 1;
}
hs->state = SSL3_ST_CW_CHANNEL_ID_A;
break;
@@ -379,8 +359,6 @@ int ssl3_connect(SSL_HANDSHAKE *hs) {
if (ret <= 0) {
goto end;
}
- } else {
- skip = 1;
}
hs->state = SSL3_ST_CW_FINISHED_A;
break;
@@ -393,7 +371,7 @@ int ssl3_connect(SSL_HANDSHAKE *hs) {
hs->state = SSL3_ST_CW_FLUSH;
if (ssl->session != NULL) {
- hs->next_state = SSL_ST_OK;
+ hs->next_state = SSL3_ST_FINISH_CLIENT_HANDSHAKE;
} else {
/* This is a non-resumption handshake. If it involves ChannelID, then
* record the handshake hashes at this point in the session so that
@@ -427,8 +405,6 @@ int ssl3_connect(SSL_HANDSHAKE *hs) {
if (ret <= 0) {
goto end;
}
- } else {
- skip = 1;
}
hs->state = SSL3_ST_CR_CHANGE;
break;
@@ -456,7 +432,7 @@ int ssl3_connect(SSL_HANDSHAKE *hs) {
if (ssl->session != NULL) {
hs->state = SSL3_ST_CW_CHANGE;
} else {
- hs->state = SSL_ST_OK;
+ hs->state = SSL3_ST_FINISH_CLIENT_HANDSHAKE;
}
break;
@@ -466,7 +442,7 @@ int ssl3_connect(SSL_HANDSHAKE *hs) {
goto end;
}
hs->state = hs->next_state;
- if (hs->state != SSL_ST_OK) {
+ if (hs->state != SSL3_ST_FINISH_CLIENT_HANDSHAKE) {
ssl->method->expect_flight(ssl);
}
break;
@@ -476,10 +452,10 @@ int ssl3_connect(SSL_HANDSHAKE *hs) {
if (ret <= 0) {
goto end;
}
- hs->state = SSL_ST_OK;
+ hs->state = SSL3_ST_FINISH_CLIENT_HANDSHAKE;
break;
- case SSL_ST_OK:
+ case SSL3_ST_FINISH_CLIENT_HANDSHAKE:
ssl->method->release_current_message(ssl, 1 /* free_buffer */);
SSL_SESSION_free(ssl->s3->established_session);
@@ -491,21 +467,21 @@ int ssl3_connect(SSL_HANDSHAKE *hs) {
* of the new established_session due to False Start. The caller may
* have taken a reference to the temporary session. */
ssl->s3->established_session =
- SSL_SESSION_dup(ssl->s3->new_session, SSL_SESSION_DUP_ALL);
+ SSL_SESSION_dup(hs->new_session, SSL_SESSION_DUP_ALL);
if (ssl->s3->established_session == NULL) {
- /* Do not stay in SSL_ST_OK, to avoid confusing |SSL_in_init|
- * callers. */
- hs->state = SSL_ST_ERROR;
- skip = 1;
ret = -1;
goto end;
}
ssl->s3->established_session->not_resumable = 0;
- SSL_SESSION_free(ssl->s3->new_session);
- ssl->s3->new_session = NULL;
+ SSL_SESSION_free(hs->new_session);
+ hs->new_session = NULL;
}
+ hs->state = SSL_ST_OK;
+ break;
+
+ case SSL_ST_OK: {
const int is_initial_handshake = !ssl->s3->initial_handshake_complete;
ssl->s3->initial_handshake_complete = 1;
if (is_initial_handshake) {
@@ -516,11 +492,7 @@ int ssl3_connect(SSL_HANDSHAKE *hs) {
ret = 1;
ssl_do_info_callback(ssl, SSL_CB_HANDSHAKE_DONE, 1);
goto end;
-
- case SSL_ST_ERROR:
- OPENSSL_PUT_ERROR(SSL, SSL_R_SSL_HANDSHAKE_FAILURE);
- ret = -1;
- goto end;
+ }
default:
OPENSSL_PUT_ERROR(SSL, SSL_R_UNKNOWN_STATE);
@@ -528,13 +500,9 @@ int ssl3_connect(SSL_HANDSHAKE *hs) {
goto end;
}
- if (!ssl->s3->tmp.reuse_message && !skip && hs->state != state) {
- int new_state = hs->state;
- hs->state = state;
+ if (hs->state != state) {
ssl_do_info_callback(ssl, SSL_CB_CONNECT_LOOP, 1);
- hs->state = new_state;
}
- skip = 0;
}
end:
@@ -944,9 +912,9 @@ static int ssl3_get_server_hello(SSL_HANDSHAKE *hs) {
goto f_err;
}
/* Note: session_id could be empty. */
- ssl->s3->new_session->session_id_length = CBS_len(&session_id);
- OPENSSL_memcpy(ssl->s3->new_session->session_id, CBS_data(&session_id),
- CBS_len(&session_id));
+ hs->new_session->session_id_length = CBS_len(&session_id);
+ OPENSSL_memcpy(hs->new_session->session_id, CBS_data(&session_id),
+ CBS_len(&session_id));
}
const SSL_CIPHER *c = SSL_get_cipher_by_value(cipher_suite);
@@ -988,9 +956,9 @@ static int ssl3_get_server_hello(SSL_HANDSHAKE *hs) {
goto f_err;
}
} else {
- ssl->s3->new_session->cipher = c;
+ hs->new_session->cipher = c;
}
- ssl->s3->tmp.new_cipher = c;
+ hs->new_cipher = c;
/* Now that the cipher is known, initialize the handshake hash and hash the
* ServerHello. */
@@ -1004,7 +972,7 @@ static int ssl3_get_server_hello(SSL_HANDSHAKE *hs) {
* which requires hashing the handshake transcript. Otherwise, the handshake
* buffer may be released. */
if (ssl->session != NULL ||
- !ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+ !ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
SSL_TRANSCRIPT_free_buffer(&hs->transcript);
}
@@ -1030,8 +998,7 @@ static int ssl3_get_server_hello(SSL_HANDSHAKE *hs) {
}
if (ssl->session != NULL &&
- ssl->s3->tmp.extended_master_secret !=
- ssl->session->extended_master_secret) {
+ hs->extended_master_secret != ssl->session->extended_master_secret) {
al = SSL_AD_HANDSHAKE_FAILURE;
if (ssl->session->extended_master_secret) {
OPENSSL_PUT_ERROR(SSL, SSL_R_RESUMED_EMS_SESSION_WITHOUT_EMS_EXTENSION);
@@ -1065,27 +1032,27 @@ static int ssl3_get_server_certificate(SSL_HANDSHAKE *hs) {
CBS_init(&cbs, ssl->init_msg, ssl->init_num);
uint8_t alert = SSL_AD_DECODE_ERROR;
- sk_CRYPTO_BUFFER_pop_free(ssl->s3->new_session->certs, CRYPTO_BUFFER_free);
+ sk_CRYPTO_BUFFER_pop_free(hs->new_session->certs, CRYPTO_BUFFER_free);
EVP_PKEY_free(hs->peer_pubkey);
hs->peer_pubkey = NULL;
- ssl->s3->new_session->certs = ssl_parse_cert_chain(
- &alert, &hs->peer_pubkey, NULL, &cbs, ssl->ctx->pool);
- if (ssl->s3->new_session->certs == NULL) {
+ hs->new_session->certs = ssl_parse_cert_chain(&alert, &hs->peer_pubkey, NULL,
+ &cbs, ssl->ctx->pool);
+ if (hs->new_session->certs == NULL) {
ssl3_send_alert(ssl, SSL3_AL_FATAL, alert);
return -1;
}
- if (sk_CRYPTO_BUFFER_num(ssl->s3->new_session->certs) == 0 ||
+ if (sk_CRYPTO_BUFFER_num(hs->new_session->certs) == 0 ||
CBS_len(&cbs) != 0 ||
- !ssl->ctx->x509_method->session_cache_objects(ssl->s3->new_session)) {
+ !ssl->ctx->x509_method->session_cache_objects(hs->new_session)) {
OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR);
ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_DECODE_ERROR);
return -1;
}
if (!ssl_check_leaf_certificate(
- ssl, hs->peer_pubkey,
- sk_CRYPTO_BUFFER_value(ssl->s3->new_session->certs, 0))) {
+ hs, hs->peer_pubkey,
+ sk_CRYPTO_BUFFER_value(hs->new_session->certs, 0))) {
ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_ILLEGAL_PARAMETER);
return -1;
}
@@ -1126,8 +1093,8 @@ static int ssl3_get_cert_status(SSL_HANDSHAKE *hs) {
goto f_err;
}
- if (!CBS_stow(&ocsp_response, &ssl->s3->new_session->ocsp_response,
- &ssl->s3->new_session->ocsp_response_length)) {
+ if (!CBS_stow(&ocsp_response, &hs->new_session->ocsp_response,
+ &hs->new_session->ocsp_response_length)) {
al = SSL_AD_INTERNAL_ERROR;
OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
goto f_err;
@@ -1141,8 +1108,8 @@ f_err:
static int ssl3_verify_server_cert(SSL_HANDSHAKE *hs) {
SSL *const ssl = hs->ssl;
- if (!ssl_verify_cert_chain(ssl, &ssl->s3->new_session->verify_result,
- ssl->s3->new_session->x509_chain)) {
+ if (!ssl_verify_cert_chain(ssl, &hs->new_session->verify_result,
+ hs->new_session->x509_chain)) {
return -1;
}
@@ -1163,7 +1130,7 @@ static int ssl3_get_server_key_exchange(SSL_HANDSHAKE *hs) {
if (ssl->s3->tmp.message_type != SSL3_MT_SERVER_KEY_EXCHANGE) {
/* Some ciphers (pure PSK) have an optional ServerKeyExchange message. */
- if (ssl_cipher_requires_server_key_exchange(ssl->s3->tmp.new_cipher)) {
+ if (ssl_cipher_requires_server_key_exchange(hs->new_cipher)) {
OPENSSL_PUT_ERROR(SSL, SSL_R_UNEXPECTED_MESSAGE);
ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_UNEXPECTED_MESSAGE);
return -1;
@@ -1182,8 +1149,8 @@ static int ssl3_get_server_key_exchange(SSL_HANDSHAKE *hs) {
CBS_init(&server_key_exchange, ssl->init_msg, ssl->init_num);
CBS server_key_exchange_orig = server_key_exchange;
- uint32_t alg_k = ssl->s3->tmp.new_cipher->algorithm_mkey;
- uint32_t alg_a = ssl->s3->tmp.new_cipher->algorithm_auth;
+ uint32_t alg_k = hs->new_cipher->algorithm_mkey;
+ uint32_t alg_a = hs->new_cipher->algorithm_auth;
if (alg_a & SSL_aPSK) {
CBS psk_identity_hint;
@@ -1279,7 +1246,7 @@ static int ssl3_get_server_key_exchange(SSL_HANDSHAKE *hs) {
OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR);
goto f_err;
}
- ssl->s3->new_session->group_id = group_id;
+ hs->new_session->group_id = group_id;
/* Ensure the group is consistent with preferences. */
if (!tls1_check_group_id(ssl, group_id)) {
@@ -1307,7 +1274,7 @@ static int ssl3_get_server_key_exchange(SSL_HANDSHAKE *hs) {
CBS_len(&server_key_exchange_orig) - CBS_len(&server_key_exchange));
/* ServerKeyExchange should be signed by the server's public key. */
- if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+ if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
uint16_t signature_algorithm = 0;
if (ssl3_protocol_version(ssl) >= TLS1_2_VERSION) {
if (!CBS_get_u16(&server_key_exchange, &signature_algorithm)) {
@@ -1318,7 +1285,7 @@ static int ssl3_get_server_key_exchange(SSL_HANDSHAKE *hs) {
if (!tls12_check_peer_sigalg(ssl, &al, signature_algorithm)) {
goto f_err;
}
- ssl->s3->new_session->peer_signature_algorithm = signature_algorithm;
+ hs->new_session->peer_signature_algorithm = signature_algorithm;
} else if (hs->peer_pubkey->type == EVP_PKEY_RSA) {
signature_algorithm = SSL_SIGN_RSA_PKCS1_MD5_SHA1;
} else if (hs->peer_pubkey->type == EVP_PKEY_EC) {
@@ -1527,8 +1494,8 @@ static int ssl3_send_client_key_exchange(SSL_HANDSHAKE *hs) {
goto err;
}
- uint32_t alg_k = ssl->s3->tmp.new_cipher->algorithm_mkey;
- uint32_t alg_a = ssl->s3->tmp.new_cipher->algorithm_auth;
+ uint32_t alg_k = hs->new_cipher->algorithm_mkey;
+ uint32_t alg_a = hs->new_cipher->algorithm_auth;
/* If using a PSK key exchange, prepare the pre-shared key. */
unsigned psk_len = 0;
@@ -1551,9 +1518,9 @@ static int ssl3_send_client_key_exchange(SSL_HANDSHAKE *hs) {
}
assert(psk_len <= PSK_MAX_PSK_LEN);
- OPENSSL_free(ssl->s3->new_session->psk_identity);
- ssl->s3->new_session->psk_identity = BUF_strdup(identity);
- if (ssl->s3->new_session->psk_identity == NULL) {
+ OPENSSL_free(hs->new_session->psk_identity);
+ hs->new_session->psk_identity = BUF_strdup(identity);
+ if (hs->new_session->psk_identity == NULL) {
OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
goto err;
}
@@ -1676,13 +1643,12 @@ static int ssl3_send_client_key_exchange(SSL_HANDSHAKE *hs) {
goto err;
}
- ssl->s3->new_session->master_key_length = tls1_generate_master_secret(
- hs, ssl->s3->new_session->master_key, pms, pms_len);
- if (ssl->s3->new_session->master_key_length == 0) {
+ hs->new_session->master_key_length = tls1_generate_master_secret(
+ hs, hs->new_session->master_key, pms, pms_len);
+ if (hs->new_session->master_key_length == 0) {
goto err;
}
- ssl->s3->new_session->extended_master_secret =
- ssl->s3->tmp.extended_master_secret;
+ hs->new_session->extended_master_secret = hs->extended_master_secret;
OPENSSL_cleanse(pms, pms_len);
OPENSSL_free(pms);
@@ -1740,9 +1706,9 @@ static int ssl3_send_cert_verify(SSL_HANDSHAKE *hs) {
uint8_t digest[EVP_MAX_MD_SIZE];
size_t digest_len;
- if (!SSL_TRANSCRIPT_ssl3_cert_verify_hash(
- &hs->transcript, digest, &digest_len, ssl->s3->new_session,
- signature_algorithm)) {
+ if (!SSL_TRANSCRIPT_ssl3_cert_verify_hash(&hs->transcript, digest,
+ &digest_len, hs->new_session,
+ signature_algorithm)) {
goto err;
}
@@ -1870,7 +1836,7 @@ static int ssl3_get_new_session_ticket(SSL_HANDSHAKE *hs) {
}
int session_renewed = ssl->session != NULL;
- SSL_SESSION *session = ssl->s3->new_session;
+ SSL_SESSION *session = hs->new_session;
if (session_renewed) {
/* The server is sending a new ticket for an existing session. Sessions are
* immutable once established, so duplicate all but the ticket of the
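
Throughout this file, the pending session and cipher move from connection-scoped storage (|ssl->s3->new_session|, |ssl->s3->tmp.new_cipher|) to the handshake object (|hs->new_session|, |hs->new_cipher|), so they are released together with the handshake. A minimal sketch of where a mid-handshake caller now finds the pending session, mirroring the SSL_get_session change further down (|pending_session| is a hypothetical name, not a tree function):

    /* Sketch only; the real SSL_get_session also consults
     * |established_session| once the handshake completes. */
    static SSL_SESSION *pending_session(const SSL *ssl) {
      if (ssl->s3->hs != NULL && ssl->s3->hs->new_session != NULL) {
        return ssl->s3->hs->new_session;
      }
      return ssl->session;  /* the session offered for resumption, if any */
    }
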
diff --git a/src/ssl/handshake_server.c b/src/ssl/handshake_server.c
index c352dd95..51338e22 100644
--- a/src/ssl/handshake_server.c
+++ b/src/ssl/handshake_server.c
@@ -202,21 +202,15 @@ int ssl3_accept(SSL_HANDSHAKE *hs) {
SSL *const ssl = hs->ssl;
uint32_t alg_a;
int ret = -1;
- int state, skip = 0;
assert(ssl->handshake_func == ssl3_accept);
assert(ssl->server);
for (;;) {
- state = hs->state;
+ int state = hs->state;
switch (hs->state) {
case SSL_ST_INIT:
- hs->state = SSL_ST_ACCEPT;
- skip = 1;
- break;
-
- case SSL_ST_ACCEPT:
ssl_do_info_callback(ssl, SSL_CB_HANDSHAKE_START, 1);
hs->state = SSL3_ST_SR_CLNT_HELLO_A;
break;
@@ -269,13 +263,11 @@ int ssl3_accept(SSL_HANDSHAKE *hs) {
break;
case SSL3_ST_SW_CERT_A:
- if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+ if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
ret = ssl3_send_server_certificate(hs);
if (ret <= 0) {
goto end;
}
- } else {
- skip = 1;
}
hs->state = SSL3_ST_SW_CERT_STATUS_A;
break;
@@ -286,25 +278,21 @@ int ssl3_accept(SSL_HANDSHAKE *hs) {
if (ret <= 0) {
goto end;
}
- } else {
- skip = 1;
}
hs->state = SSL3_ST_SW_KEY_EXCH_A;
break;
case SSL3_ST_SW_KEY_EXCH_A:
case SSL3_ST_SW_KEY_EXCH_B:
- alg_a = ssl->s3->tmp.new_cipher->algorithm_auth;
+ alg_a = hs->new_cipher->algorithm_auth;
/* PSK ciphers send ServerKeyExchange if there is an identity hint. */
- if (ssl_cipher_requires_server_key_exchange(ssl->s3->tmp.new_cipher) ||
+ if (ssl_cipher_requires_server_key_exchange(hs->new_cipher) ||
((alg_a & SSL_aPSK) && ssl->psk_identity_hint)) {
ret = ssl3_send_server_key_exchange(hs);
if (ret <= 0) {
goto end;
}
- } else {
- skip = 1;
}
hs->state = SSL3_ST_SW_CERT_REQ_A;
@@ -316,8 +304,6 @@ int ssl3_accept(SSL_HANDSHAKE *hs) {
if (ret <= 0) {
goto end;
}
- } else {
- skip = 1;
}
hs->state = SSL3_ST_SW_SRVR_DONE_A;
break;
@@ -379,8 +365,6 @@ int ssl3_accept(SSL_HANDSHAKE *hs) {
if (ret <= 0) {
goto end;
}
- } else {
- skip = 1;
}
hs->state = SSL3_ST_SR_CHANNEL_ID_A;
break;
@@ -391,8 +375,6 @@ int ssl3_accept(SSL_HANDSHAKE *hs) {
if (ret <= 0) {
goto end;
}
- } else {
- skip = 1;
}
hs->state = SSL3_ST_SR_FINISHED_A;
break;
@@ -411,7 +393,7 @@ int ssl3_accept(SSL_HANDSHAKE *hs) {
}
/* If this is a full handshake with ChannelID then record the handshake
- * hashes in |ssl->s3->new_session| in case we need them to verify a
+ * hashes in |hs->new_session| in case we need them to verify a
* ChannelID signature on a resumption of this session in the future. */
if (ssl->session == NULL && ssl->s3->tlsext_channel_id_valid) {
ret = tls1_record_handshake_hashes_for_channel_id(hs);
@@ -427,8 +409,6 @@ int ssl3_accept(SSL_HANDSHAKE *hs) {
if (ret <= 0) {
goto end;
}
- } else {
- skip = 1;
}
hs->state = SSL3_ST_SW_CHANGE;
break;
@@ -481,12 +461,11 @@ int ssl3_accept(SSL_HANDSHAKE *hs) {
/* If we aren't retaining peer certificates then we can discard it
* now. */
- if (ssl->s3->new_session != NULL &&
+ if (hs->new_session != NULL &&
ssl->retain_only_sha256_of_client_certs) {
- sk_CRYPTO_BUFFER_pop_free(ssl->s3->new_session->certs,
- CRYPTO_BUFFER_free);
- ssl->s3->new_session->certs = NULL;
- ssl->ctx->x509_method->session_clear(ssl->s3->new_session);
+ sk_CRYPTO_BUFFER_pop_free(hs->new_session->certs, CRYPTO_BUFFER_free);
+ hs->new_session->certs = NULL;
+ ssl->ctx->x509_method->session_clear(hs->new_session);
}
SSL_SESSION_free(ssl->s3->established_session);
@@ -494,9 +473,9 @@ int ssl3_accept(SSL_HANDSHAKE *hs) {
SSL_SESSION_up_ref(ssl->session);
ssl->s3->established_session = ssl->session;
} else {
- ssl->s3->established_session = ssl->s3->new_session;
+ ssl->s3->established_session = hs->new_session;
ssl->s3->established_session->not_resumable = 0;
- ssl->s3->new_session = NULL;
+ hs->new_session = NULL;
}
if (hs->v2_clienthello) {
@@ -518,13 +497,9 @@ int ssl3_accept(SSL_HANDSHAKE *hs) {
goto end;
}
- if (!ssl->s3->tmp.reuse_message && !skip && hs->state != state) {
- int new_state = hs->state;
- hs->state = state;
+ if (hs->state != state) {
ssl_do_info_callback(ssl, SSL_CB_ACCEPT_LOOP, 1);
- hs->state = new_state;
}
- skip = 0;
}
end:
@@ -921,9 +896,9 @@ static int ssl3_select_certificate(SSL_HANDSHAKE *hs) {
/* Negotiate the cipher suite. This must be done after |cert_cb| so the
* certificate is finalized. */
- ssl->s3->tmp.new_cipher =
+ hs->new_cipher =
ssl3_choose_cipher(hs, &client_hello, ssl_get_cipher_preferences(ssl));
- if (ssl->s3->tmp.new_cipher == NULL) {
+ if (hs->new_cipher == NULL) {
OPENSSL_PUT_ERROR(SSL, SSL_R_NO_SHARED_CIPHER);
ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE);
return -1;
@@ -958,8 +933,7 @@ static int ssl3_select_parameters(SSL_HANDSHAKE *hs) {
}
if (session != NULL) {
- if (session->extended_master_secret &&
- !ssl->s3->tmp.extended_master_secret) {
+ if (session->extended_master_secret && !hs->extended_master_secret) {
/* A ClientHello without EMS that attempts to resume a session with EMS
* is fatal to the connection. */
al = SSL_AD_HANDSHAKE_FAILURE;
@@ -967,11 +941,10 @@ static int ssl3_select_parameters(SSL_HANDSHAKE *hs) {
goto f_err;
}
- if (!ssl_session_is_resumable(ssl, session) ||
+ if (!ssl_session_is_resumable(hs, session) ||
/* If the client offers the EMS extension, but the previous session
* didn't use it, then negotiate a new session. */
- ssl->s3->tmp.extended_master_secret !=
- session->extended_master_secret) {
+ hs->extended_master_secret != session->extended_master_secret) {
SSL_SESSION_free(session);
session = NULL;
}
@@ -992,7 +965,7 @@ static int ssl3_select_parameters(SSL_HANDSHAKE *hs) {
/* Clear the session ID if we want the session to be single-use. */
if (!(ssl->ctx->session_cache_mode & SSL_SESS_CACHE_SERVER)) {
- ssl->s3->new_session->session_id_length = 0;
+ hs->new_session->session_id_length = 0;
}
}
@@ -1005,13 +978,13 @@ static int ssl3_select_parameters(SSL_HANDSHAKE *hs) {
}
if (ssl->session == NULL) {
- ssl->s3->new_session->cipher = ssl->s3->tmp.new_cipher;
+ hs->new_session->cipher = hs->new_cipher;
/* On new sessions, stash the SNI value in the session. */
if (hs->hostname != NULL) {
- OPENSSL_free(ssl->s3->new_session->tlsext_hostname);
- ssl->s3->new_session->tlsext_hostname = BUF_strdup(hs->hostname);
- if (ssl->s3->new_session->tlsext_hostname == NULL) {
+ OPENSSL_free(hs->new_session->tlsext_hostname);
+ hs->new_session->tlsext_hostname = BUF_strdup(hs->hostname);
+ if (hs->new_session->tlsext_hostname == NULL) {
al = SSL_AD_INTERNAL_ERROR;
goto f_err;
}
@@ -1025,14 +998,14 @@ static int ssl3_select_parameters(SSL_HANDSHAKE *hs) {
hs->cert_request = 0;
}
/* CertificateRequest may only be sent in certificate-based ciphers. */
- if (!ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+ if (!ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
hs->cert_request = 0;
}
if (!hs->cert_request) {
/* OpenSSL returns X509_V_OK when no certificates are requested. This is
* classed by them as a bug, but it's assumed by at least NGINX. */
- ssl->s3->new_session->verify_result = X509_V_OK;
+ hs->new_session->verify_result = X509_V_OK;
}
}
@@ -1045,7 +1018,7 @@ static int ssl3_select_parameters(SSL_HANDSHAKE *hs) {
/* Now that all parameters are known, initialize the handshake hash and hash
* the ClientHello. */
if (!SSL_TRANSCRIPT_init_hash(&hs->transcript, ssl3_protocol_version(ssl),
- ssl->s3->tmp.new_cipher->algorithm_prf) ||
+ hs->new_cipher->algorithm_prf) ||
!ssl_hash_current_message(hs)) {
goto f_err;
}
@@ -1073,7 +1046,7 @@ static int ssl3_send_server_hello(SSL_HANDSHAKE *hs) {
/* We only accept ChannelIDs on connections with ECDHE in order to avoid a
* known attack while we fix ChannelID itself. */
if (ssl->s3->tlsext_channel_id_valid &&
- (ssl->s3->tmp.new_cipher->algorithm_mkey & SSL_kECDHE) == 0) {
+ (hs->new_cipher->algorithm_mkey & SSL_kECDHE) == 0) {
ssl->s3->tlsext_channel_id_valid = 0;
}
@@ -1098,7 +1071,7 @@ static int ssl3_send_server_hello(SSL_HANDSHAKE *hs) {
/* TODO(davidben): Implement the TLS 1.1 and 1.2 downgrade sentinels once TLS
* 1.3 is finalized and we are not implementing a draft version. */
- const SSL_SESSION *session = ssl->s3->new_session;
+ const SSL_SESSION *session = hs->new_session;
if (ssl->session != NULL) {
session = ssl->session;
}
@@ -1110,7 +1083,7 @@ static int ssl3_send_server_hello(SSL_HANDSHAKE *hs) {
!CBB_add_u8_length_prefixed(&body, &session_id) ||
!CBB_add_bytes(&session_id, session->session_id,
session->session_id_length) ||
- !CBB_add_u16(&body, ssl_cipher_get_value(ssl->s3->tmp.new_cipher)) ||
+ !CBB_add_u16(&body, ssl_cipher_get_value(hs->new_cipher)) ||
!CBB_add_u8(&body, 0 /* no compression */) ||
!ssl_add_serverhello_tlsext(hs, &body) ||
!ssl_add_message_cbb(ssl, &cbb)) {
@@ -1142,8 +1115,9 @@ static int ssl3_send_certificate_status(SSL_HANDSHAKE *hs) {
SSL3_MT_CERTIFICATE_STATUS) ||
!CBB_add_u8(&body, TLSEXT_STATUSTYPE_ocsp) ||
!CBB_add_u24_length_prefixed(&body, &ocsp_response) ||
- !CBB_add_bytes(&ocsp_response, CRYPTO_BUFFER_data(ssl->ocsp_response),
- CRYPTO_BUFFER_len(ssl->ocsp_response)) ||
+ !CBB_add_bytes(&ocsp_response,
+ CRYPTO_BUFFER_data(ssl->cert->ocsp_response),
+ CRYPTO_BUFFER_len(ssl->cert->ocsp_response)) ||
!ssl_add_message_cbb(ssl, &cbb)) {
OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
CBB_cleanup(&cbb);
@@ -1160,8 +1134,8 @@ static int ssl3_send_server_key_exchange(SSL_HANDSHAKE *hs) {
/* Put together the parameters. */
if (hs->state == SSL3_ST_SW_KEY_EXCH_A) {
- uint32_t alg_k = ssl->s3->tmp.new_cipher->algorithm_mkey;
- uint32_t alg_a = ssl->s3->tmp.new_cipher->algorithm_auth;
+ uint32_t alg_k = hs->new_cipher->algorithm_mkey;
+ uint32_t alg_a = hs->new_cipher->algorithm_auth;
/* Pre-allocate enough room to comfortably fit an ECDHE public key. */
if (!CBB_init(&cbb, 128)) {
@@ -1214,7 +1188,7 @@ static int ssl3_send_server_key_exchange(SSL_HANDSHAKE *hs) {
ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE);
goto err;
}
- ssl->s3->new_session->group_id = group_id;
+ hs->new_session->group_id = group_id;
/* Set up ECDH, generate a key, and emit the public half. */
if (!SSL_ECDH_CTX_init(&hs->ecdh_ctx, group_id) ||
@@ -1242,7 +1216,7 @@ static int ssl3_send_server_key_exchange(SSL_HANDSHAKE *hs) {
}
/* Add a signature. */
- if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+ if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
if (!ssl_has_private_key(ssl)) {
ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
goto err;
@@ -1439,7 +1413,7 @@ static int ssl3_get_client_certificate(SSL_HANDSHAKE *hs) {
/* OpenSSL returns X509_V_OK when no certificates are received. This is
* classed by them as a bug, but it's assumed by at least NGINX. */
- ssl->s3->new_session->verify_result = X509_V_OK;
+ hs->new_session->verify_result = X509_V_OK;
ssl->s3->tmp.reuse_message = 1;
return 1;
}
@@ -1456,29 +1430,28 @@ static int ssl3_get_client_certificate(SSL_HANDSHAKE *hs) {
CBS certificate_msg;
CBS_init(&certificate_msg, ssl->init_msg, ssl->init_num);
- sk_CRYPTO_BUFFER_pop_free(ssl->s3->new_session->certs, CRYPTO_BUFFER_free);
+ sk_CRYPTO_BUFFER_pop_free(hs->new_session->certs, CRYPTO_BUFFER_free);
EVP_PKEY_free(hs->peer_pubkey);
hs->peer_pubkey = NULL;
uint8_t alert = SSL_AD_DECODE_ERROR;
- ssl->s3->new_session->certs =
- ssl_parse_cert_chain(&alert, &hs->peer_pubkey,
- ssl->retain_only_sha256_of_client_certs
- ? ssl->s3->new_session->peer_sha256
- : NULL,
- &certificate_msg, ssl->ctx->pool);
- if (ssl->s3->new_session->certs == NULL) {
+ hs->new_session->certs = ssl_parse_cert_chain(
+ &alert, &hs->peer_pubkey,
+ ssl->retain_only_sha256_of_client_certs ? hs->new_session->peer_sha256
+ : NULL,
+ &certificate_msg, ssl->ctx->pool);
+ if (hs->new_session->certs == NULL) {
ssl3_send_alert(ssl, SSL3_AL_FATAL, alert);
return -1;
}
if (CBS_len(&certificate_msg) != 0 ||
- !ssl->ctx->x509_method->session_cache_objects(ssl->s3->new_session)) {
+ !ssl->ctx->x509_method->session_cache_objects(hs->new_session)) {
OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR);
ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_DECODE_ERROR);
return -1;
}
- if (sk_CRYPTO_BUFFER_num(ssl->s3->new_session->certs) == 0) {
+ if (sk_CRYPTO_BUFFER_num(hs->new_session->certs) == 0) {
/* No client certificate so the handshake buffer may be discarded. */
SSL_TRANSCRIPT_free_buffer(&hs->transcript);
@@ -1499,17 +1472,17 @@ static int ssl3_get_client_certificate(SSL_HANDSHAKE *hs) {
/* OpenSSL returns X509_V_OK when no certificates are received. This is
* classed by them as a bug, but it's assumed by at least NGINX. */
- ssl->s3->new_session->verify_result = X509_V_OK;
+ hs->new_session->verify_result = X509_V_OK;
return 1;
}
/* The hash will have been filled in. */
if (ssl->retain_only_sha256_of_client_certs) {
- ssl->s3->new_session->peer_sha256_valid = 1;
+ hs->new_session->peer_sha256_valid = 1;
}
- if (!ssl_verify_cert_chain(ssl, &ssl->s3->new_session->verify_result,
- ssl->s3->new_session->x509_chain)) {
+ if (!ssl_verify_cert_chain(ssl, &hs->new_session->verify_result,
+ hs->new_session->x509_chain)) {
return -1;
}
return 1;
@@ -1541,8 +1514,8 @@ static int ssl3_get_client_key_exchange(SSL_HANDSHAKE *hs) {
}
CBS_init(&client_key_exchange, ssl->init_msg, ssl->init_num);
- alg_k = ssl->s3->tmp.new_cipher->algorithm_mkey;
- alg_a = ssl->s3->tmp.new_cipher->algorithm_auth;
+ alg_k = hs->new_cipher->algorithm_mkey;
+ alg_a = hs->new_cipher->algorithm_auth;
/* If using a PSK key exchange, prepare the pre-shared key. */
if (alg_a & SSL_aPSK) {
@@ -1570,15 +1543,15 @@ static int ssl3_get_client_key_exchange(SSL_HANDSHAKE *hs) {
goto f_err;
}
- if (!CBS_strdup(&psk_identity, &ssl->s3->new_session->psk_identity)) {
+ if (!CBS_strdup(&psk_identity, &hs->new_session->psk_identity)) {
al = SSL_AD_INTERNAL_ERROR;
OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
goto f_err;
}
/* Look up the key for the identity. */
- psk_len = ssl->psk_server_callback(ssl, ssl->s3->new_session->psk_identity,
- psk, sizeof(psk));
+ psk_len = ssl->psk_server_callback(ssl, hs->new_session->psk_identity, psk,
+ sizeof(psk));
if (psk_len > PSK_MAX_PSK_LEN) {
OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
al = SSL_AD_INTERNAL_ERROR;
@@ -1763,14 +1736,12 @@ static int ssl3_get_client_key_exchange(SSL_HANDSHAKE *hs) {
}
/* Compute the master secret */
- ssl->s3->new_session->master_key_length =
- tls1_generate_master_secret(hs, ssl->s3->new_session->master_key,
- premaster_secret, premaster_secret_len);
- if (ssl->s3->new_session->master_key_length == 0) {
+ hs->new_session->master_key_length = tls1_generate_master_secret(
+ hs, hs->new_session->master_key, premaster_secret, premaster_secret_len);
+ if (hs->new_session->master_key_length == 0) {
goto err;
}
- ssl->s3->new_session->extended_master_secret =
- ssl->s3->tmp.extended_master_secret;
+ hs->new_session->extended_master_secret = hs->extended_master_secret;
OPENSSL_cleanse(premaster_secret, premaster_secret_len);
OPENSSL_free(premaster_secret);
@@ -1823,7 +1794,7 @@ static int ssl3_get_cert_verify(SSL_HANDSHAKE *hs) {
if (!tls12_check_peer_sigalg(ssl, &al, signature_algorithm)) {
goto f_err;
}
- ssl->s3->new_session->peer_signature_algorithm = signature_algorithm;
+ hs->new_session->peer_signature_algorithm = signature_algorithm;
} else if (hs->peer_pubkey->type == EVP_PKEY_RSA) {
signature_algorithm = SSL_SIGN_RSA_PKCS1_MD5_SHA1;
} else if (hs->peer_pubkey->type == EVP_PKEY_EC) {
@@ -1849,7 +1820,7 @@ static int ssl3_get_cert_verify(SSL_HANDSHAKE *hs) {
uint8_t digest[EVP_MAX_MD_SIZE];
size_t digest_len;
if (!SSL_TRANSCRIPT_ssl3_cert_verify_hash(&hs->transcript, digest,
- &digest_len, ssl->s3->new_session,
+ &digest_len, hs->new_session,
signature_algorithm)) {
goto err;
}
@@ -1946,8 +1917,8 @@ static int ssl3_send_new_session_ticket(SSL_HANDSHAKE *hs) {
SSL_SESSION *session_copy = NULL;
if (ssl->session == NULL) {
/* Fix the timeout to measure from the ticket issuance time. */
- ssl_session_rebase_time(ssl, ssl->s3->new_session);
- session = ssl->s3->new_session;
+ ssl_session_rebase_time(ssl, hs->new_session);
+ session = hs->new_session;
} else {
/* We are renewing an existing session. Duplicate the session to adjust the
* timeout. */
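
The server's accept loop gets the same treatment as the client's: |skip| is gone and optional messages fall through. Note the ownership handoff when the handshake finishes; since |hs->new_session| now belongs to the handshake, completing a full handshake transfers it out rather than duplicating it:

    if (ssl->session != NULL) {
      SSL_SESSION_up_ref(ssl->session);
      ssl->s3->established_session = ssl->session;
    } else {
      /* Transfer ownership out of the handshake before it is freed. */
      ssl->s3->established_session = hs->new_session;
      ssl->s3->established_session->not_resumable = 0;
      hs->new_session = NULL;
    }
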
diff --git a/src/ssl/internal.h b/src/ssl/internal.h
index 5b93f475..b2c9fcd4 100644
--- a/src/ssl/internal.h
+++ b/src/ssl/internal.h
@@ -854,9 +854,9 @@ STACK_OF(X509_NAME) *
int ssl_add_client_CA_list(SSL *ssl, CBB *cbb);
/* ssl_check_leaf_certificate returns one if |pkey| and |leaf| are suitable as
- * a server's leaf certificate for |ssl|. Otherwise, it returns zero and pushes
+ * a server's leaf certificate for |hs|. Otherwise, it returns zero and pushes
* an error on the error queue. */
-int ssl_check_leaf_certificate(SSL *ssl, EVP_PKEY *pkey,
+int ssl_check_leaf_certificate(SSL_HANDSHAKE *hs, EVP_PKEY *pkey,
const CRYPTO_BUFFER *leaf);
@@ -1049,6 +1049,13 @@ struct ssl_handshake_st {
/* peer_pubkey is the public key parsed from the peer's leaf certificate. */
EVP_PKEY *peer_pubkey;
+ /* new_session is the new mutable session being established by the current
+ * handshake. It should not be cached. */
+ SSL_SESSION *new_session;
+
+ /* new_cipher is the cipher being negotiated in this handshake. */
+ const SSL_CIPHER *new_cipher;
+
/* key_block is the record-layer key block for TLS 1.2 and earlier. */
uint8_t *key_block;
uint8_t key_block_len;
@@ -1100,6 +1107,10 @@ struct ssl_handshake_st {
/* v2_clienthello is one if we received a V2ClientHello. */
unsigned v2_clienthello:1;
+ /* extended_master_secret is one if the extended master secret extension is
+ * negotiated in this handshake. */
+ unsigned extended_master_secret:1;
+
/* client_version is the value sent or received in the ClientHello version. */
uint16_t client_version;
} /* SSL_HANDSHAKE */;
@@ -1323,6 +1334,17 @@ typedef struct cert_st {
/* Optional X509_STORE for certificate validation. If NULL the parent SSL_CTX
* store is used instead. */
X509_STORE *verify_store;
+
+ /* Signed certificate timestamp list to be sent to the client, if requested */
+ CRYPTO_BUFFER *signed_cert_timestamp_list;
+
+ /* OCSP response to be sent to the client, if requested. */
+ CRYPTO_BUFFER *ocsp_response;
+
+ /* sid_ctx partitions the session space within a shared session cache or
+ * ticket key. Only sessions with a matching value will be accepted. */
+ uint8_t sid_ctx_length;
+ uint8_t sid_ctx[SSL_MAX_SID_CTX_LENGTH];
} CERT;
/* SSL_METHOD is a compatibility structure to support the legacy version-locked
@@ -1594,9 +1616,6 @@ typedef struct ssl3_state_st {
* TODO(davidben): Move everything not needed after the handshake completes to
* |hs| and remove this. */
struct {
- /* used to hold the new cipher we are going to use */
- const SSL_CIPHER *new_cipher;
-
int message_type;
int reuse_message;
@@ -1604,20 +1623,8 @@ typedef struct ssl3_state_st {
uint8_t new_mac_secret_len;
uint8_t new_key_len;
uint8_t new_fixed_iv_len;
-
- /* extended_master_secret indicates whether the extended master secret
- * computation is used in this handshake. Note that this is different from
- * whether it was used for the current session. If this is a resumption
- * handshake then EMS might be negotiated in the client and server hello
- * messages, but it doesn't matter if the session that's being resumed
- * didn't use it to create the master secret initially. */
- char extended_master_secret;
} tmp;
- /* new_session is the new mutable session being established by the current
- * handshake. It should not be cached. */
- SSL_SESSION *new_session;
-
/* established_session is the session established by the connection. This
* session is only filled upon the completion of the handshake and is
* immutable. */
@@ -1798,11 +1805,6 @@ struct ssl_st {
* milliseconds. It's used to initialize the timer any time it's restarted. */
unsigned initial_timeout_duration_ms;
- /* the session_id_context is used to ensure sessions are only reused
- * in the appropriate context */
- uint8_t sid_ctx_length;
- uint8_t sid_ctx[SSL_MAX_SID_CTX_LENGTH];
-
/* session is the configured session to be offered by the client. This session
* is immutable. */
SSL_SESSION *session;
@@ -1887,12 +1889,6 @@ struct ssl_st {
* hash of the peer's certificate and then discard it to save memory and
* session space. Only effective on the server side. */
unsigned retain_only_sha256_of_client_certs:1;
-
- /* Signed certificate timestamp list to be sent to the client, if requested */
- CRYPTO_BUFFER *signed_cert_timestamp_list;
-
- /* OCSP response to be sent to the client, if requested. */
- CRYPTO_BUFFER *ocsp_response;
};
/* From draft-ietf-tls-tls13-18, used in determining PSK modes. */
@@ -1936,9 +1932,10 @@ int ssl_session_is_context_valid(const SSL *ssl, const SSL_SESSION *session);
* it has expired. */
int ssl_session_is_time_valid(const SSL *ssl, const SSL_SESSION *session);
-/* ssl_session_is_resumable returns one if |session| is resumable for |ssl| and
+/* ssl_session_is_resumable returns one if |session| is resumable for |hs| and
* zero otherwise. */
-int ssl_session_is_resumable(const SSL *ssl, const SSL_SESSION *session);
+int ssl_session_is_resumable(const SSL_HANDSHAKE *hs,
+ const SSL_SESSION *session);
/* SSL_SESSION_get_digest returns the digest used in |session|. If the digest is
* invalid, it returns NULL. */
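
internal.h is where the migration is anchored: three pieces of state that only matter while a handshake is running move into |SSL_HANDSHAKE|. In excerpt, the struct now carries:

    struct ssl_handshake_st {
      /* ... */
      /* new_session is the new mutable session being established by the
       * current handshake. It should not be cached. */
      SSL_SESSION *new_session;

      /* new_cipher is the cipher being negotiated in this handshake. */
      const SSL_CIPHER *new_cipher;

      /* extended_master_secret is one if the extended master secret
       * extension is negotiated in this handshake. */
      unsigned extended_master_secret:1;
      /* ... */
    };
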
diff --git a/src/ssl/s3_both.c b/src/ssl/s3_both.c
index d3f9421b..7fd09c65 100644
--- a/src/ssl/s3_both.c
+++ b/src/ssl/s3_both.c
@@ -167,6 +167,7 @@ void ssl_handshake_free(SSL_HANDSHAKE *hs) {
OPENSSL_free(hs->cookie);
OPENSSL_free(hs->key_share_bytes);
OPENSSL_free(hs->public_key);
+ SSL_SESSION_free(hs->new_session);
OPENSSL_free(hs->peer_sigalgs);
OPENSSL_free(hs->peer_supported_group_list);
OPENSSL_free(hs->peer_key);
@@ -678,7 +679,6 @@ static int read_v2_client_hello(SSL *ssl) {
}
int ssl3_get_message(SSL *ssl) {
-again:
/* Re-create the handshake buffer if needed. */
if (ssl->init_buf == NULL) {
ssl->init_buf = BUF_MEM_new();
@@ -733,16 +733,6 @@ again:
ssl->s3->tmp.message_type = ((const uint8_t *)ssl->init_buf->data)[0];
ssl->init_msg = (uint8_t*)ssl->init_buf->data + SSL3_HM_HEADER_LENGTH;
ssl->init_num = ssl->init_buf->length - SSL3_HM_HEADER_LENGTH;
-
- /* Ignore stray HelloRequest messages in the handshake before TLS 1.3. Per RFC
- * 5246, section 7.4.1.1, the server may send HelloRequest at any time. */
- if (!ssl->server && SSL_in_init(ssl) &&
- (!ssl->s3->have_version || ssl3_protocol_version(ssl) < TLS1_3_VERSION) &&
- ssl->s3->tmp.message_type == SSL3_MT_HELLO_REQUEST &&
- ssl->init_num == 0) {
- goto again;
- }
-
return 1;
}
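
ssl_handshake_free picks up the matching release for the now handshake-owned session, and the counterpart SSL_SESSION_free disappears from ssl3_free in s3_lib.c below. ssl3_get_message also stops looping internally to discard stray HelloRequest messages. The ownership change, in sketch:

    void ssl_handshake_free(SSL_HANDSHAKE *hs) {
      /* ... */
      SSL_SESSION_free(hs->new_session);  /* handshake owns the pending session */
      /* ... */
    }
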
diff --git a/src/ssl/s3_lib.c b/src/ssl/s3_lib.c
index 1c723cd2..57a27c70 100644
--- a/src/ssl/s3_lib.c
+++ b/src/ssl/s3_lib.c
@@ -197,7 +197,6 @@ void ssl3_free(SSL *ssl) {
ssl_read_buffer_clear(ssl);
ssl_write_buffer_clear(ssl);
- SSL_SESSION_free(ssl->s3->new_session);
SSL_SESSION_free(ssl->s3->established_session);
ssl_handshake_free(ssl->s3->hs);
OPENSSL_free(ssl->s3->next_proto_negotiated);
diff --git a/src/ssl/ssl_asn1.c b/src/ssl/ssl_asn1.c
index 3582864e..3533225a 100644
--- a/src/ssl/ssl_asn1.c
+++ b/src/ssl/ssl_asn1.c
@@ -130,6 +130,7 @@
* peerSignatureAlgorithm [23] INTEGER OPTIONAL,
* ticketMaxEarlyData [24] INTEGER OPTIONAL,
* authTimeout [25] INTEGER OPTIONAL, -- defaults to timeout
+ * earlyALPN [26] OCTET STRING OPTIONAL,
* }
*
* Note: historically this serialization has included other optional
@@ -186,6 +187,8 @@ static const int kTicketMaxEarlyDataTag =
CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 24;
static const int kAuthTimeoutTag =
CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 25;
+static const int kEarlyALPNTag =
+ CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 26;
static int SSL_SESSION_to_bytes_full(const SSL_SESSION *in, uint8_t **out_data,
size_t *out_len, int for_ticket) {
@@ -412,6 +415,16 @@ static int SSL_SESSION_to_bytes_full(const SSL_SESSION *in, uint8_t **out_data,
goto err;
}
+ if (in->early_alpn) {
+ if (!CBB_add_asn1(&session, &child, kEarlyALPNTag) ||
+ !CBB_add_asn1(&child, &child2, CBS_ASN1_OCTETSTRING) ||
+ !CBB_add_bytes(&child2, (const uint8_t *)in->early_alpn,
+ in->early_alpn_len)) {
+ OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
+ goto err;
+ }
+ }
+
if (!CBB_finish(&cbb, out_data, out_len)) {
OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
goto err;
@@ -800,6 +813,8 @@ SSL_SESSION *SSL_SESSION_parse(CBS *cbs, const SSL_X509_METHOD *x509_method,
kTicketMaxEarlyDataTag, 0) ||
!SSL_SESSION_parse_long(&session, &ret->auth_timeout, kAuthTimeoutTag,
ret->timeout) ||
+ !SSL_SESSION_parse_octet_string(&session, &ret->early_alpn,
+ &ret->early_alpn_len, kEarlyALPNTag) ||
CBS_len(&session) != 0) {
OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_SSL_SESSION);
goto err;
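
The session serialization grows an optional [26] earlyALPN OCTET STRING; appending new OPTIONAL, context-tagged fields at the end keeps previously serialized sessions parseable. A self-contained check of the tag-byte arithmetic used above (constant values as defined in BoringSSL's bytestring.h at the time; verify against your tree):

    #include <assert.h>

    int main(void) {
      const unsigned kConstructed = 0x20;      /* CBS_ASN1_CONSTRUCTED */
      const unsigned kContextSpecific = 0x80;  /* CBS_ASN1_CONTEXT_SPECIFIC */
      /* Tag number 26 is below 31, so it fits one identifier octet: 0xba. */
      assert((kConstructed | kContextSpecific | 26) == 0xba);
      return 0;
    }
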
diff --git a/src/ssl/ssl_cert.c b/src/ssl/ssl_cert.c
index 4177a482..c60c6fa2 100644
--- a/src/ssl/ssl_cert.c
+++ b/src/ssl/ssl_cert.c
@@ -203,6 +203,19 @@ CERT *ssl_cert_dup(CERT *cert) {
ret->verify_store = cert->verify_store;
}
+ if (cert->signed_cert_timestamp_list != NULL) {
+ CRYPTO_BUFFER_up_ref(cert->signed_cert_timestamp_list);
+ ret->signed_cert_timestamp_list = cert->signed_cert_timestamp_list;
+ }
+
+ if (cert->ocsp_response != NULL) {
+ CRYPTO_BUFFER_up_ref(cert->ocsp_response);
+ ret->ocsp_response = cert->ocsp_response;
+ }
+
+ ret->sid_ctx_length = cert->sid_ctx_length;
+ OPENSSL_memcpy(ret->sid_ctx, cert->sid_ctx, sizeof(ret->sid_ctx));
+
return ret;
err:
@@ -235,6 +248,8 @@ void ssl_cert_free(CERT *c) {
ssl_cert_clear_certs(c);
OPENSSL_free(c->sigalgs);
X509_STORE_free(c->verify_store);
+ CRYPTO_BUFFER_free(c->signed_cert_timestamp_list);
+ CRYPTO_BUFFER_free(c->ocsp_response);
OPENSSL_free(c);
}
@@ -883,20 +898,20 @@ void SSL_set_cert_cb(SSL *ssl, int (*cb)(SSL *ssl, void *arg), void *arg) {
ssl_cert_set_cert_cb(ssl->cert, cb, arg);
}
-int ssl_check_leaf_certificate(SSL *ssl, EVP_PKEY *pkey,
+int ssl_check_leaf_certificate(SSL_HANDSHAKE *hs, EVP_PKEY *pkey,
const CRYPTO_BUFFER *leaf) {
+ SSL *const ssl = hs->ssl;
assert(ssl3_protocol_version(ssl) < TLS1_3_VERSION);
/* Check the certificate's type matches the cipher. */
- const SSL_CIPHER *cipher = ssl->s3->tmp.new_cipher;
- int expected_type = ssl_cipher_get_key_type(cipher);
+ int expected_type = ssl_cipher_get_key_type(hs->new_cipher);
assert(expected_type != EVP_PKEY_NONE);
if (pkey->type != expected_type) {
OPENSSL_PUT_ERROR(SSL, SSL_R_WRONG_CERTIFICATE_TYPE);
return 0;
}
- if (cipher->algorithm_auth & SSL_aECDSA) {
+ if (hs->new_cipher->algorithm_auth & SSL_aECDSA) {
CBS leaf_cbs;
CBS_init(&leaf_cbs, CRYPTO_BUFFER_data(leaf), CRYPTO_BUFFER_len(leaf));
/* ECDSA and ECDH certificates use the same public key format. Instead,
@@ -956,3 +971,42 @@ void SSL_CTX_set_client_cert_cb(SSL_CTX *ctx, int (*cb)(SSL *ssl,
SSL_CTX_set_cert_cb(ctx, do_client_cert_cb, NULL);
ctx->client_cert_cb = cb;
}
+
+static int set_signed_cert_timestamp_list(CERT *cert, const uint8_t *list,
+ size_t list_len) {
+ CBS sct_list;
+ CBS_init(&sct_list, list, list_len);
+ if (!ssl_is_sct_list_valid(&sct_list)) {
+ OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_SCT_LIST);
+ return 0;
+ }
+
+ CRYPTO_BUFFER_free(cert->signed_cert_timestamp_list);
+ cert->signed_cert_timestamp_list =
+ CRYPTO_BUFFER_new(CBS_data(&sct_list), CBS_len(&sct_list), NULL);
+ return cert->signed_cert_timestamp_list != NULL;
+}
+
+int SSL_CTX_set_signed_cert_timestamp_list(SSL_CTX *ctx, const uint8_t *list,
+ size_t list_len) {
+ return set_signed_cert_timestamp_list(ctx->cert, list, list_len);
+}
+
+int SSL_set_signed_cert_timestamp_list(SSL *ssl, const uint8_t *list,
+ size_t list_len) {
+ return set_signed_cert_timestamp_list(ssl->cert, list, list_len);
+}
+
+int SSL_CTX_set_ocsp_response(SSL_CTX *ctx, const uint8_t *response,
+ size_t response_len) {
+ CRYPTO_BUFFER_free(ctx->cert->ocsp_response);
+ ctx->cert->ocsp_response = CRYPTO_BUFFER_new(response, response_len, NULL);
+ return ctx->cert->ocsp_response != NULL;
+}
+
+int SSL_set_ocsp_response(SSL *ssl, const uint8_t *response,
+ size_t response_len) {
+ CRYPTO_BUFFER_free(ssl->cert->ocsp_response);
+ ssl->cert->ocsp_response = CRYPTO_BUFFER_new(response, response_len, NULL);
+ return ssl->cert->ocsp_response != NULL;
+}
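
Because these setters now write to the |CERT| structure, which ssl_cert_dup copies and SSL_set_SSL_CTX swaps, OCSP and SCT configuration follows certificate switches, including ones made from an SNI callback. A usage sketch with hypothetical byte strings; both setters return zero on failure:

    static const uint8_t kOCSP[] = {1, 2, 3, 4};
    static const uint8_t kSCTs[] = {0, 6, 0, 4, 5, 6, 7, 8};

    if (!SSL_set_ocsp_response(ssl, kOCSP, sizeof(kOCSP)) ||
        !SSL_set_signed_cert_timestamp_list(ssl, kSCTs, sizeof(kSCTs))) {
      /* Allocation failed, or the SCT list failed validation. */
    }
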
diff --git a/src/ssl/ssl_lib.c b/src/ssl/ssl_lib.c
index c946b77b..d0151bb5 100644
--- a/src/ssl/ssl_lib.c
+++ b/src/ssl/ssl_lib.c
@@ -363,8 +363,6 @@ void SSL_CTX_free(SSL_CTX *ctx) {
OPENSSL_free(ctx->psk_identity_hint);
OPENSSL_free(ctx->supported_group_list);
OPENSSL_free(ctx->alpn_client_proto_list);
- CRYPTO_BUFFER_free(ctx->signed_cert_timestamp_list);
- CRYPTO_BUFFER_free(ctx->ocsp_response);
EVP_PKEY_free(ctx->tlsext_channel_id_private);
OPENSSL_free(ctx);
@@ -405,9 +403,6 @@ SSL *SSL_new(SSL_CTX *ctx) {
ssl->msg_callback = ctx->msg_callback;
ssl->msg_callback_arg = ctx->msg_callback_arg;
ssl->verify_mode = ctx->verify_mode;
- ssl->sid_ctx_length = ctx->sid_ctx_length;
- assert(ssl->sid_ctx_length <= sizeof ssl->sid_ctx);
- OPENSSL_memcpy(&ssl->sid_ctx, &ctx->sid_ctx, sizeof(ssl->sid_ctx));
ssl->verify_callback = ctx->default_verify_callback;
ssl->retain_only_sha256_of_client_certs =
ctx->retain_only_sha256_of_client_certs;
@@ -472,18 +467,6 @@ SSL *SSL_new(SSL_CTX *ctx) {
ssl->signed_cert_timestamps_enabled = ctx->signed_cert_timestamps_enabled;
ssl->ocsp_stapling_enabled = ctx->ocsp_stapling_enabled;
- /* If the context has an SCT list, use it. */
- if (ctx->signed_cert_timestamp_list != NULL) {
- CRYPTO_BUFFER_up_ref(ctx->signed_cert_timestamp_list);
- ssl->signed_cert_timestamp_list = ctx->signed_cert_timestamp_list;
- }
-
- /* If the context has an OCSP response, use it. */
- if (ctx->ocsp_response != NULL) {
- CRYPTO_BUFFER_up_ref(ctx->ocsp_response);
- ssl->ocsp_response = ctx->ocsp_response;
- }
-
return ssl;
err:
@@ -522,8 +505,6 @@ void SSL_free(SSL *ssl) {
OPENSSL_free(ssl->psk_identity_hint);
sk_X509_NAME_pop_free(ssl->client_CA, X509_NAME_free);
sk_SRTP_PROTECTION_PROFILE_free(ssl->srtp_profiles);
- CRYPTO_BUFFER_free(ssl->signed_cert_timestamp_list);
- CRYPTO_BUFFER_free(ssl->ocsp_response);
if (ssl->method != NULL) {
ssl->method->ssl_free(ssl);
@@ -800,10 +781,11 @@ int SSL_shutdown(SSL *ssl) {
return -1;
}
- /* We can't shutdown properly if we are in the middle of a handshake. */
+ /* If we are in the middle of a handshake, silently succeed. Consumers often
+ * call this function before |SSL_free|, whether the handshake succeeded or
+ * not. We assume the caller has already handled failed handshakes. */
if (SSL_in_init(ssl)) {
- OPENSSL_PUT_ERROR(SSL, SSL_R_SHUTDOWN_WHILE_IN_INIT);
- return -1;
+ return 1;
}
if (ssl->quiet_shutdown) {
@@ -1088,37 +1070,32 @@ err:
return 0;
}
-int SSL_CTX_set_session_id_context(SSL_CTX *ctx, const uint8_t *sid_ctx,
+static int set_session_id_context(CERT *cert, const uint8_t *sid_ctx,
size_t sid_ctx_len) {
- if (sid_ctx_len > sizeof(ctx->sid_ctx)) {
+ if (sid_ctx_len > sizeof(cert->sid_ctx)) {
OPENSSL_PUT_ERROR(SSL, SSL_R_SSL_SESSION_ID_CONTEXT_TOO_LONG);
return 0;
}
- assert(sizeof(ctx->sid_ctx) < 256);
- ctx->sid_ctx_length = (uint8_t)sid_ctx_len;
- OPENSSL_memcpy(ctx->sid_ctx, sid_ctx, sid_ctx_len);
-
+ OPENSSL_COMPILE_ASSERT(sizeof(cert->sid_ctx) < 256, sid_ctx_too_large);
+ cert->sid_ctx_length = (uint8_t)sid_ctx_len;
+ OPENSSL_memcpy(cert->sid_ctx, sid_ctx, sid_ctx_len);
return 1;
}
+int SSL_CTX_set_session_id_context(SSL_CTX *ctx, const uint8_t *sid_ctx,
+ size_t sid_ctx_len) {
+ return set_session_id_context(ctx->cert, sid_ctx, sid_ctx_len);
+}
+
int SSL_set_session_id_context(SSL *ssl, const uint8_t *sid_ctx,
size_t sid_ctx_len) {
- if (sid_ctx_len > sizeof(ssl->sid_ctx)) {
- OPENSSL_PUT_ERROR(SSL, SSL_R_SSL_SESSION_ID_CONTEXT_TOO_LONG);
- return 0;
- }
-
- assert(sizeof(ssl->sid_ctx) < 256);
- ssl->sid_ctx_length = (uint8_t)sid_ctx_len;
- OPENSSL_memcpy(ssl->sid_ctx, sid_ctx, sid_ctx_len);
-
- return 1;
+ return set_session_id_context(ssl->cert, sid_ctx, sid_ctx_len);
}
const uint8_t *SSL_get0_session_id_context(const SSL *ssl, size_t *out_len) {
- *out_len = ssl->sid_ctx_length;
- return ssl->sid_ctx;
+ *out_len = ssl->cert->sid_ctx_length;
+ return ssl->cert->sid_ctx;
}
void ssl_cipher_preference_list_free(
@@ -1247,11 +1224,26 @@ size_t SSL_get_peer_finished(const SSL *ssl, void *buf, size_t count) {
int SSL_get_verify_mode(const SSL *ssl) { return ssl->verify_mode; }
int SSL_get_extms_support(const SSL *ssl) {
+ /* TLS 1.3 does not require extended master secret and always reports as
+ * supporting it. */
if (!ssl->s3->have_version) {
return 0;
}
- return ssl3_protocol_version(ssl) >= TLS1_3_VERSION ||
- ssl->s3->tmp.extended_master_secret == 1;
+ if (ssl3_protocol_version(ssl) >= TLS1_3_VERSION) {
+ return 1;
+ }
+
+ /* If the initial handshake completed, query the established session. */
+ if (ssl->s3->established_session != NULL) {
+ return ssl->s3->established_session->extended_master_secret;
+ }
+
+ /* Otherwise, query the in-progress handshake. */
+ if (ssl->s3->hs != NULL) {
+ return ssl->s3->hs->extended_master_secret;
+ }
+ assert(0);
+ return 0;
}
int SSL_CTX_get_read_ahead(const SSL_CTX *ctx) { return 0; }
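
Two caller-visible behavior changes land in this stretch of ssl_lib.c: SSL_shutdown during a handshake now quietly returns one instead of failing with SSL_R_SHUTDOWN_WHILE_IN_INIT, and SSL_get_extms_support derives its answer from the established session or the live handshake rather than from |tmp|. The usual teardown pattern therefore needs no init-state special case (a sketch; error handling elided):

    int ret = SSL_shutdown(ssl);  /* 1 even if the handshake never finished */
    if (ret == 0) {
      /* close_notify sent; optionally call again to wait for the peer's. */
      SSL_shutdown(ssl);
    }
    SSL_free(ssl);
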
@@ -1583,18 +1575,16 @@ void SSL_CTX_enable_signed_cert_timestamps(SSL_CTX *ctx) {
ctx->signed_cert_timestamps_enabled = 1;
}
-int SSL_enable_signed_cert_timestamps(SSL *ssl) {
+void SSL_enable_signed_cert_timestamps(SSL *ssl) {
ssl->signed_cert_timestamps_enabled = 1;
- return 1;
}
void SSL_CTX_enable_ocsp_stapling(SSL_CTX *ctx) {
ctx->ocsp_stapling_enabled = 1;
}
-int SSL_enable_ocsp_stapling(SSL *ssl) {
+void SSL_enable_ocsp_stapling(SSL *ssl) {
ssl->ocsp_stapling_enabled = 1;
- return 1;
}
void SSL_get0_signed_cert_timestamp_list(const SSL *ssl, const uint8_t **out,
@@ -1624,52 +1614,6 @@ void SSL_get0_ocsp_response(const SSL *ssl, const uint8_t **out,
*out_len = session->ocsp_response_length;
}
-int SSL_CTX_set_signed_cert_timestamp_list(SSL_CTX *ctx, const uint8_t *list,
- size_t list_len) {
- CBS sct_list;
- CBS_init(&sct_list, list, list_len);
- if (!ssl_is_sct_list_valid(&sct_list)) {
- OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_SCT_LIST);
- return 0;
- }
-
- CRYPTO_BUFFER_free(ctx->signed_cert_timestamp_list);
- ctx->signed_cert_timestamp_list = CRYPTO_BUFFER_new(CBS_data(&sct_list),
- CBS_len(&sct_list),
- NULL);
- return ctx->signed_cert_timestamp_list != NULL;
-}
-
-int SSL_set_signed_cert_timestamp_list(SSL *ssl, const uint8_t *list,
- size_t list_len) {
- CBS sct_list;
- CBS_init(&sct_list, list, list_len);
- if (!ssl_is_sct_list_valid(&sct_list)) {
- OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_SCT_LIST);
- return 0;
- }
-
- CRYPTO_BUFFER_free(ssl->signed_cert_timestamp_list);
- ssl->signed_cert_timestamp_list = CRYPTO_BUFFER_new(CBS_data(&sct_list),
- CBS_len(&sct_list),
- NULL);
- return ssl->signed_cert_timestamp_list != NULL;
-}
-
-int SSL_CTX_set_ocsp_response(SSL_CTX *ctx, const uint8_t *response,
- size_t response_len) {
- CRYPTO_BUFFER_free(ctx->ocsp_response);
- ctx->ocsp_response = CRYPTO_BUFFER_new(response, response_len, NULL);
- return ctx->ocsp_response != NULL;
-}
-
-int SSL_set_ocsp_response(SSL *ssl, const uint8_t *response,
- size_t response_len) {
- CRYPTO_BUFFER_free(ssl->ocsp_response);
- ssl->ocsp_response = CRYPTO_BUFFER_new(response, response_len, NULL);
- return ssl->ocsp_response != NULL;
-}
-
int SSL_set_tlsext_host_name(SSL *ssl, const char *name) {
OPENSSL_free(ssl->tlsext_hostname);
ssl->tlsext_hostname = NULL;
@@ -2076,10 +2020,6 @@ SSL_CTX *SSL_set_SSL_CTX(SSL *ssl, SSL_CTX *ctx) {
SSL_CTX_free(ssl->ctx);
ssl->ctx = ctx;
- ssl->sid_ctx_length = ctx->sid_ctx_length;
- assert(ssl->sid_ctx_length <= sizeof(ssl->sid_ctx));
- OPENSSL_memcpy(ssl->sid_ctx, ctx->sid_ctx, sizeof(ssl->sid_ctx));
-
return ssl->ctx;
}
@@ -2094,12 +2034,7 @@ void (*SSL_get_info_callback(const SSL *ssl))(const SSL *ssl, int type,
}
int SSL_state(const SSL *ssl) {
- if (ssl->s3->hs == NULL) {
- assert(ssl->s3->initial_handshake_complete);
- return SSL_ST_OK;
- }
-
- return ssl->s3->hs->state;
+ return SSL_in_init(ssl) ? SSL_ST_INIT : SSL_ST_OK;
}
void SSL_set_state(SSL *ssl, int state) { }
@@ -2345,11 +2280,12 @@ int ssl_log_secret(const SSL *ssl, const char *label, const uint8_t *secret,
}
int SSL_is_init_finished(const SSL *ssl) {
- return SSL_state(ssl) == SSL_ST_OK;
+ return !SSL_in_init(ssl);
}
int SSL_in_init(const SSL *ssl) {
- return (SSL_state(ssl) & SSL_ST_INIT) != 0;
+ SSL_HANDSHAKE *hs = ssl->s3->hs;
+ return hs != NULL && hs->state != SSL_ST_OK;
}
int SSL_in_false_start(const SSL *ssl) {
@@ -2575,10 +2511,11 @@ size_t SSL_get_server_random(const SSL *ssl, uint8_t *out, size_t max_out) {
}
const SSL_CIPHER *SSL_get_pending_cipher(const SSL *ssl) {
- if (!SSL_in_init(ssl)) {
+ SSL_HANDSHAKE *hs = ssl->s3->hs;
+ if (hs == NULL) {
return NULL;
}
- return ssl->s3->tmp.new_cipher;
+ return hs->new_cipher;
}
void SSL_set_retain_only_sha256_of_client_certs(SSL *ssl, int enabled) {
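
SSL_state collapses to a two-value API (SSL_ST_INIT or SSL_ST_OK) and SSL_in_init/SSL_is_init_finished are defined directly off the handshake object, so external code can no longer observe intermediate handshake states through SSL_state. SSL_enable_ocsp_stapling and SSL_enable_signed_cert_timestamps also become void, matching their SSL_CTX counterparts. Caller migration, in sketch:

    /* Still fine: both forms only distinguish "in handshake" from "done". */
    if (SSL_state(ssl) == SSL_ST_OK) { /* ... */ }
    if (!SSL_in_init(ssl))            { /* ... */ }

    /* No longer meaningful: intermediate states are not reported. */
    /* if (SSL_state(ssl) == SSL3_ST_CW_CLNT_HELLO_A) ... */
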
diff --git a/src/ssl/ssl_session.c b/src/ssl/ssl_session.c
index b71b994c..bbe88c36 100644
--- a/src/ssl/ssl_session.c
+++ b/src/ssl/ssl_session.c
@@ -280,6 +280,15 @@ SSL_SESSION *SSL_SESSION_dup(SSL_SESSION *session, int dup_flags) {
new_session->ticket_age_add = session->ticket_age_add;
new_session->ticket_max_early_data = session->ticket_max_early_data;
new_session->extended_master_secret = session->extended_master_secret;
+
+ if (session->early_alpn != NULL) {
+ new_session->early_alpn =
+ BUF_memdup(session->early_alpn, session->early_alpn_len);
+ if (new_session->early_alpn == NULL) {
+ goto err;
+ }
+ }
+ new_session->early_alpn_len = session->early_alpn_len;
}
/* Copy the ticket. */
@@ -373,6 +382,7 @@ void SSL_SESSION_free(SSL_SESSION *session) {
OPENSSL_free(session->tlsext_signed_cert_timestamp_list);
OPENSSL_free(session->ocsp_response);
OPENSSL_free(session->psk_identity);
+ OPENSSL_free(session->early_alpn);
OPENSSL_cleanse(session, sizeof(*session));
OPENSSL_free(session);
}
@@ -458,8 +468,8 @@ SSL_SESSION *SSL_get_session(const SSL *ssl) {
if (!SSL_in_init(ssl)) {
return ssl->s3->established_session;
}
- if (ssl->s3->new_session != NULL) {
- return ssl->s3->new_session;
+ if (ssl->s3->hs->new_session != NULL) {
+ return ssl->s3->hs->new_session;
}
return ssl->session;
}
@@ -550,19 +560,20 @@ int ssl_get_new_session(SSL_HANDSHAKE *hs, int is_server) {
session->session_id_length = 0;
}
- if (ssl->sid_ctx_length > sizeof(session->sid_ctx)) {
+ if (ssl->cert->sid_ctx_length > sizeof(session->sid_ctx)) {
OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
goto err;
}
- OPENSSL_memcpy(session->sid_ctx, ssl->sid_ctx, ssl->sid_ctx_length);
- session->sid_ctx_length = ssl->sid_ctx_length;
+ OPENSSL_memcpy(session->sid_ctx, ssl->cert->sid_ctx,
+ ssl->cert->sid_ctx_length);
+ session->sid_ctx_length = ssl->cert->sid_ctx_length;
/* The session is marked not resumable until it is completely filled in. */
session->not_resumable = 1;
session->verify_result = X509_V_ERR_INVALID_CALL;
- SSL_SESSION_free(ssl->s3->new_session);
- ssl->s3->new_session = session;
+ SSL_SESSION_free(hs->new_session);
+ hs->new_session = session;
ssl_set_session(ssl, NULL);
return 1;
@@ -668,9 +679,9 @@ int ssl_session_is_context_valid(const SSL *ssl, const SSL_SESSION *session) {
return 0;
}
- return session->sid_ctx_length == ssl->sid_ctx_length &&
- OPENSSL_memcmp(session->sid_ctx, ssl->sid_ctx, ssl->sid_ctx_length) ==
- 0;
+ return session->sid_ctx_length == ssl->cert->sid_ctx_length &&
+ OPENSSL_memcmp(session->sid_ctx, ssl->cert->sid_ctx,
+ ssl->cert->sid_ctx_length) == 0;
}
int ssl_session_is_time_valid(const SSL *ssl, const SSL_SESSION *session) {
@@ -689,18 +700,20 @@ int ssl_session_is_time_valid(const SSL *ssl, const SSL_SESSION *session) {
return session->timeout > (long)now.tv_sec - session->time;
}
-int ssl_session_is_resumable(const SSL *ssl, const SSL_SESSION *session) {
+int ssl_session_is_resumable(const SSL_HANDSHAKE *hs,
+ const SSL_SESSION *session) {
+ const SSL *const ssl = hs->ssl;
return ssl_session_is_context_valid(ssl, session) &&
/* The session must have been created by the same type of end point as
* we're now using it with. */
- session->is_server == ssl->server &&
+ ssl->server == session->is_server &&
/* The session must not be expired. */
ssl_session_is_time_valid(ssl, session) &&
/* Only resume if the session's version matches the negotiated
* version. */
ssl->version == session->ssl_version &&
/* Only resume if the session's cipher matches the negotiated one. */
- ssl->s3->tmp.new_cipher == session->cipher &&
+ hs->new_cipher == session->cipher &&
/* If the session contains a client certificate (either the full
* certificate or just the hash) then require that the form of the
* certificate matches the current configuration. */
@@ -898,7 +911,9 @@ static int remove_session_lock(SSL_CTX *ctx, SSL_SESSION *session, int lock) {
int SSL_set_session(SSL *ssl, SSL_SESSION *session) {
/* SSL_set_session may only be called before the handshake has started. */
- if (SSL_state(ssl) != SSL_ST_INIT || ssl->s3->initial_handshake_complete) {
+ if (ssl->s3->initial_handshake_complete ||
+ ssl->s3->hs == NULL ||
+ ssl->s3->hs->state != SSL_ST_INIT) {
abort();
}
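
The session ID context moves off |SSL| and onto |CERT| as well, so it survives SSL_set_SSL_CTX swaps alongside the certificate, and resumability is now judged against the handshake, which is what carries the negotiated cipher. The context check, as rewritten above, in excerpt:

    return session->sid_ctx_length == ssl->cert->sid_ctx_length &&
           OPENSSL_memcmp(session->sid_ctx, ssl->cert->sid_ctx,
                          ssl->cert->sid_ctx_length) == 0;
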
diff --git a/src/ssl/ssl_stat.c b/src/ssl/ssl_stat.c
index 479288a2..571b4a9a 100644
--- a/src/ssl/ssl_stat.c
+++ b/src/ssl/ssl_stat.c
@@ -83,11 +83,22 @@
#include <openssl/ssl.h>
+#include <assert.h>
+
#include "internal.h"
+static int ssl_state(const SSL *ssl) {
+ if (ssl->s3->hs == NULL) {
+ assert(ssl->s3->initial_handshake_complete);
+ return SSL_ST_OK;
+ }
+
+ return ssl->s3->hs->state;
+}
+
const char *SSL_state_string_long(const SSL *ssl) {
- switch (SSL_state(ssl)) {
+ switch (ssl_state(ssl)) {
case SSL_ST_ACCEPT:
return "before accept initialization";
@@ -203,7 +214,7 @@ const char *SSL_state_string_long(const SSL *ssl) {
}
const char *SSL_state_string(const SSL *ssl) {
- switch (SSL_state(ssl)) {
+ switch (ssl_state(ssl)) {
case SSL_ST_ACCEPT:
return "AINIT ";
diff --git a/src/ssl/ssl_test.cc b/src/ssl/ssl_test.cc
index dfab9769..4e0c2747 100644
--- a/src/ssl/ssl_test.cc
+++ b/src/ssl/ssl_test.cc
@@ -2415,6 +2415,9 @@ static bool TestSNICallback(bool is_dtls, const SSL_METHOD *method,
// Test that switching the |SSL_CTX| at the SNI callback behaves correctly.
static const uint16_t kECDSAWithSHA256 = SSL_SIGN_ECDSA_SECP256R1_SHA256;
+ static const uint8_t kSCTList[] = {0, 6, 0, 4, 5, 6, 7, 8};
+ static const uint8_t kOCSPResponse[] = {1, 2, 3, 4};
+
bssl::UniquePtr<SSL_CTX> server_ctx(SSL_CTX_new(method));
bssl::UniquePtr<SSL_CTX> server_ctx2(SSL_CTX_new(method));
bssl::UniquePtr<SSL_CTX> client_ctx(SSL_CTX_new(method));
@@ -2423,6 +2426,10 @@ static bool TestSNICallback(bool is_dtls, const SSL_METHOD *method,
!SSL_CTX_use_PrivateKey(server_ctx.get(), key.get()) ||
!SSL_CTX_use_certificate(server_ctx2.get(), cert2.get()) ||
!SSL_CTX_use_PrivateKey(server_ctx2.get(), key2.get()) ||
+ !SSL_CTX_set_signed_cert_timestamp_list(server_ctx2.get(), kSCTList,
+ sizeof(kSCTList)) ||
+ !SSL_CTX_set_ocsp_response(server_ctx2.get(), kOCSPResponse,
+ sizeof(kOCSPResponse)) ||
// Historically signing preferences would be lost in some cases with the
// SNI callback, which triggers the TLS 1.2 SHA-1 default. To ensure
// this doesn't happen when |version| is TLS 1.2, configure the private
@@ -2441,6 +2448,9 @@ static bool TestSNICallback(bool is_dtls, const SSL_METHOD *method,
SSL_CTX_set_tlsext_servername_callback(server_ctx.get(), SwitchContext);
SSL_CTX_set_tlsext_servername_arg(server_ctx.get(), server_ctx2.get());
+ SSL_CTX_enable_signed_cert_timestamps(client_ctx.get());
+ SSL_CTX_enable_ocsp_stapling(client_ctx.get());
+
bssl::UniquePtr<SSL> client, server;
if (!ConnectClientAndServer(&client, &server, client_ctx.get(),
server_ctx.get(), nullptr)) {
@@ -2455,6 +2465,22 @@ static bool TestSNICallback(bool is_dtls, const SSL_METHOD *method,
return false;
}
+ // The client should have received |server_ctx2|'s SCT list.
+ const uint8_t *data;
+ size_t len;
+ SSL_get0_signed_cert_timestamp_list(client.get(), &data, &len);
+ if (Bytes(kSCTList) != Bytes(data, len)) {
+ fprintf(stderr, "Incorrect SCT list received.\n");
+ return false;
+ }
+
+ // The client should have received |server_ctx2|'s OCSP response.
+ SSL_get0_ocsp_response(client.get(), &data, &len);
+ if (Bytes(kOCSPResponse) != Bytes(data, len)) {
+ fprintf(stderr, "Incorrect OCSP response received.\n");
+ return false;
+ }
+
return true;
}
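
The test's kSCTList is the minimal shape ssl_is_sct_list_valid accepts: a 16-bit list length (0x0006), then one SCT with a 16-bit length (0x0004) and four body bytes. A self-contained walker for that framing (a sketch of the validation, not the library routine itself):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Returns 1 if |in| is a well-formed, non-empty SCT list per the
     * RFC 6962 framing: u16 total length, then u16-length-prefixed SCTs. */
    static int sct_list_shape_ok(const uint8_t *in, size_t len) {
      if (len < 2) return 0;
      size_t total = ((size_t)in[0] << 8) | in[1];
      if (total != len - 2 || total == 0) return 0;
      for (size_t i = 2; i < len;) {
        if (len - i < 2) return 0;
        size_t sct_len = ((size_t)in[i] << 8) | in[i + 1];
        i += 2;
        if (sct_len == 0 || len - i < sct_len) return 0;
        i += sct_len;
      }
      return 1;
    }

    int main(void) {
      static const uint8_t kSCTList[] = {0, 6, 0, 4, 5, 6, 7, 8};
      assert(sct_list_shape_ok(kSCTList, sizeof(kSCTList)));
      return 0;
    }
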
diff --git a/src/ssl/t1_enc.c b/src/ssl/t1_enc.c
index d01992e7..9f11e056 100644
--- a/src/ssl/t1_enc.c
+++ b/src/ssl/t1_enc.c
@@ -330,8 +330,8 @@ static int tls1_setup_key_block(SSL_HANDSHAKE *hs) {
}
SSL_SESSION *session = ssl->session;
- if (ssl->s3->new_session != NULL) {
- session = ssl->s3->new_session;
+ if (hs->new_session != NULL) {
+ session = hs->new_session;
}
const EVP_AEAD *aead = NULL;
@@ -427,10 +427,9 @@ int tls1_change_cipher_state(SSL_HANDSHAKE *hs, int which) {
iv = server_write_iv;
}
- SSL_AEAD_CTX *aead_ctx =
- SSL_AEAD_CTX_new(is_read ? evp_aead_open : evp_aead_seal,
- ssl3_protocol_version(ssl), ssl->s3->tmp.new_cipher, key,
- key_len, mac_secret, mac_secret_len, iv, iv_len);
+ SSL_AEAD_CTX *aead_ctx = SSL_AEAD_CTX_new(
+ is_read ? evp_aead_open : evp_aead_seal, ssl3_protocol_version(ssl),
+ hs->new_cipher, key, key_len, mac_secret, mac_secret_len, iv, iv_len);
if (aead_ctx == NULL) {
return 0;
}
@@ -474,7 +473,7 @@ int tls1_generate_master_secret(SSL_HANDSHAKE *hs, uint8_t *out,
const uint8_t *premaster,
size_t premaster_len) {
const SSL *ssl = hs->ssl;
- if (ssl->s3->tmp.extended_master_secret) {
+ if (hs->extended_master_secret) {
uint8_t digests[EVP_MAX_MD_SIZE];
size_t digests_len;
if (!SSL_TRANSCRIPT_get_hash(&hs->transcript, digests, &digests_len) ||
diff --git a/src/ssl/t1_lib.c b/src/ssl/t1_lib.c
index 7723ccd3..d6ef1ffd 100644
--- a/src/ssl/t1_lib.c
+++ b/src/ssl/t1_lib.c
@@ -616,9 +616,9 @@ static int ext_sni_parse_serverhello(SSL_HANDSHAKE *hs, uint8_t *out_alert,
assert(ssl->tlsext_hostname != NULL);
if (ssl->session == NULL) {
- OPENSSL_free(ssl->s3->new_session->tlsext_hostname);
- ssl->s3->new_session->tlsext_hostname = BUF_strdup(ssl->tlsext_hostname);
- if (!ssl->s3->new_session->tlsext_hostname) {
+ OPENSSL_free(hs->new_session->tlsext_hostname);
+ hs->new_session->tlsext_hostname = BUF_strdup(ssl->tlsext_hostname);
+ if (!hs->new_session->tlsext_hostname) {
*out_alert = SSL_AD_INTERNAL_ERROR;
return 0;
}
@@ -870,38 +870,32 @@ static int ext_ems_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) {
static int ext_ems_parse_serverhello(SSL_HANDSHAKE *hs, uint8_t *out_alert,
CBS *contents) {
SSL *const ssl = hs->ssl;
- /* Whether EMS is negotiated may not change on renegotation. */
- if (ssl->s3->initial_handshake_complete) {
- if ((contents != NULL) != ssl->s3->tmp.extended_master_secret) {
- OPENSSL_PUT_ERROR(SSL, SSL_R_RENEGOTIATION_EMS_MISMATCH);
- *out_alert = SSL_AD_ILLEGAL_PARAMETER;
+
+ if (contents != NULL) {
+ if (ssl3_protocol_version(ssl) >= TLS1_3_VERSION ||
+ ssl->version == SSL3_VERSION ||
+ CBS_len(contents) != 0) {
return 0;
}
- return 1;
- }
-
- if (contents == NULL) {
- return 1;
- }
-
- if (ssl3_protocol_version(ssl) >= TLS1_3_VERSION ||
- ssl->version == SSL3_VERSION) {
- return 0;
+ hs->extended_master_secret = 1;
}
- if (CBS_len(contents) != 0) {
+ /* Whether EMS is negotiated may not change on renegotiation. */
+ if (ssl->s3->established_session != NULL &&
+ hs->extended_master_secret !=
+ ssl->s3->established_session->extended_master_secret) {
+ OPENSSL_PUT_ERROR(SSL, SSL_R_RENEGOTIATION_EMS_MISMATCH);
+ *out_alert = SSL_AD_ILLEGAL_PARAMETER;
return 0;
}
- ssl->s3->tmp.extended_master_secret = 1;
return 1;
}
static int ext_ems_parse_clienthello(SSL_HANDSHAKE *hs, uint8_t *out_alert,
CBS *contents) {
- SSL *const ssl = hs->ssl;
- uint16_t version = ssl3_protocol_version(ssl);
+ uint16_t version = ssl3_protocol_version(hs->ssl);
if (version >= TLS1_3_VERSION ||
version == SSL3_VERSION) {
return 1;
@@ -915,12 +909,12 @@ static int ext_ems_parse_clienthello(SSL_HANDSHAKE *hs, uint8_t *out_alert,
return 0;
}
- ssl->s3->tmp.extended_master_secret = 1;
+ hs->extended_master_secret = 1;
return 1;
}
static int ext_ems_add_serverhello(SSL_HANDSHAKE *hs, CBB *out) {
- if (!hs->ssl->s3->tmp.extended_master_secret) {
+ if (!hs->extended_master_secret) {
return 1;
}
@@ -1118,7 +1112,7 @@ static int ext_ocsp_parse_serverhello(SSL_HANDSHAKE *hs, uint8_t *out_alert,
/* OCSP stapling is forbidden on non-certificate ciphers. */
if (CBS_len(contents) != 0 ||
- !ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+ !ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
return 0;
}
@@ -1152,9 +1146,9 @@ static int ext_ocsp_add_serverhello(SSL_HANDSHAKE *hs, CBB *out) {
SSL *const ssl = hs->ssl;
if (ssl3_protocol_version(ssl) >= TLS1_3_VERSION ||
!hs->ocsp_stapling_requested ||
- ssl->ocsp_response == NULL ||
+ ssl->cert->ocsp_response == NULL ||
ssl->s3->session_reused ||
- !ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+ !ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
return 1;
}
@@ -1341,10 +1335,8 @@ static int ext_sct_parse_serverhello(SSL_HANDSHAKE *hs, uint8_t *out_alert,
*
* TODO(davidben): Enforce this anyway. */
if (!ssl->s3->session_reused &&
- !CBS_stow(
- contents,
- &ssl->s3->new_session->tlsext_signed_cert_timestamp_list,
- &ssl->s3->new_session->tlsext_signed_cert_timestamp_list_length)) {
+ !CBS_stow(contents, &hs->new_session->tlsext_signed_cert_timestamp_list,
+ &hs->new_session->tlsext_signed_cert_timestamp_list_length)) {
*out_alert = SSL_AD_INTERNAL_ERROR;
return 0;
}
@@ -1371,16 +1363,17 @@ static int ext_sct_add_serverhello(SSL_HANDSHAKE *hs, CBB *out) {
/* The extension shouldn't be sent when resuming sessions. */
if (ssl3_protocol_version(ssl) >= TLS1_3_VERSION ||
ssl->s3->session_reused ||
- ssl->signed_cert_timestamp_list == NULL) {
+ ssl->cert->signed_cert_timestamp_list == NULL) {
return 1;
}
CBB contents;
return CBB_add_u16(out, TLSEXT_TYPE_certificate_timestamp) &&
CBB_add_u16_length_prefixed(out, &contents) &&
- CBB_add_bytes(&contents,
- CRYPTO_BUFFER_data(ssl->signed_cert_timestamp_list),
- CRYPTO_BUFFER_len(ssl->signed_cert_timestamp_list)) &&
+ CBB_add_bytes(
+ &contents,
+ CRYPTO_BUFFER_data(ssl->cert->signed_cert_timestamp_list),
+ CRYPTO_BUFFER_len(ssl->cert->signed_cert_timestamp_list)) &&
CBB_flush(out);
}
@@ -1852,8 +1845,8 @@ static int ext_ec_point_add_serverhello(SSL_HANDSHAKE *hs, CBB *out) {
return 1;
}
- const uint32_t alg_k = ssl->s3->tmp.new_cipher->algorithm_mkey;
- const uint32_t alg_a = ssl->s3->tmp.new_cipher->algorithm_auth;
+ const uint32_t alg_k = hs->new_cipher->algorithm_mkey;
+ const uint32_t alg_a = hs->new_cipher->algorithm_auth;
const int using_ecc = (alg_k & SSL_kECDHE) || (alg_a & SSL_aECDSA);
if (!using_ecc) {
@@ -2218,7 +2211,6 @@ static int ext_key_share_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) {
int ssl_ext_key_share_parse_serverhello(SSL_HANDSHAKE *hs, uint8_t **out_secret,
size_t *out_secret_len,
uint8_t *out_alert, CBS *contents) {
- SSL *const ssl = hs->ssl;
CBS peer_key;
uint16_t group_id;
if (!CBS_get_u16(contents, &group_id) ||
@@ -2240,7 +2232,7 @@ int ssl_ext_key_share_parse_serverhello(SSL_HANDSHAKE *hs, uint8_t **out_secret,
return 0;
}
- ssl->s3->new_session->group_id = group_id;
+ hs->new_session->group_id = group_id;
SSL_ECDH_CTX_cleanup(&hs->ecdh_ctx);
return 1;
}
@@ -2322,7 +2314,6 @@ int ssl_ext_key_share_parse_clienthello(SSL_HANDSHAKE *hs, int *out_found,
}
int ssl_ext_key_share_add_serverhello(SSL_HANDSHAKE *hs, CBB *out) {
- SSL *const ssl = hs->ssl;
uint16_t group_id;
CBB kse_bytes, public_key;
if (!tls1_get_shared_group(hs, &group_id) ||
@@ -2339,7 +2330,7 @@ int ssl_ext_key_share_add_serverhello(SSL_HANDSHAKE *hs, CBB *out) {
hs->public_key = NULL;
hs->public_key_len = 0;
- ssl->s3->new_session->group_id = group_id;
+ hs->new_session->group_id = group_id;
return 1;
}
@@ -3518,7 +3509,7 @@ int tls1_channel_id_hash(SSL_HANDSHAKE *hs, uint8_t *out, size_t *out_len) {
}
/* tls1_record_handshake_hashes_for_channel_id records the current handshake
- * hashes in |ssl->s3->new_session| so that Channel ID resumptions can sign that
+ * hashes in |hs->new_session| so that Channel ID resumptions can sign that
* data. */
int tls1_record_handshake_hashes_for_channel_id(SSL_HANDSHAKE *hs) {
SSL *const ssl = hs->ssl;
@@ -3530,18 +3521,18 @@ int tls1_record_handshake_hashes_for_channel_id(SSL_HANDSHAKE *hs) {
}
OPENSSL_COMPILE_ASSERT(
- sizeof(ssl->s3->new_session->original_handshake_hash) == EVP_MAX_MD_SIZE,
+ sizeof(hs->new_session->original_handshake_hash) == EVP_MAX_MD_SIZE,
original_handshake_hash_is_too_small);
size_t digest_len;
if (!SSL_TRANSCRIPT_get_hash(&hs->transcript,
- ssl->s3->new_session->original_handshake_hash,
+ hs->new_session->original_handshake_hash,
&digest_len)) {
return -1;
}
OPENSSL_COMPILE_ASSERT(EVP_MAX_MD_SIZE <= 0xff, max_md_size_is_too_large);
- ssl->s3->new_session->original_handshake_hash_len = (uint8_t)digest_len;
+ hs->new_session->original_handshake_hash_len = (uint8_t)digest_len;
return 1;
}
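
[The extended-master-secret hunks above collapse the parser into a single pass: the extension is only legal below TLS 1.3 and above SSL 3.0, must carry an empty body, and the negotiated value may not flip across a renegotiation. Below is a standalone C model of that rule; every name is a hypothetical stand-in for the SSL_HANDSHAKE fields, not BoringSSL API.]

#include <stddef.h>
#include <stdio.h>

/* Model of the EMS rule encoded in the hunks above. Returns -1 to reject
 * the handshake, else the negotiated EMS value (0 or 1). */
static int ems_negotiate(int offered, size_t body_len, int tls13_or_later,
                         int is_ssl3, int have_established_session,
                         int established_ems) {
  int ems = 0;
  if (offered) {
    if (tls13_or_later || is_ssl3 || body_len != 0) {
      return -1;  /* wrong protocol version or non-empty extension body */
    }
    ems = 1;
  }
  /* Whether EMS is negotiated may not change on renegotiation. */
  if (have_established_session && established_ems != ems) {
    return -1;  /* maps to SSL_R_RENEGOTIATION_EMS_MISMATCH above */
  }
  return ems;
}

int main(void) {
  printf("%d\n", ems_negotiate(1, 0, 0, 0, 0, 0));  /* 1: fresh, accepted */
  printf("%d\n", ems_negotiate(0, 0, 0, 0, 1, 1));  /* -1: dropped on renego */
  return 0;
}
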
diff --git a/src/ssl/test/bssl_shim.cc b/src/ssl/test/bssl_shim.cc
index 381f4c2f..dd61ffb4 100644
--- a/src/ssl/test/bssl_shim.cc
+++ b/src/ssl/test/bssl_shim.cc
@@ -1584,13 +1584,11 @@ static bool DoExchange(bssl::UniquePtr<SSL_SESSION> *out_session,
!SSL_set_srtp_profiles(ssl.get(), config->srtp_profiles.c_str())) {
return false;
}
- if (config->enable_ocsp_stapling &&
- !SSL_enable_ocsp_stapling(ssl.get())) {
- return false;
+ if (config->enable_ocsp_stapling) {
+ SSL_enable_ocsp_stapling(ssl.get());
}
- if (config->enable_signed_cert_timestamps &&
- !SSL_enable_signed_cert_timestamps(ssl.get())) {
- return false;
+ if (config->enable_signed_cert_timestamps) {
+ SSL_enable_signed_cert_timestamps(ssl.get());
}
if (config->min_version != 0 &&
!SSL_set_min_proto_version(ssl.get(), (uint16_t)config->min_version)) {
diff --git a/src/ssl/test/runner/runner.go b/src/ssl/test/runner/runner.go
index d6e984a0..d7bad5bf 100644
--- a/src/ssl/test/runner/runner.go
+++ b/src/ssl/test/runner/runner.go
@@ -6395,7 +6395,7 @@ func addRenegotiationTests() {
// this case. https://crbug.com/boringssl/130
})
- // Stray HelloRequests during the handshake are ignored in TLS 1.2.
+ // We reject stray HelloRequests during the handshake in TLS 1.2.
testCases = append(testCases, testCase{
name: "StrayHelloRequest",
config: Config{
@@ -6404,6 +6404,8 @@ func addRenegotiationTests() {
SendHelloRequestBeforeEveryHandshakeMessage: true,
},
},
+ shouldFail: true,
+ expectedError: ":UNEXPECTED_MESSAGE:",
})
testCases = append(testCases, testCase{
name: "StrayHelloRequest-Packed",
@@ -6414,6 +6416,8 @@ func addRenegotiationTests() {
SendHelloRequestBeforeEveryHandshakeMessage: true,
},
},
+ shouldFail: true,
+ expectedError: ":UNEXPECTED_MESSAGE:",
})
// Test renegotiation works if HelloRequest and server Finished come in
diff --git a/src/ssl/tls13_both.c b/src/ssl/tls13_both.c
index 19dd555b..91cae9ad 100644
--- a/src/ssl/tls13_both.c
+++ b/src/ssl/tls13_both.c
@@ -211,7 +211,7 @@ int tls13_process_certificate(SSL_HANDSHAKE *hs, int allow_anonymous) {
if (retain_sha256) {
/* Retain the hash of the leaf certificate if requested. */
SHA256(CBS_data(&certificate), CBS_len(&certificate),
- ssl->s3->new_session->peer_sha256);
+ hs->new_session->peer_sha256);
}
}
@@ -262,8 +262,8 @@ int tls13_process_certificate(SSL_HANDSHAKE *hs, int allow_anonymous) {
}
if (sk_CRYPTO_BUFFER_num(certs) == 1 &&
- !CBS_stow(&ocsp_response, &ssl->s3->new_session->ocsp_response,
- &ssl->s3->new_session->ocsp_response_length)) {
+ !CBS_stow(&ocsp_response, &hs->new_session->ocsp_response,
+ &hs->new_session->ocsp_response_length)) {
ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
goto err;
}
@@ -283,10 +283,9 @@ int tls13_process_certificate(SSL_HANDSHAKE *hs, int allow_anonymous) {
}
if (sk_CRYPTO_BUFFER_num(certs) == 1 &&
- !CBS_stow(&sct,
- &ssl->s3->new_session->tlsext_signed_cert_timestamp_list,
- &ssl->s3->new_session
- ->tlsext_signed_cert_timestamp_list_length)) {
+ !CBS_stow(
+ &sct, &hs->new_session->tlsext_signed_cert_timestamp_list,
+ &hs->new_session->tlsext_signed_cert_timestamp_list_length)) {
ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
goto err;
}
@@ -303,17 +302,17 @@ int tls13_process_certificate(SSL_HANDSHAKE *hs, int allow_anonymous) {
hs->peer_pubkey = pkey;
pkey = NULL;
- sk_CRYPTO_BUFFER_pop_free(ssl->s3->new_session->certs, CRYPTO_BUFFER_free);
- ssl->s3->new_session->certs = certs;
+ sk_CRYPTO_BUFFER_pop_free(hs->new_session->certs, CRYPTO_BUFFER_free);
+ hs->new_session->certs = certs;
certs = NULL;
- if (!ssl->ctx->x509_method->session_cache_objects(ssl->s3->new_session)) {
+ if (!ssl->ctx->x509_method->session_cache_objects(hs->new_session)) {
OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR);
ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_DECODE_ERROR);
goto err;
}
- if (sk_CRYPTO_BUFFER_num(ssl->s3->new_session->certs) == 0) {
+ if (sk_CRYPTO_BUFFER_num(hs->new_session->certs) == 0) {
if (!allow_anonymous) {
OPENSSL_PUT_ERROR(SSL, SSL_R_PEER_DID_NOT_RETURN_A_CERTIFICATE);
ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_CERTIFICATE_REQUIRED);
@@ -322,17 +321,17 @@ int tls13_process_certificate(SSL_HANDSHAKE *hs, int allow_anonymous) {
/* OpenSSL returns X509_V_OK when no certificates are requested. This is
* classed by them as a bug, but it's assumed by at least NGINX. */
- ssl->s3->new_session->verify_result = X509_V_OK;
+ hs->new_session->verify_result = X509_V_OK;
/* No certificate, so nothing more to do. */
ret = 1;
goto err;
}
- ssl->s3->new_session->peer_sha256_valid = retain_sha256;
+ hs->new_session->peer_sha256_valid = retain_sha256;
- if (!ssl_verify_cert_chain(ssl, &ssl->s3->new_session->verify_result,
- ssl->s3->new_session->x509_chain)) {
+ if (!ssl_verify_cert_chain(ssl, &hs->new_session->verify_result,
+ hs->new_session->x509_chain)) {
goto err;
}
@@ -370,7 +369,7 @@ int tls13_process_certificate_verify(SSL_HANDSHAKE *hs) {
ssl3_send_alert(ssl, SSL3_AL_FATAL, al);
goto err;
}
- ssl->s3->new_session->peer_signature_algorithm = signature_algorithm;
+ hs->new_session->peer_signature_algorithm = signature_algorithm;
if (!tls13_get_cert_verify_signature_input(
hs, &msg, &msg_len,
@@ -452,13 +451,14 @@ int tls13_add_certificate(SSL_HANDSHAKE *hs) {
goto err;
}
- if (hs->scts_requested && ssl->signed_cert_timestamp_list != NULL) {
+ if (hs->scts_requested && ssl->cert->signed_cert_timestamp_list != NULL) {
CBB contents;
if (!CBB_add_u16(&extensions, TLSEXT_TYPE_certificate_timestamp) ||
!CBB_add_u16_length_prefixed(&extensions, &contents) ||
- !CBB_add_bytes(&contents,
- CRYPTO_BUFFER_data(ssl->signed_cert_timestamp_list),
- CRYPTO_BUFFER_len(ssl->signed_cert_timestamp_list)) ||
+ !CBB_add_bytes(
+ &contents,
+ CRYPTO_BUFFER_data(ssl->cert->signed_cert_timestamp_list),
+ CRYPTO_BUFFER_len(ssl->cert->signed_cert_timestamp_list)) ||
!CBB_flush(&extensions)) {
OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
goto err;
@@ -466,14 +466,15 @@ int tls13_add_certificate(SSL_HANDSHAKE *hs) {
}
if (hs->ocsp_stapling_requested &&
- ssl->ocsp_response != NULL) {
+ ssl->cert->ocsp_response != NULL) {
CBB contents, ocsp_response;
if (!CBB_add_u16(&extensions, TLSEXT_TYPE_status_request) ||
!CBB_add_u16_length_prefixed(&extensions, &contents) ||
!CBB_add_u8(&contents, TLSEXT_STATUSTYPE_ocsp) ||
!CBB_add_u24_length_prefixed(&contents, &ocsp_response) ||
- !CBB_add_bytes(&ocsp_response, CRYPTO_BUFFER_data(ssl->ocsp_response),
- CRYPTO_BUFFER_len(ssl->ocsp_response)) ||
+ !CBB_add_bytes(&ocsp_response,
+ CRYPTO_BUFFER_data(ssl->cert->ocsp_response),
+ CRYPTO_BUFFER_len(ssl->cert->ocsp_response)) ||
!CBB_flush(&extensions)) {
OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
goto err;
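
[Several hunks above stow OCSP and SCT payloads into hs->new_session through CBS_stow, which copies a CBS's remaining bytes into a freshly allocated buffer and fails only on allocation. A minimal standalone use of that public bytestring API, compiled against BoringSSL, with toy payload bytes:]

#include <stdint.h>
#include <stdio.h>

#include <openssl/bytestring.h>
#include <openssl/mem.h>

int main(void) {
  static const uint8_t kPayload[] = {0x30, 0x03, 0x01, 0x01, 0xff};

  CBS cbs;
  CBS_init(&cbs, kPayload, sizeof(kPayload));

  /* CBS_stow duplicates the remaining bytes; on failure the callers above
   * send an internal_error alert and abort the handshake. */
  uint8_t *copy = NULL;
  size_t copy_len = 0;
  if (!CBS_stow(&cbs, &copy, &copy_len)) {
    return 1;
  }
  printf("stowed %zu bytes\n", copy_len);
  OPENSSL_free(copy);
  return 0;
}
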
diff --git a/src/ssl/tls13_client.c b/src/ssl/tls13_client.c
index 50f7e5a6..8e994e58 100644
--- a/src/ssl/tls13_client.c
+++ b/src/ssl/tls13_client.c
@@ -251,24 +251,34 @@ static enum ssl_hs_wait_t do_process_server_hello(SSL_HANDSHAKE *hs) {
ssl->s3->session_reused = 1;
/* Only authentication information carries over in TLS 1.3. */
- ssl->s3->new_session =
- SSL_SESSION_dup(ssl->session, SSL_SESSION_DUP_AUTH_ONLY);
- if (ssl->s3->new_session == NULL) {
+ hs->new_session = SSL_SESSION_dup(ssl->session, SSL_SESSION_DUP_AUTH_ONLY);
+ if (hs->new_session == NULL) {
ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
return ssl_hs_error;
}
ssl_set_session(ssl, NULL);
/* Resumption incorporates fresh key material, so refresh the timeout. */
- ssl_session_renew_timeout(ssl, ssl->s3->new_session,
+ ssl_session_renew_timeout(ssl, hs->new_session,
ssl->initial_ctx->session_psk_dhe_timeout);
} else if (!ssl_get_new_session(hs, 0)) {
ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
return ssl_hs_error;
}
- ssl->s3->new_session->cipher = cipher;
- ssl->s3->tmp.new_cipher = cipher;
+ hs->new_session->cipher = cipher;
+ hs->new_cipher = cipher;
+
+ /* Store the initial negotiated ALPN in the session. */
+ if (ssl->s3->alpn_selected != NULL) {
+ hs->new_session->early_alpn =
+ BUF_memdup(ssl->s3->alpn_selected, ssl->s3->alpn_selected_len);
+ if (hs->new_session->early_alpn == NULL) {
+ ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
+ return ssl_hs_error;
+ }
+ hs->new_session->early_alpn_len = ssl->s3->alpn_selected_len;
+ }
/* The PRF hash is now known. Set up the key schedule. */
if (!tls13_init_key_schedule(hs)) {
@@ -277,8 +287,8 @@ static enum ssl_hs_wait_t do_process_server_hello(SSL_HANDSHAKE *hs) {
/* Incorporate the PSK into the running secret. */
if (ssl->s3->session_reused) {
- if (!tls13_advance_key_schedule(hs, ssl->s3->new_session->master_key,
- ssl->s3->new_session->master_key_length)) {
+ if (!tls13_advance_key_schedule(hs, hs->new_session->master_key,
+ hs->new_session->master_key_length)) {
return ssl_hs_error;
}
} else if (!tls13_advance_key_schedule(hs, kZeroes, hs->hash_len)) {
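
[The added client hunk records the initially negotiated ALPN protocol in the session via BUF_memdup, a copy-or-NULL helper. A short sketch of the same stash-and-check pattern, with toy data standing in for ssl->s3->alpn_selected:]

#include <stdint.h>
#include <stdio.h>

#include <openssl/buf.h>
#include <openssl/mem.h>

int main(void) {
  const uint8_t alpn_selected[] = {'h', '2'};

  /* BUF_memdup returns a fresh copy or NULL; the handshake code above
   * turns NULL into a fatal internal_error alert. */
  uint8_t *early_alpn = BUF_memdup(alpn_selected, sizeof(alpn_selected));
  if (early_alpn == NULL) {
    return 1;
  }
  size_t early_alpn_len = sizeof(alpn_selected);
  printf("stored %zu-byte ALPN\n", early_alpn_len);
  OPENSSL_free(early_alpn);
  return 0;
}
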
diff --git a/src/ssl/tls13_enc.c b/src/ssl/tls13_enc.c
index 4d140e3c..412705da 100644
--- a/src/ssl/tls13_enc.c
+++ b/src/ssl/tls13_enc.c
@@ -30,7 +30,7 @@
int tls13_init_key_schedule(SSL_HANDSHAKE *hs) {
if (!SSL_TRANSCRIPT_init_hash(&hs->transcript, ssl3_protocol_version(hs->ssl),
- hs->ssl->s3->tmp.new_cipher->algorithm_prf)) {
+ hs->new_cipher->algorithm_prf)) {
return 0;
}
@@ -237,17 +237,15 @@ int tls13_rotate_traffic_key(SSL *ssl, enum evp_aead_direction_t direction) {
static const char kTLS13LabelResumption[] = "resumption master secret";
int tls13_derive_resumption_secret(SSL_HANDSHAKE *hs) {
- SSL *const ssl = hs->ssl;
- if (ssl->s3->hs->hash_len > SSL_MAX_MASTER_KEY_LENGTH) {
+ if (hs->hash_len > SSL_MAX_MASTER_KEY_LENGTH) {
OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
return 0;
}
- ssl->s3->new_session->master_key_length = hs->hash_len;
- return derive_secret(hs, ssl->s3->new_session->master_key,
- ssl->s3->new_session->master_key_length,
- (const uint8_t *)kTLS13LabelResumption,
- strlen(kTLS13LabelResumption));
+ hs->new_session->master_key_length = hs->hash_len;
+ return derive_secret(
+ hs, hs->new_session->master_key, hs->new_session->master_key_length,
+ (const uint8_t *)kTLS13LabelResumption, strlen(kTLS13LabelResumption));
}
static const char kTLS13LabelFinished[] = "finished";
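
[tls13_derive_resumption_secret now writes straight into hs->new_session. The derive_secret it calls is, at this draft of TLS 1.3, an HKDF-Expand-Label of the current secret under the "resumption master secret" label and the transcript hash. As a loose illustration only — this skips the HkdfLabel encoding and uses toy inputs — BoringSSL's public HKDF API expands a secret under a label like so:]

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include <openssl/digest.h>
#include <openssl/hkdf.h>

int main(void) {
  /* Toy inputs; the real code uses the running handshake secret and folds
   * in the transcript hash via the HkdfLabel structure. */
  static const uint8_t secret[32] = {0};
  static const char kLabel[] = "resumption master secret";

  uint8_t out[32];
  if (!HKDF_expand(out, sizeof(out), EVP_sha256(), secret, sizeof(secret),
                   (const uint8_t *)kLabel, strlen(kLabel))) {
    return 1;
  }
  printf("derived %zu bytes\n", sizeof(out));
  return 0;
}
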
diff --git a/src/ssl/tls13_server.c b/src/ssl/tls13_server.c
index 0278b500..402c2343 100644
--- a/src/ssl/tls13_server.c
+++ b/src/ssl/tls13_server.c
@@ -150,8 +150,8 @@ static enum ssl_hs_wait_t do_select_parameters(SSL_HANDSHAKE *hs) {
}
/* Negotiate the cipher suite. */
- ssl->s3->tmp.new_cipher = choose_tls13_cipher(ssl, &client_hello);
- if (ssl->s3->tmp.new_cipher == NULL) {
+ hs->new_cipher = choose_tls13_cipher(ssl, &client_hello);
+ if (hs->new_cipher == NULL) {
OPENSSL_PUT_ERROR(SSL, SSL_R_NO_SHARED_CIPHER);
ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE);
return ssl_hs_error;
@@ -189,7 +189,7 @@ static enum ssl_hs_wait_t do_select_parameters(SSL_HANDSHAKE *hs) {
}
if (session != NULL &&
- !ssl_session_is_resumable(ssl, session)) {
+ !ssl_session_is_resumable(hs, session)) {
SSL_SESSION_free(session);
session = NULL;
}
@@ -202,13 +202,13 @@ static enum ssl_hs_wait_t do_select_parameters(SSL_HANDSHAKE *hs) {
return ssl_hs_error;
}
- ssl->s3->new_session->cipher = ssl->s3->tmp.new_cipher;
+ hs->new_session->cipher = hs->new_cipher;
/* On new sessions, stash the SNI value in the session. */
if (hs->hostname != NULL) {
- OPENSSL_free(ssl->s3->new_session->tlsext_hostname);
- ssl->s3->new_session->tlsext_hostname = BUF_strdup(hs->hostname);
- if (ssl->s3->new_session->tlsext_hostname == NULL) {
+ OPENSSL_free(hs->new_session->tlsext_hostname);
+ hs->new_session->tlsext_hostname = BUF_strdup(hs->hostname);
+ if (hs->new_session->tlsext_hostname == NULL) {
ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
return ssl_hs_error;
}
@@ -222,8 +222,8 @@ static enum ssl_hs_wait_t do_select_parameters(SSL_HANDSHAKE *hs) {
}
/* Only authentication information carries over in TLS 1.3. */
- ssl->s3->new_session = SSL_SESSION_dup(session, SSL_SESSION_DUP_AUTH_ONLY);
- if (ssl->s3->new_session == NULL) {
+ hs->new_session = SSL_SESSION_dup(session, SSL_SESSION_DUP_AUTH_ONLY);
+ if (hs->new_session == NULL) {
ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
return ssl_hs_error;
}
@@ -231,7 +231,7 @@ static enum ssl_hs_wait_t do_select_parameters(SSL_HANDSHAKE *hs) {
SSL_SESSION_free(session);
/* Resumption incorporates fresh key material, so refresh the timeout. */
- ssl_session_renew_timeout(ssl, ssl->s3->new_session,
+ ssl_session_renew_timeout(ssl, hs->new_session,
ssl->initial_ctx->session_psk_dhe_timeout);
}
@@ -251,10 +251,21 @@ static enum ssl_hs_wait_t do_select_parameters(SSL_HANDSHAKE *hs) {
return ssl_hs_error;
}
+ /* Store the initial negotiated ALPN in the session. */
+ if (ssl->s3->alpn_selected != NULL) {
+ hs->new_session->early_alpn =
+ BUF_memdup(ssl->s3->alpn_selected, ssl->s3->alpn_selected_len);
+ if (hs->new_session->early_alpn == NULL) {
+ ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
+ return ssl_hs_error;
+ }
+ hs->new_session->early_alpn_len = ssl->s3->alpn_selected_len;
+ }
+
/* Incorporate the PSK into the running secret. */
if (ssl->s3->session_reused) {
- if (!tls13_advance_key_schedule(hs, ssl->s3->new_session->master_key,
- ssl->s3->new_session->master_key_length)) {
+ if (!tls13_advance_key_schedule(hs, hs->new_session->master_key,
+ hs->new_session->master_key_length)) {
return ssl_hs_error;
}
} else if (!tls13_advance_key_schedule(hs, kZeroes, hs->hash_len)) {
@@ -340,7 +351,7 @@ static enum ssl_hs_wait_t do_send_server_hello(SSL_HANDSHAKE *hs) {
!CBB_add_u16(&body, ssl->version) ||
!RAND_bytes(ssl->s3->server_random, sizeof(ssl->s3->server_random)) ||
!CBB_add_bytes(&body, ssl->s3->server_random, SSL3_RANDOM_SIZE) ||
- !CBB_add_u16(&body, ssl_cipher_get_value(ssl->s3->tmp.new_cipher)) ||
+ !CBB_add_u16(&body, ssl_cipher_get_value(hs->new_cipher)) ||
!CBB_add_u16_length_prefixed(&body, &extensions) ||
!ssl_ext_pre_shared_key_add_serverhello(hs, &extensions) ||
!ssl_ext_key_share_add_serverhello(hs, &extensions)) {
@@ -472,7 +483,7 @@ static enum ssl_hs_wait_t do_process_client_certificate(SSL_HANDSHAKE *hs) {
if (!hs->cert_request) {
/* OpenSSL returns X509_V_OK when no certificates are requested. This is
* classed by them as a bug, but it's assumed by at least NGINX. */
- ssl->s3->new_session->verify_result = X509_V_OK;
+ hs->new_session->verify_result = X509_V_OK;
/* Skip this state. */
hs->tls13_state = state_process_channel_id;
@@ -495,7 +506,7 @@ static enum ssl_hs_wait_t do_process_client_certificate(SSL_HANDSHAKE *hs) {
static enum ssl_hs_wait_t do_process_client_certificate_verify(
SSL_HANDSHAKE *hs) {
SSL *const ssl = hs->ssl;
- if (sk_CRYPTO_BUFFER_num(ssl->s3->new_session->certs) == 0) {
+ if (sk_CRYPTO_BUFFER_num(hs->new_session->certs) == 0) {
/* Skip this state. */
hs->tls13_state = state_process_channel_id;
return ssl_hs_ok;
@@ -543,7 +554,7 @@ static enum ssl_hs_wait_t do_process_client_finished(SSL_HANDSHAKE *hs) {
/* Rebase the session timestamp so that it is measured from ticket
* issuance. */
- ssl_session_rebase_time(ssl, ssl->s3->new_session);
+ ssl_session_rebase_time(ssl, hs->new_session);
hs->tls13_state = state_send_new_session_ticket;
return ssl_hs_ok;
}
@@ -561,7 +572,7 @@ static enum ssl_hs_wait_t do_send_new_session_ticket(SSL_HANDSHAKE *hs) {
return ssl_hs_ok;
}
- SSL_SESSION *session = ssl->s3->new_session;
+ SSL_SESSION *session = hs->new_session;
CBB cbb;
CBB_zero(&cbb);
diff --git a/src/tool/transport_common.cc b/src/tool/transport_common.cc
index cd3e0d69..5f1a366a 100644
--- a/src/tool/transport_common.cc
+++ b/src/tool/transport_common.cc
@@ -285,6 +285,11 @@ void PrintConnectionInfo(const SSL *ssl) {
size_t ocsp_staple_len;
SSL_get0_ocsp_response(ssl, &ocsp_staple, &ocsp_staple_len);
fprintf(stderr, " OCSP staple: %s\n", ocsp_staple_len > 0 ? "yes" : "no");
+
+ const uint8_t *sct_list;
+ size_t sct_list_len;
+ SSL_get0_signed_cert_timestamp_list(ssl, &sct_list, &sct_list_len);
+ fprintf(stderr, " SCT list: %s\n", sct_list_len > 0 ? "yes" : "no");
}
// Print the server cert subject and issuer names.
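
[The transport_common.cc hunk adds SCT reporting beside the existing OCSP line. From the client side, requesting and then inspecting both looks roughly like the sketch below (connection setup and the handshake itself are elided); both getters hand back an empty buffer when the server sent nothing:]

#include <stdio.h>

#include <openssl/ssl.h>

/* |ssl| is assumed to be a client SSL* that has not yet started its
 * handshake; both requests must be made before connecting. */
void request_staples(SSL *ssl) {
  SSL_enable_ocsp_stapling(ssl);
  SSL_enable_signed_cert_timestamps(ssl);
}

/* After a completed handshake, report what the server returned. */
void print_staples(const SSL *ssl) {
  const uint8_t *ocsp;
  size_t ocsp_len;
  SSL_get0_ocsp_response(ssl, &ocsp, &ocsp_len);
  fprintf(stderr, "OCSP staple: %s\n", ocsp_len > 0 ? "yes" : "no");

  const uint8_t *sct_list;
  size_t sct_list_len;
  SSL_get0_signed_cert_timestamp_list(ssl, &sct_list, &sct_list_len);
  fprintf(stderr, "SCT list: %s\n", sct_list_len > 0 ? "yes" : "no");
}
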
diff --git a/src/util/all_tests.json b/src/util/all_tests.json
index 76637b2c..fc49c698 100644
--- a/src/util/all_tests.json
+++ b/src/util/all_tests.json
@@ -5,7 +5,6 @@
["crypto/bio/bio_test"],
["crypto/bn/bn_test", "crypto/bn/bn_tests.txt"],
["crypto/bytestring/bytestring_test"],
- ["crypto/chacha/chacha_test"],
["crypto/cipher/aead_test", "aes-128-gcm", "crypto/cipher/test/aes_128_gcm_tests.txt"],
["crypto/cipher/aead_test", "aes-256-gcm", "crypto/cipher/test/aes_256_gcm_tests.txt"],
["crypto/cipher/aead_test", "aes-128-gcm-siv", "crypto/cipher/test/aes_128_gcm_siv_tests.txt"],
@@ -33,7 +32,6 @@
["crypto/curve25519/x25519_test"],
["crypto/curve25519/spake25519_test"],
["crypto/digest/digest_test"],
- ["crypto/ec/ec_test"],
["crypto/ec/example_mul"],
["crypto/ec/p256-x86_64_test", "crypto/ec/p256-x86_64_tests.txt"],
["crypto/ecdh/ecdh_test", "crypto/ecdh/ecdh_tests.txt"],
@@ -53,7 +51,6 @@
["crypto/poly1305/poly1305_test", "crypto/poly1305/poly1305_tests.txt"],
["crypto/pool/pool_test"],
["crypto/refcount_test"],
- ["crypto/rsa/rsa_test"],
["crypto/thread_test"],
["crypto/x509/pkcs7_test"],
["crypto/x509/x509_test"],
diff --git a/src/util/doc.config b/src/util/doc.config
index ddd56db2..f7e8baa1 100644
--- a/src/util/doc.config
+++ b/src/util/doc.config
@@ -16,8 +16,7 @@
"include/openssl/obj.h",
"include/openssl/pool.h",
"include/openssl/rand.h",
- "include/openssl/stack.h",
- "include/openssl/time_support.h"
+ "include/openssl/stack.h"
]
},{
"Name": "Low-level crypto primitives",
diff --git a/src/util/generate_build_files.py b/src/util/generate_build_files.py
index 8be7c906..a3435f2b 100644
--- a/src/util/generate_build_files.py
+++ b/src/util/generate_build_files.py
@@ -50,20 +50,6 @@ NON_PERL_FILES = {
],
}
-# For now, GTest-based tests are specified manually. Once everything has updated
-# to support GTest, these will be determined automatically by looking for files
-# ending with _test.cc.
-CRYPTO_TEST_SOURCES = [
- 'src/crypto/dh/dh_test.cc',
- 'src/crypto/dsa/dsa_test.cc',
-]
-DECREPIT_TEST_SOURCES = [
- 'src/decrepit/decrepit_test.cc',
-]
-SSL_TEST_SOURCES = [
- 'src/ssl/ssl_test.cc',
-]
-
PREFIX = None
@@ -464,13 +450,6 @@ def OnlyTests(dent, is_dir):
non-test sources."""
if is_dir:
return dent != 'test'
- # For now, GTest-based tests are specified manually.
- if dent in [os.path.basename(p) for p in CRYPTO_TEST_SOURCES]:
- return False
- if dent in [os.path.basename(p) for p in DECREPIT_TEST_SOURCES]:
- return False
- if dent in [os.path.basename(p) for p in SSL_TEST_SOURCES]:
- return False
return '_test.' in dent or dent.startswith('example_')
@@ -624,6 +603,11 @@ def WriteAsmFiles(perlasms):
return asmfiles
+def IsGTest(path):
+ with open(path) as f:
+ return "#include <gtest/gtest.h>" in f.read()
+
+
def main(platforms):
crypto_c_files = FindCFiles(os.path.join('src', 'crypto'), NoTests)
ssl_source_files = FindCFiles(os.path.join('src', 'ssl'), NoTests)
@@ -643,8 +627,17 @@ def main(platforms):
FindHeaderFiles(os.path.join('src', 'crypto', 'test'), AllFiles) +
FindHeaderFiles(os.path.join('src', 'ssl', 'test'), AllFiles))
- test_c_files = FindCFiles(os.path.join('src', 'crypto'), OnlyTests)
- test_c_files += FindCFiles(os.path.join('src', 'ssl'), OnlyTests)
+ test_c_files = []
+ crypto_test_files = ['src/crypto/test/gtest_main.cc']
+ # TODO(davidben): Remove this loop once all tests are converted.
+ for path in FindCFiles(os.path.join('src', 'crypto'), OnlyTests):
+ if IsGTest(path):
+ crypto_test_files.append(path)
+ else:
+ test_c_files.append(path)
+
+ ssl_test_files = FindCFiles(os.path.join('src', 'ssl'), OnlyTests)
+ ssl_test_files.append('src/crypto/test/gtest_main.cc')
fuzz_c_files = FindCFiles(os.path.join('src', 'fuzz'), NoTests)
@@ -689,15 +682,14 @@ def main(platforms):
'crypto': crypto_c_files,
'crypto_headers': crypto_h_files,
'crypto_internal_headers': crypto_internal_h_files,
- 'crypto_test': sorted(CRYPTO_TEST_SOURCES +
- ['src/crypto/test/gtest_main.cc']),
+ 'crypto_test': sorted(crypto_test_files),
'fuzz': fuzz_c_files,
'ssl': ssl_source_files,
'ssl_c': [s for s in ssl_source_files if s.endswith('.c')],
'ssl_cc': [s for s in ssl_source_files if s.endswith('.cc')],
'ssl_headers': ssl_h_files,
'ssl_internal_headers': ssl_internal_h_files,
- 'ssl_test': sorted(SSL_TEST_SOURCES + ['src/crypto/test/gtest_main.cc']),
+ 'ssl_test': sorted(ssl_test_files),
'tool': tool_c_files,
'tool_headers': tool_h_files,
'test': test_c_files,
diff --git a/win-x86/crypto/bn/x86-mont.asm b/win-x86/crypto/bn/x86-mont.asm
index de7b9499..b1a4d594 100644
--- a/win-x86/crypto/bn/x86-mont.asm
+++ b/win-x86/crypto/bn/x86-mont.asm
@@ -29,36 +29,51 @@ L$_bn_mul_mont_begin:
jl NEAR L$000just_leave
lea esi,[20+esp]
lea edx,[24+esp]
- mov ebp,esp
add edi,2
neg edi
- lea esp,[edi*4+esp-32]
+ lea ebp,[edi*4+esp-32]
neg edi
- mov eax,esp
+ mov eax,ebp
sub eax,edx
and eax,2047
- sub esp,eax
- xor edx,esp
+ sub ebp,eax
+ xor edx,ebp
and edx,2048
xor edx,2048
- sub esp,edx
- and esp,-64
+ sub ebp,edx
+ and ebp,-64
+ mov eax,esp
+ sub eax,ebp
+ and eax,-4096
+ mov edx,esp
+ lea esp,[eax*1+ebp]
+ mov eax,DWORD [esp]
+ cmp esp,ebp
+ ja NEAR L$001page_walk
+ jmp NEAR L$002page_walk_done
+align 16
+L$001page_walk:
+ lea esp,[esp-4096]
+ mov eax,DWORD [esp]
+ cmp esp,ebp
+ ja NEAR L$001page_walk
+L$002page_walk_done:
mov eax,DWORD [esi]
mov ebx,DWORD [4+esi]
mov ecx,DWORD [8+esi]
- mov edx,DWORD [12+esi]
+ mov ebp,DWORD [12+esi]
mov esi,DWORD [16+esi]
mov esi,DWORD [esi]
mov DWORD [4+esp],eax
mov DWORD [8+esp],ebx
mov DWORD [12+esp],ecx
- mov DWORD [16+esp],edx
+ mov DWORD [16+esp],ebp
mov DWORD [20+esp],esi
lea ebx,[edi-3]
- mov DWORD [24+esp],ebp
+ mov DWORD [24+esp],edx
lea eax,[_OPENSSL_ia32cap_P]
bt DWORD [eax],26
- jnc NEAR L$001non_sse2
+ jnc NEAR L$003non_sse2
mov eax,-1
movd mm7,eax
mov esi,DWORD [8+esp]
@@ -82,7 +97,7 @@ L$_bn_mul_mont_begin:
psrlq mm3,32
inc ecx
align 16
-L$0021st:
+L$0041st:
pmuludq mm0,mm4
pmuludq mm1,mm5
paddq mm2,mm0
@@ -97,7 +112,7 @@ L$0021st:
psrlq mm3,32
lea ecx,[1+ecx]
cmp ecx,ebx
- jl NEAR L$0021st
+ jl NEAR L$0041st
pmuludq mm0,mm4
pmuludq mm1,mm5
paddq mm2,mm0
@@ -111,7 +126,7 @@ L$0021st:
paddq mm3,mm2
movq [32+ebx*4+esp],mm3
inc edx
-L$003outer:
+L$005outer:
xor ecx,ecx
movd mm4,DWORD [edx*4+edi]
movd mm5,DWORD [esi]
@@ -133,7 +148,7 @@ L$003outer:
paddq mm2,mm6
inc ecx
dec ebx
-L$004inner:
+L$006inner:
pmuludq mm0,mm4
pmuludq mm1,mm5
paddq mm2,mm0
@@ -150,7 +165,7 @@ L$004inner:
paddq mm2,mm6
dec ebx
lea ecx,[1+ecx]
- jnz NEAR L$004inner
+ jnz NEAR L$006inner
mov ebx,ecx
pmuludq mm0,mm4
pmuludq mm1,mm5
@@ -168,11 +183,11 @@ L$004inner:
movq [32+ebx*4+esp],mm3
lea edx,[1+edx]
cmp edx,ebx
- jle NEAR L$003outer
+ jle NEAR L$005outer
emms
- jmp NEAR L$005common_tail
+ jmp NEAR L$007common_tail
align 16
-L$001non_sse2:
+L$003non_sse2:
mov esi,DWORD [8+esp]
lea ebp,[1+ebx]
mov edi,DWORD [12+esp]
@@ -183,12 +198,12 @@ L$001non_sse2:
lea eax,[4+ebx*4+edi]
or ebp,edx
mov edi,DWORD [edi]
- jz NEAR L$006bn_sqr_mont
+ jz NEAR L$008bn_sqr_mont
mov DWORD [28+esp],eax
mov eax,DWORD [esi]
xor edx,edx
align 16
-L$007mull:
+L$009mull:
mov ebp,edx
mul edi
add ebp,eax
@@ -197,7 +212,7 @@ L$007mull:
mov eax,DWORD [ecx*4+esi]
cmp ecx,ebx
mov DWORD [28+ecx*4+esp],ebp
- jl NEAR L$007mull
+ jl NEAR L$009mull
mov ebp,edx
mul edi
mov edi,DWORD [20+esp]
@@ -215,9 +230,9 @@ L$007mull:
mov eax,DWORD [4+esi]
adc edx,0
inc ecx
- jmp NEAR L$0082ndmadd
+ jmp NEAR L$0102ndmadd
align 16
-L$0091stmadd:
+L$0111stmadd:
mov ebp,edx
mul edi
add ebp,DWORD [32+ecx*4+esp]
@@ -228,7 +243,7 @@ L$0091stmadd:
adc edx,0
cmp ecx,ebx
mov DWORD [28+ecx*4+esp],ebp
- jl NEAR L$0091stmadd
+ jl NEAR L$0111stmadd
mov ebp,edx
mul edi
add eax,DWORD [32+ebx*4+esp]
@@ -251,7 +266,7 @@ L$0091stmadd:
adc edx,0
mov ecx,1
align 16
-L$0082ndmadd:
+L$0102ndmadd:
mov ebp,edx
mul edi
add ebp,DWORD [32+ecx*4+esp]
@@ -262,7 +277,7 @@ L$0082ndmadd:
adc edx,0
cmp ecx,ebx
mov DWORD [24+ecx*4+esp],ebp
- jl NEAR L$0082ndmadd
+ jl NEAR L$0102ndmadd
mov ebp,edx
mul edi
add ebp,DWORD [32+ebx*4+esp]
@@ -278,16 +293,16 @@ L$0082ndmadd:
mov DWORD [32+ebx*4+esp],edx
cmp ecx,DWORD [28+esp]
mov DWORD [36+ebx*4+esp],eax
- je NEAR L$005common_tail
+ je NEAR L$007common_tail
mov edi,DWORD [ecx]
mov esi,DWORD [8+esp]
mov DWORD [12+esp],ecx
xor ecx,ecx
xor edx,edx
mov eax,DWORD [esi]
- jmp NEAR L$0091stmadd
+ jmp NEAR L$0111stmadd
align 16
-L$006bn_sqr_mont:
+L$008bn_sqr_mont:
mov DWORD [esp],ebx
mov DWORD [12+esp],ecx
mov eax,edi
@@ -298,7 +313,7 @@ L$006bn_sqr_mont:
and ebx,1
inc ecx
align 16
-L$010sqr:
+L$012sqr:
mov eax,DWORD [ecx*4+esi]
mov ebp,edx
mul edi
@@ -310,7 +325,7 @@ L$010sqr:
cmp ecx,DWORD [esp]
mov ebx,eax
mov DWORD [28+ecx*4+esp],ebp
- jl NEAR L$010sqr
+ jl NEAR L$012sqr
mov eax,DWORD [ecx*4+esi]
mov ebp,edx
mul edi
@@ -334,7 +349,7 @@ L$010sqr:
mov eax,DWORD [4+esi]
mov ecx,1
align 16
-L$0113rdmadd:
+L$0133rdmadd:
mov ebp,edx
mul edi
add ebp,DWORD [32+ecx*4+esp]
@@ -353,7 +368,7 @@ L$0113rdmadd:
adc edx,0
cmp ecx,ebx
mov DWORD [24+ecx*4+esp],ebp
- jl NEAR L$0113rdmadd
+ jl NEAR L$0133rdmadd
mov ebp,edx
mul edi
add ebp,DWORD [32+ebx*4+esp]
@@ -369,7 +384,7 @@ L$0113rdmadd:
mov DWORD [32+ebx*4+esp],edx
cmp ecx,ebx
mov DWORD [36+ebx*4+esp],eax
- je NEAR L$005common_tail
+ je NEAR L$007common_tail
mov edi,DWORD [4+ecx*4+esi]
lea ecx,[1+ecx]
mov eax,edi
@@ -381,12 +396,12 @@ L$0113rdmadd:
xor ebp,ebp
cmp ecx,ebx
lea ecx,[1+ecx]
- je NEAR L$012sqrlast
+ je NEAR L$014sqrlast
mov ebx,edx
shr edx,1
and ebx,1
align 16
-L$013sqradd:
+L$015sqradd:
mov eax,DWORD [ecx*4+esi]
mov ebp,edx
mul edi
@@ -402,13 +417,13 @@ L$013sqradd:
cmp ecx,DWORD [esp]
mov DWORD [28+ecx*4+esp],ebp
mov ebx,eax
- jle NEAR L$013sqradd
+ jle NEAR L$015sqradd
mov ebp,edx
add edx,edx
shr ebp,31
add edx,ebx
adc ebp,0
-L$012sqrlast:
+L$014sqrlast:
mov edi,DWORD [20+esp]
mov esi,DWORD [16+esp]
imul edi,DWORD [32+esp]
@@ -423,9 +438,9 @@ L$012sqrlast:
adc edx,0
mov ecx,1
mov eax,DWORD [4+esi]
- jmp NEAR L$0113rdmadd
+ jmp NEAR L$0133rdmadd
align 16
-L$005common_tail:
+L$007common_tail:
mov ebp,DWORD [16+esp]
mov edi,DWORD [4+esp]
lea esi,[32+esp]
@@ -433,25 +448,26 @@ L$005common_tail:
mov ecx,ebx
xor edx,edx
align 16
-L$014sub:
+L$016sub:
sbb eax,DWORD [edx*4+ebp]
mov DWORD [edx*4+edi],eax
dec ecx
mov eax,DWORD [4+edx*4+esi]
lea edx,[1+edx]
- jge NEAR L$014sub
+ jge NEAR L$016sub
sbb eax,0
+ and esi,eax
+ not eax
+ mov ebp,edi
+ and ebp,eax
+ or esi,ebp
align 16
-L$015copy:
- mov edx,DWORD [ebx*4+esi]
- mov ebp,DWORD [ebx*4+edi]
- xor edx,ebp
- and edx,eax
- xor edx,ebp
- mov DWORD [ebx*4+esi],ecx
- mov DWORD [ebx*4+edi],edx
+L$017copy:
+ mov eax,DWORD [ebx*4+esi]
+ mov DWORD [ebx*4+edi],eax
+ mov DWORD [32+ebx*4+esp],ecx
dec ebx
- jge NEAR L$015copy
+ jge NEAR L$017copy
mov esp,DWORD [24+esp]
mov eax,1
L$000just_leave:
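
[The x86-mont rewrite above adds a "page walk" after the large stack reservation: rather than dropping esp past several pages in one step, the code walks down 4096 bytes at a time and loads a word from each page so that guard-page faults arrive in order. A hypothetical C rendering of that probe loop — the real code does this in assembly on the live stack pointer:]

#include <stdint.h>

#define PAGE 4096

/* Touch one byte per PAGE-sized step between old_top (high address) and
 * new_top (low address), top-down, mirroring L$001page_walk above. */
static void page_walk(volatile uint8_t *new_top, volatile uint8_t *old_top) {
  volatile uint8_t *p = old_top;
  while (p - new_top > PAGE) {
    p -= PAGE;
    (void)*p;  /* probe: commits this stack page before moving past it */
  }
  (void)*new_top;
}

int main(void) {
  static uint8_t region[8 * PAGE];
  page_walk(region, region + sizeof(region) - 1);
  return 0;
}

[The same function's tail also changes: instead of masking each word inside the copy loop, the new sbb/and/not/or sequence before L$017copy appears to pick the copy source pointer once from a borrow-derived mask, leaving the final loop a plain branch-free copy.]
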
diff --git a/win-x86_64/crypto/aes/aes-x86_64.asm b/win-x86_64/crypto/aes/aes-x86_64.asm
index 53394f0e..3db1846e 100644
--- a/win-x86_64/crypto/aes/aes-x86_64.asm
+++ b/win-x86_64/crypto/aes/aes-x86_64.asm
@@ -344,6 +344,7 @@ $L$SEH_begin_asm_AES_encrypt:
mov rdx,r8
+ mov rax,rsp
push rbx
push rbp
push r12
@@ -352,7 +353,6 @@ $L$SEH_begin_asm_AES_encrypt:
push r15
- mov r10,rsp
lea rcx,[((-63))+rdx]
and rsp,-64
sub rcx,rsp
@@ -362,7 +362,7 @@ $L$SEH_begin_asm_AES_encrypt:
sub rsp,32
mov QWORD[16+rsp],rsi
- mov QWORD[24+rsp],r10
+ mov QWORD[24+rsp],rax
$L$enc_prologue:
mov r15,rdx
@@ -394,13 +394,13 @@ $L$enc_prologue:
mov DWORD[8+r9],ecx
mov DWORD[12+r9],edx
- mov r15,QWORD[rsi]
- mov r14,QWORD[8+rsi]
- mov r13,QWORD[16+rsi]
- mov r12,QWORD[24+rsi]
- mov rbp,QWORD[32+rsi]
- mov rbx,QWORD[40+rsi]
- lea rsp,[48+rsi]
+ mov r15,QWORD[((-48))+rsi]
+ mov r14,QWORD[((-40))+rsi]
+ mov r13,QWORD[((-32))+rsi]
+ mov r12,QWORD[((-24))+rsi]
+ mov rbp,QWORD[((-16))+rsi]
+ mov rbx,QWORD[((-8))+rsi]
+ lea rsp,[rsi]
$L$enc_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
@@ -800,6 +800,7 @@ $L$SEH_begin_asm_AES_decrypt:
mov rdx,r8
+ mov rax,rsp
push rbx
push rbp
push r12
@@ -808,7 +809,6 @@ $L$SEH_begin_asm_AES_decrypt:
push r15
- mov r10,rsp
lea rcx,[((-63))+rdx]
and rsp,-64
sub rcx,rsp
@@ -818,7 +818,7 @@ $L$SEH_begin_asm_AES_decrypt:
sub rsp,32
mov QWORD[16+rsp],rsi
- mov QWORD[24+rsp],r10
+ mov QWORD[24+rsp],rax
$L$dec_prologue:
mov r15,rdx
@@ -852,13 +852,13 @@ $L$dec_prologue:
mov DWORD[8+r9],ecx
mov DWORD[12+r9],edx
- mov r15,QWORD[rsi]
- mov r14,QWORD[8+rsi]
- mov r13,QWORD[16+rsi]
- mov r12,QWORD[24+rsi]
- mov rbp,QWORD[32+rsi]
- mov rbx,QWORD[40+rsi]
- lea rsp,[48+rsi]
+ mov r15,QWORD[((-48))+rsi]
+ mov r14,QWORD[((-40))+rsi]
+ mov r13,QWORD[((-32))+rsi]
+ mov r12,QWORD[((-24))+rsi]
+ mov rbp,QWORD[((-16))+rsi]
+ mov rbx,QWORD[((-8))+rsi]
+ lea rsp,[rsi]
$L$dec_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
@@ -1367,10 +1367,9 @@ $L$cbc_prologue:
mov r9d,r9d
lea r14,[$L$AES_Te]
+ lea r10,[$L$AES_Td]
cmp r9,0
- jne NEAR $L$cbc_picked_te
- lea r14,[$L$AES_Td]
-$L$cbc_picked_te:
+ cmove r14,r10
mov r10d,DWORD[OPENSSL_ia32cap_P]
cmp rdx,512
@@ -2626,7 +2625,6 @@ block_se_handler:
jae NEAR $L$in_block_prologue
mov rax,QWORD[24+rax]
- lea rax,[48+rax]
mov rbx,QWORD[((-8))+rax]
mov rbp,QWORD[((-16))+rax]
diff --git a/win-x86_64/crypto/aes/aesni-x86_64.asm b/win-x86_64/crypto/aes/aesni-x86_64.asm
index cf313d1a..d5d454d9 100644
--- a/win-x86_64/crypto/aes/aesni-x86_64.asm
+++ b/win-x86_64/crypto/aes/aesni-x86_64.asm
@@ -1129,22 +1129,21 @@ DB 102,15,56,221,209
ALIGN 16
$L$ctr32_bulk:
- lea rax,[rsp]
+ lea r11,[rsp]
push rbp
sub rsp,288
and rsp,-16
- movaps XMMWORD[(-168)+rax],xmm6
- movaps XMMWORD[(-152)+rax],xmm7
- movaps XMMWORD[(-136)+rax],xmm8
- movaps XMMWORD[(-120)+rax],xmm9
- movaps XMMWORD[(-104)+rax],xmm10
- movaps XMMWORD[(-88)+rax],xmm11
- movaps XMMWORD[(-72)+rax],xmm12
- movaps XMMWORD[(-56)+rax],xmm13
- movaps XMMWORD[(-40)+rax],xmm14
- movaps XMMWORD[(-24)+rax],xmm15
+ movaps XMMWORD[(-168)+r11],xmm6
+ movaps XMMWORD[(-152)+r11],xmm7
+ movaps XMMWORD[(-136)+r11],xmm8
+ movaps XMMWORD[(-120)+r11],xmm9
+ movaps XMMWORD[(-104)+r11],xmm10
+ movaps XMMWORD[(-88)+r11],xmm11
+ movaps XMMWORD[(-72)+r11],xmm12
+ movaps XMMWORD[(-56)+r11],xmm13
+ movaps XMMWORD[(-40)+r11],xmm14
+ movaps XMMWORD[(-24)+r11],xmm15
$L$ctr32_body:
- lea rbp,[((-8))+rax]
@@ -1153,7 +1152,7 @@ $L$ctr32_body:
movdqu xmm0,XMMWORD[rcx]
mov r8d,DWORD[12+r8]
pxor xmm2,xmm0
- mov r11d,DWORD[12+rcx]
+ mov ebp,DWORD[12+rcx]
movdqa XMMWORD[rsp],xmm2
bswap r8d
movdqa xmm3,xmm2
@@ -1169,8 +1168,8 @@ $L$ctr32_body:
lea rdx,[2+r8]
bswap eax
bswap edx
- xor eax,r11d
- xor edx,r11d
+ xor eax,ebp
+ xor edx,ebp
DB 102,15,58,34,216,3
lea rax,[3+r8]
movdqa XMMWORD[16+rsp],xmm3
@@ -1179,25 +1178,25 @@ DB 102,15,58,34,226,3
mov rdx,r10
lea r10,[4+r8]
movdqa XMMWORD[32+rsp],xmm4
- xor eax,r11d
+ xor eax,ebp
bswap r10d
DB 102,15,58,34,232,3
- xor r10d,r11d
+ xor r10d,ebp
movdqa XMMWORD[48+rsp],xmm5
lea r9,[5+r8]
mov DWORD[((64+12))+rsp],r10d
bswap r9d
lea r10,[6+r8]
mov eax,DWORD[240+rcx]
- xor r9d,r11d
+ xor r9d,ebp
bswap r10d
mov DWORD[((80+12))+rsp],r9d
- xor r10d,r11d
+ xor r10d,ebp
lea r9,[7+r8]
mov DWORD[((96+12))+rsp],r10d
bswap r9d
mov r10d,DWORD[((OPENSSL_ia32cap_P+4))]
- xor r9d,r11d
+ xor r9d,ebp
and r10d,71303168
mov DWORD[((112+12))+rsp],r9d
@@ -1221,7 +1220,7 @@ ALIGN 16
$L$ctr32_6x:
shl eax,4
mov r10d,48
- bswap r11d
+ bswap ebp
lea rcx,[32+rax*1+rcx]
sub r10,rax
jmp NEAR $L$ctr32_loop6
@@ -1232,32 +1231,32 @@ $L$ctr32_loop6:
movups xmm0,XMMWORD[((-48))+r10*1+rcx]
DB 102,15,56,220,209
mov eax,r8d
- xor eax,r11d
+ xor eax,ebp
DB 102,15,56,220,217
DB 0x0f,0x38,0xf1,0x44,0x24,12
lea eax,[1+r8]
DB 102,15,56,220,225
- xor eax,r11d
+ xor eax,ebp
DB 0x0f,0x38,0xf1,0x44,0x24,28
DB 102,15,56,220,233
lea eax,[2+r8]
- xor eax,r11d
+ xor eax,ebp
DB 102,15,56,220,241
DB 0x0f,0x38,0xf1,0x44,0x24,44
lea eax,[3+r8]
DB 102,15,56,220,249
movups xmm1,XMMWORD[((-32))+r10*1+rcx]
- xor eax,r11d
+ xor eax,ebp
DB 102,15,56,220,208
DB 0x0f,0x38,0xf1,0x44,0x24,60
lea eax,[4+r8]
DB 102,15,56,220,216
- xor eax,r11d
+ xor eax,ebp
DB 0x0f,0x38,0xf1,0x44,0x24,76
DB 102,15,56,220,224
lea eax,[5+r8]
- xor eax,r11d
+ xor eax,ebp
DB 102,15,56,220,232
DB 0x0f,0x38,0xf1,0x44,0x24,92
mov rax,r10
@@ -1318,7 +1317,7 @@ DB 102,15,56,220,217
bswap r9d
movups xmm0,XMMWORD[((32-128))+rcx]
DB 102,15,56,220,225
- xor r9d,r11d
+ xor r9d,ebp
nop
DB 102,15,56,220,233
mov DWORD[((0+12))+rsp],r9d
@@ -1331,7 +1330,7 @@ DB 102,68,15,56,220,201
bswap r9d
DB 102,15,56,220,208
DB 102,15,56,220,216
- xor r9d,r11d
+ xor r9d,ebp
DB 0x66,0x90
DB 102,15,56,220,224
DB 102,15,56,220,232
@@ -1345,7 +1344,7 @@ DB 102,68,15,56,220,200
bswap r9d
DB 102,15,56,220,209
DB 102,15,56,220,217
- xor r9d,r11d
+ xor r9d,ebp
DB 0x66,0x90
DB 102,15,56,220,225
DB 102,15,56,220,233
@@ -1359,7 +1358,7 @@ DB 102,68,15,56,220,201
bswap r9d
DB 102,15,56,220,208
DB 102,15,56,220,216
- xor r9d,r11d
+ xor r9d,ebp
DB 0x66,0x90
DB 102,15,56,220,224
DB 102,15,56,220,232
@@ -1373,7 +1372,7 @@ DB 102,68,15,56,220,200
bswap r9d
DB 102,15,56,220,209
DB 102,15,56,220,217
- xor r9d,r11d
+ xor r9d,ebp
DB 0x66,0x90
DB 102,15,56,220,225
DB 102,15,56,220,233
@@ -1387,7 +1386,7 @@ DB 102,68,15,56,220,201
bswap r9d
DB 102,15,56,220,208
DB 102,15,56,220,216
- xor r9d,r11d
+ xor r9d,ebp
DB 0x66,0x90
DB 102,15,56,220,224
DB 102,15,56,220,232
@@ -1401,7 +1400,7 @@ DB 102,68,15,56,220,200
bswap r9d
DB 102,15,56,220,209
DB 102,15,56,220,217
- xor r9d,r11d
+ xor r9d,ebp
DB 0x66,0x90
DB 102,15,56,220,225
DB 102,15,56,220,233
@@ -1416,7 +1415,7 @@ DB 102,68,15,56,220,201
DB 102,15,56,220,208
DB 102,15,56,220,216
DB 102,15,56,220,224
- xor r9d,r11d
+ xor r9d,ebp
movdqu xmm10,XMMWORD[rdi]
DB 102,15,56,220,232
mov DWORD[((112+12))+rsp],r9d
@@ -1651,32 +1650,32 @@ DB 102,15,56,221,225
$L$ctr32_done:
xorps xmm0,xmm0
- xor r11d,r11d
+ xor ebp,ebp
pxor xmm1,xmm1
pxor xmm2,xmm2
pxor xmm3,xmm3
pxor xmm4,xmm4
pxor xmm5,xmm5
- movaps xmm6,XMMWORD[((-160))+rbp]
- movaps XMMWORD[(-160)+rbp],xmm0
- movaps xmm7,XMMWORD[((-144))+rbp]
- movaps XMMWORD[(-144)+rbp],xmm0
- movaps xmm8,XMMWORD[((-128))+rbp]
- movaps XMMWORD[(-128)+rbp],xmm0
- movaps xmm9,XMMWORD[((-112))+rbp]
- movaps XMMWORD[(-112)+rbp],xmm0
- movaps xmm10,XMMWORD[((-96))+rbp]
- movaps XMMWORD[(-96)+rbp],xmm0
- movaps xmm11,XMMWORD[((-80))+rbp]
- movaps XMMWORD[(-80)+rbp],xmm0
- movaps xmm12,XMMWORD[((-64))+rbp]
- movaps XMMWORD[(-64)+rbp],xmm0
- movaps xmm13,XMMWORD[((-48))+rbp]
- movaps XMMWORD[(-48)+rbp],xmm0
- movaps xmm14,XMMWORD[((-32))+rbp]
- movaps XMMWORD[(-32)+rbp],xmm0
- movaps xmm15,XMMWORD[((-16))+rbp]
- movaps XMMWORD[(-16)+rbp],xmm0
+ movaps xmm6,XMMWORD[((-168))+r11]
+ movaps XMMWORD[(-168)+r11],xmm0
+ movaps xmm7,XMMWORD[((-152))+r11]
+ movaps XMMWORD[(-152)+r11],xmm0
+ movaps xmm8,XMMWORD[((-136))+r11]
+ movaps XMMWORD[(-136)+r11],xmm0
+ movaps xmm9,XMMWORD[((-120))+r11]
+ movaps XMMWORD[(-120)+r11],xmm0
+ movaps xmm10,XMMWORD[((-104))+r11]
+ movaps XMMWORD[(-104)+r11],xmm0
+ movaps xmm11,XMMWORD[((-88))+r11]
+ movaps XMMWORD[(-88)+r11],xmm0
+ movaps xmm12,XMMWORD[((-72))+r11]
+ movaps XMMWORD[(-72)+r11],xmm0
+ movaps xmm13,XMMWORD[((-56))+r11]
+ movaps XMMWORD[(-56)+r11],xmm0
+ movaps xmm14,XMMWORD[((-40))+r11]
+ movaps XMMWORD[(-40)+r11],xmm0
+ movaps xmm15,XMMWORD[((-24))+r11]
+ movaps XMMWORD[(-24)+r11],xmm0
movaps XMMWORD[rsp],xmm0
movaps XMMWORD[16+rsp],xmm0
movaps XMMWORD[32+rsp],xmm0
@@ -1685,8 +1684,8 @@ $L$ctr32_done:
movaps XMMWORD[80+rsp],xmm0
movaps XMMWORD[96+rsp],xmm0
movaps XMMWORD[112+rsp],xmm0
- lea rsp,[rbp]
- pop rbp
+ mov rbp,QWORD[((-8))+r11]
+ lea rsp,[r11]
$L$ctr32_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
@@ -1708,22 +1707,21 @@ $L$SEH_begin_aesni_xts_encrypt:
mov r9,QWORD[48+rsp]
- lea rax,[rsp]
+ lea r11,[rsp]
push rbp
sub rsp,272
and rsp,-16
- movaps XMMWORD[(-168)+rax],xmm6
- movaps XMMWORD[(-152)+rax],xmm7
- movaps XMMWORD[(-136)+rax],xmm8
- movaps XMMWORD[(-120)+rax],xmm9
- movaps XMMWORD[(-104)+rax],xmm10
- movaps XMMWORD[(-88)+rax],xmm11
- movaps XMMWORD[(-72)+rax],xmm12
- movaps XMMWORD[(-56)+rax],xmm13
- movaps XMMWORD[(-40)+rax],xmm14
- movaps XMMWORD[(-24)+rax],xmm15
+ movaps XMMWORD[(-168)+r11],xmm6
+ movaps XMMWORD[(-152)+r11],xmm7
+ movaps XMMWORD[(-136)+r11],xmm8
+ movaps XMMWORD[(-120)+r11],xmm9
+ movaps XMMWORD[(-104)+r11],xmm10
+ movaps XMMWORD[(-88)+r11],xmm11
+ movaps XMMWORD[(-72)+r11],xmm12
+ movaps XMMWORD[(-56)+r11],xmm13
+ movaps XMMWORD[(-40)+r11],xmm14
+ movaps XMMWORD[(-24)+r11],xmm15
$L$xts_enc_body:
- lea rbp,[((-8))+rax]
movups xmm2,XMMWORD[r9]
mov eax,DWORD[240+r8]
mov r10d,DWORD[240+rcx]
@@ -1739,7 +1737,7 @@ DB 102,15,56,220,209
jnz NEAR $L$oop_enc1_8
DB 102,15,56,221,209
movups xmm0,XMMWORD[rcx]
- mov r11,rcx
+ mov rbp,rcx
mov eax,r10d
shl r10d,4
mov r9,rdx
@@ -1795,9 +1793,9 @@ DB 102,15,56,221,209
jc NEAR $L$xts_enc_short
mov eax,16+96
- lea rcx,[32+r10*1+r11]
+ lea rcx,[32+r10*1+rbp]
sub rax,r10
- movups xmm1,XMMWORD[16+r11]
+ movups xmm1,XMMWORD[16+rbp]
mov r10,rax
lea r8,[$L$xts_magic]
jmp NEAR $L$xts_enc_grandloop
@@ -1822,7 +1820,7 @@ DB 102,15,56,220,225
movdqa xmm9,XMMWORD[96+rsp]
pxor xmm6,xmm14
DB 102,15,56,220,233
- movups xmm0,XMMWORD[32+r11]
+ movups xmm0,XMMWORD[32+rbp]
lea rdi,[96+rdi]
pxor xmm7,xmm8
@@ -1831,7 +1829,7 @@ DB 102,15,56,220,241
pxor xmm11,xmm9
movdqa XMMWORD[rsp],xmm10
DB 102,15,56,220,249
- movups xmm1,XMMWORD[48+r11]
+ movups xmm1,XMMWORD[48+rbp]
pxor xmm12,xmm9
DB 102,15,56,220,208
@@ -1846,7 +1844,7 @@ DB 102,15,56,220,232
movdqa XMMWORD[64+rsp],xmm14
DB 102,15,56,220,240
DB 102,15,56,220,248
- movups xmm0,XMMWORD[64+r11]
+ movups xmm0,XMMWORD[64+rbp]
movdqa XMMWORD[80+rsp],xmm8
pshufd xmm9,xmm15,0x5f
jmp NEAR $L$xts_enc_loop6
@@ -1878,7 +1876,7 @@ DB 102,15,56,220,209
psrad xmm14,31
DB 102,15,56,220,217
pand xmm14,xmm8
- movups xmm10,XMMWORD[r11]
+ movups xmm10,XMMWORD[rbp]
DB 102,15,56,220,225
DB 102,15,56,220,233
DB 102,15,56,220,241
@@ -1946,10 +1944,10 @@ DB 102,15,56,220,217
DB 102,15,56,220,225
DB 102,15,56,220,233
pxor xmm15,xmm0
- movups xmm0,XMMWORD[r11]
+ movups xmm0,XMMWORD[rbp]
DB 102,15,56,220,241
DB 102,15,56,220,249
- movups xmm1,XMMWORD[16+r11]
+ movups xmm1,XMMWORD[16+rbp]
pxor xmm14,xmm15
DB 102,15,56,221,84,36,0
@@ -1976,7 +1974,7 @@ DB 102,15,56,221,124,36,80
mov eax,16+96
sub eax,r10d
- mov rcx,r11
+ mov rcx,rbp
shr eax,4
$L$xts_enc_short:
@@ -2132,7 +2130,7 @@ $L$xts_enc_steal:
jnz NEAR $L$xts_enc_steal
sub rsi,r9
- mov rcx,r11
+ mov rcx,rbp
mov eax,r10d
movups xmm2,XMMWORD[((-16))+rsi]
@@ -2158,26 +2156,26 @@ $L$xts_enc_ret:
pxor xmm3,xmm3
pxor xmm4,xmm4
pxor xmm5,xmm5
- movaps xmm6,XMMWORD[((-160))+rbp]
- movaps XMMWORD[(-160)+rbp],xmm0
- movaps xmm7,XMMWORD[((-144))+rbp]
- movaps XMMWORD[(-144)+rbp],xmm0
- movaps xmm8,XMMWORD[((-128))+rbp]
- movaps XMMWORD[(-128)+rbp],xmm0
- movaps xmm9,XMMWORD[((-112))+rbp]
- movaps XMMWORD[(-112)+rbp],xmm0
- movaps xmm10,XMMWORD[((-96))+rbp]
- movaps XMMWORD[(-96)+rbp],xmm0
- movaps xmm11,XMMWORD[((-80))+rbp]
- movaps XMMWORD[(-80)+rbp],xmm0
- movaps xmm12,XMMWORD[((-64))+rbp]
- movaps XMMWORD[(-64)+rbp],xmm0
- movaps xmm13,XMMWORD[((-48))+rbp]
- movaps XMMWORD[(-48)+rbp],xmm0
- movaps xmm14,XMMWORD[((-32))+rbp]
- movaps XMMWORD[(-32)+rbp],xmm0
- movaps xmm15,XMMWORD[((-16))+rbp]
- movaps XMMWORD[(-16)+rbp],xmm0
+ movaps xmm6,XMMWORD[((-168))+r11]
+ movaps XMMWORD[(-168)+r11],xmm0
+ movaps xmm7,XMMWORD[((-152))+r11]
+ movaps XMMWORD[(-152)+r11],xmm0
+ movaps xmm8,XMMWORD[((-136))+r11]
+ movaps XMMWORD[(-136)+r11],xmm0
+ movaps xmm9,XMMWORD[((-120))+r11]
+ movaps XMMWORD[(-120)+r11],xmm0
+ movaps xmm10,XMMWORD[((-104))+r11]
+ movaps XMMWORD[(-104)+r11],xmm0
+ movaps xmm11,XMMWORD[((-88))+r11]
+ movaps XMMWORD[(-88)+r11],xmm0
+ movaps xmm12,XMMWORD[((-72))+r11]
+ movaps XMMWORD[(-72)+r11],xmm0
+ movaps xmm13,XMMWORD[((-56))+r11]
+ movaps XMMWORD[(-56)+r11],xmm0
+ movaps xmm14,XMMWORD[((-40))+r11]
+ movaps XMMWORD[(-40)+r11],xmm0
+ movaps xmm15,XMMWORD[((-24))+r11]
+ movaps XMMWORD[(-24)+r11],xmm0
movaps XMMWORD[rsp],xmm0
movaps XMMWORD[16+rsp],xmm0
movaps XMMWORD[32+rsp],xmm0
@@ -2185,8 +2183,8 @@ $L$xts_enc_ret:
movaps XMMWORD[64+rsp],xmm0
movaps XMMWORD[80+rsp],xmm0
movaps XMMWORD[96+rsp],xmm0
- lea rsp,[rbp]
- pop rbp
+ mov rbp,QWORD[((-8))+r11]
+ lea rsp,[r11]
$L$xts_enc_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
@@ -2208,22 +2206,21 @@ $L$SEH_begin_aesni_xts_decrypt:
mov r9,QWORD[48+rsp]
- lea rax,[rsp]
+ lea r11,[rsp]
push rbp
sub rsp,272
and rsp,-16
- movaps XMMWORD[(-168)+rax],xmm6
- movaps XMMWORD[(-152)+rax],xmm7
- movaps XMMWORD[(-136)+rax],xmm8
- movaps XMMWORD[(-120)+rax],xmm9
- movaps XMMWORD[(-104)+rax],xmm10
- movaps XMMWORD[(-88)+rax],xmm11
- movaps XMMWORD[(-72)+rax],xmm12
- movaps XMMWORD[(-56)+rax],xmm13
- movaps XMMWORD[(-40)+rax],xmm14
- movaps XMMWORD[(-24)+rax],xmm15
+ movaps XMMWORD[(-168)+r11],xmm6
+ movaps XMMWORD[(-152)+r11],xmm7
+ movaps XMMWORD[(-136)+r11],xmm8
+ movaps XMMWORD[(-120)+r11],xmm9
+ movaps XMMWORD[(-104)+r11],xmm10
+ movaps XMMWORD[(-88)+r11],xmm11
+ movaps XMMWORD[(-72)+r11],xmm12
+ movaps XMMWORD[(-56)+r11],xmm13
+ movaps XMMWORD[(-40)+r11],xmm14
+ movaps XMMWORD[(-24)+r11],xmm15
$L$xts_dec_body:
- lea rbp,[((-8))+rax]
movups xmm2,XMMWORD[r9]
mov eax,DWORD[240+r8]
mov r10d,DWORD[240+rcx]
@@ -2245,7 +2242,7 @@ DB 102,15,56,221,209
sub rdx,rax
movups xmm0,XMMWORD[rcx]
- mov r11,rcx
+ mov rbp,rcx
mov eax,r10d
shl r10d,4
mov r9,rdx
@@ -2301,9 +2298,9 @@ DB 102,15,56,221,209
jc NEAR $L$xts_dec_short
mov eax,16+96
- lea rcx,[32+r10*1+r11]
+ lea rcx,[32+r10*1+rbp]
sub rax,r10
- movups xmm1,XMMWORD[16+r11]
+ movups xmm1,XMMWORD[16+rbp]
mov r10,rax
lea r8,[$L$xts_magic]
jmp NEAR $L$xts_dec_grandloop
@@ -2328,7 +2325,7 @@ DB 102,15,56,222,225
movdqa xmm9,XMMWORD[96+rsp]
pxor xmm6,xmm14
DB 102,15,56,222,233
- movups xmm0,XMMWORD[32+r11]
+ movups xmm0,XMMWORD[32+rbp]
lea rdi,[96+rdi]
pxor xmm7,xmm8
@@ -2337,7 +2334,7 @@ DB 102,15,56,222,241
pxor xmm11,xmm9
movdqa XMMWORD[rsp],xmm10
DB 102,15,56,222,249
- movups xmm1,XMMWORD[48+r11]
+ movups xmm1,XMMWORD[48+rbp]
pxor xmm12,xmm9
DB 102,15,56,222,208
@@ -2352,7 +2349,7 @@ DB 102,15,56,222,232
movdqa XMMWORD[64+rsp],xmm14
DB 102,15,56,222,240
DB 102,15,56,222,248
- movups xmm0,XMMWORD[64+r11]
+ movups xmm0,XMMWORD[64+rbp]
movdqa XMMWORD[80+rsp],xmm8
pshufd xmm9,xmm15,0x5f
jmp NEAR $L$xts_dec_loop6
@@ -2384,7 +2381,7 @@ DB 102,15,56,222,209
psrad xmm14,31
DB 102,15,56,222,217
pand xmm14,xmm8
- movups xmm10,XMMWORD[r11]
+ movups xmm10,XMMWORD[rbp]
DB 102,15,56,222,225
DB 102,15,56,222,233
DB 102,15,56,222,241
@@ -2452,10 +2449,10 @@ DB 102,15,56,222,217
DB 102,15,56,222,225
DB 102,15,56,222,233
pxor xmm15,xmm0
- movups xmm0,XMMWORD[r11]
+ movups xmm0,XMMWORD[rbp]
DB 102,15,56,222,241
DB 102,15,56,222,249
- movups xmm1,XMMWORD[16+r11]
+ movups xmm1,XMMWORD[16+rbp]
pxor xmm14,xmm15
DB 102,15,56,223,84,36,0
@@ -2482,7 +2479,7 @@ DB 102,15,56,223,124,36,80
mov eax,16+96
sub eax,r10d
- mov rcx,r11
+ mov rcx,rbp
shr eax,4
$L$xts_dec_short:
@@ -2639,7 +2636,7 @@ $L$xts_dec_done:
jz NEAR $L$xts_dec_ret
$L$xts_dec_done2:
mov rdx,r9
- mov rcx,r11
+ mov rcx,rbp
mov eax,r10d
movups xmm2,XMMWORD[rdi]
@@ -2669,7 +2666,7 @@ $L$xts_dec_steal:
jnz NEAR $L$xts_dec_steal
sub rsi,r9
- mov rcx,r11
+ mov rcx,rbp
mov eax,r10d
movups xmm2,XMMWORD[rsi]
@@ -2695,26 +2692,26 @@ $L$xts_dec_ret:
pxor xmm3,xmm3
pxor xmm4,xmm4
pxor xmm5,xmm5
- movaps xmm6,XMMWORD[((-160))+rbp]
- movaps XMMWORD[(-160)+rbp],xmm0
- movaps xmm7,XMMWORD[((-144))+rbp]
- movaps XMMWORD[(-144)+rbp],xmm0
- movaps xmm8,XMMWORD[((-128))+rbp]
- movaps XMMWORD[(-128)+rbp],xmm0
- movaps xmm9,XMMWORD[((-112))+rbp]
- movaps XMMWORD[(-112)+rbp],xmm0
- movaps xmm10,XMMWORD[((-96))+rbp]
- movaps XMMWORD[(-96)+rbp],xmm0
- movaps xmm11,XMMWORD[((-80))+rbp]
- movaps XMMWORD[(-80)+rbp],xmm0
- movaps xmm12,XMMWORD[((-64))+rbp]
- movaps XMMWORD[(-64)+rbp],xmm0
- movaps xmm13,XMMWORD[((-48))+rbp]
- movaps XMMWORD[(-48)+rbp],xmm0
- movaps xmm14,XMMWORD[((-32))+rbp]
- movaps XMMWORD[(-32)+rbp],xmm0
- movaps xmm15,XMMWORD[((-16))+rbp]
- movaps XMMWORD[(-16)+rbp],xmm0
+ movaps xmm6,XMMWORD[((-168))+r11]
+ movaps XMMWORD[(-168)+r11],xmm0
+ movaps xmm7,XMMWORD[((-152))+r11]
+ movaps XMMWORD[(-152)+r11],xmm0
+ movaps xmm8,XMMWORD[((-136))+r11]
+ movaps XMMWORD[(-136)+r11],xmm0
+ movaps xmm9,XMMWORD[((-120))+r11]
+ movaps XMMWORD[(-120)+r11],xmm0
+ movaps xmm10,XMMWORD[((-104))+r11]
+ movaps XMMWORD[(-104)+r11],xmm0
+ movaps xmm11,XMMWORD[((-88))+r11]
+ movaps XMMWORD[(-88)+r11],xmm0
+ movaps xmm12,XMMWORD[((-72))+r11]
+ movaps XMMWORD[(-72)+r11],xmm0
+ movaps xmm13,XMMWORD[((-56))+r11]
+ movaps XMMWORD[(-56)+r11],xmm0
+ movaps xmm14,XMMWORD[((-40))+r11]
+ movaps XMMWORD[(-40)+r11],xmm0
+ movaps xmm15,XMMWORD[((-24))+r11]
+ movaps XMMWORD[(-24)+r11],xmm0
movaps XMMWORD[rsp],xmm0
movaps XMMWORD[16+rsp],xmm0
movaps XMMWORD[32+rsp],xmm0
@@ -2722,13 +2719,901 @@ $L$xts_dec_ret:
movaps XMMWORD[64+rsp],xmm0
movaps XMMWORD[80+rsp],xmm0
movaps XMMWORD[96+rsp],xmm0
- lea rsp,[rbp]
- pop rbp
+ mov rbp,QWORD[((-8))+r11]
+ lea rsp,[r11]
$L$xts_dec_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_aesni_xts_decrypt:
+global aesni_ocb_encrypt
+
+ALIGN 32
+aesni_ocb_encrypt:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aesni_ocb_encrypt:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+ lea rax,[rsp]
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ lea rsp,[((-160))+rsp]
+ movaps XMMWORD[rsp],xmm6
+ movaps XMMWORD[16+rsp],xmm7
+ movaps XMMWORD[32+rsp],xmm8
+ movaps XMMWORD[48+rsp],xmm9
+ movaps XMMWORD[64+rsp],xmm10
+ movaps XMMWORD[80+rsp],xmm11
+ movaps XMMWORD[96+rsp],xmm12
+ movaps XMMWORD[112+rsp],xmm13
+ movaps XMMWORD[128+rsp],xmm14
+ movaps XMMWORD[144+rsp],xmm15
+$L$ocb_enc_body:
+ mov rbx,QWORD[56+rax]
+ mov rbp,QWORD[((56+8))+rax]
+
+ mov r10d,DWORD[240+rcx]
+ mov r11,rcx
+ shl r10d,4
+ movups xmm9,XMMWORD[rcx]
+ movups xmm1,XMMWORD[16+r10*1+rcx]
+
+ movdqu xmm15,XMMWORD[r9]
+ pxor xmm9,xmm1
+ pxor xmm15,xmm1
+
+ mov eax,16+32
+ lea rcx,[32+r10*1+r11]
+ movups xmm1,XMMWORD[16+r11]
+ sub rax,r10
+ mov r10,rax
+
+ movdqu xmm10,XMMWORD[rbx]
+ movdqu xmm8,XMMWORD[rbp]
+
+ test r8,1
+ jnz NEAR $L$ocb_enc_odd
+
+ bsf r12,r8
+ add r8,1
+ shl r12,4
+ movdqu xmm7,XMMWORD[r12*1+rbx]
+ movdqu xmm2,XMMWORD[rdi]
+ lea rdi,[16+rdi]
+
+ call __ocb_encrypt1
+
+ movdqa xmm15,xmm7
+ movups XMMWORD[rsi],xmm2
+ lea rsi,[16+rsi]
+ sub rdx,1
+ jz NEAR $L$ocb_enc_done
+
+$L$ocb_enc_odd:
+ lea r12,[1+r8]
+ lea r13,[3+r8]
+ lea r14,[5+r8]
+ lea r8,[6+r8]
+ bsf r12,r12
+ bsf r13,r13
+ bsf r14,r14
+ shl r12,4
+ shl r13,4
+ shl r14,4
+
+ sub rdx,6
+ jc NEAR $L$ocb_enc_short
+ jmp NEAR $L$ocb_enc_grandloop
+
+ALIGN 32
+$L$ocb_enc_grandloop:
+ movdqu xmm2,XMMWORD[rdi]
+ movdqu xmm3,XMMWORD[16+rdi]
+ movdqu xmm4,XMMWORD[32+rdi]
+ movdqu xmm5,XMMWORD[48+rdi]
+ movdqu xmm6,XMMWORD[64+rdi]
+ movdqu xmm7,XMMWORD[80+rdi]
+ lea rdi,[96+rdi]
+
+ call __ocb_encrypt6
+
+ movups XMMWORD[rsi],xmm2
+ movups XMMWORD[16+rsi],xmm3
+ movups XMMWORD[32+rsi],xmm4
+ movups XMMWORD[48+rsi],xmm5
+ movups XMMWORD[64+rsi],xmm6
+ movups XMMWORD[80+rsi],xmm7
+ lea rsi,[96+rsi]
+ sub rdx,6
+ jnc NEAR $L$ocb_enc_grandloop
+
+$L$ocb_enc_short:
+ add rdx,6
+ jz NEAR $L$ocb_enc_done
+
+ movdqu xmm2,XMMWORD[rdi]
+ cmp rdx,2
+ jb NEAR $L$ocb_enc_one
+ movdqu xmm3,XMMWORD[16+rdi]
+ je NEAR $L$ocb_enc_two
+
+ movdqu xmm4,XMMWORD[32+rdi]
+ cmp rdx,4
+ jb NEAR $L$ocb_enc_three
+ movdqu xmm5,XMMWORD[48+rdi]
+ je NEAR $L$ocb_enc_four
+
+ movdqu xmm6,XMMWORD[64+rdi]
+ pxor xmm7,xmm7
+
+ call __ocb_encrypt6
+
+ movdqa xmm15,xmm14
+ movups XMMWORD[rsi],xmm2
+ movups XMMWORD[16+rsi],xmm3
+ movups XMMWORD[32+rsi],xmm4
+ movups XMMWORD[48+rsi],xmm5
+ movups XMMWORD[64+rsi],xmm6
+
+ jmp NEAR $L$ocb_enc_done
+
+ALIGN 16
+$L$ocb_enc_one:
+ movdqa xmm7,xmm10
+
+ call __ocb_encrypt1
+
+ movdqa xmm15,xmm7
+ movups XMMWORD[rsi],xmm2
+ jmp NEAR $L$ocb_enc_done
+
+ALIGN 16
+$L$ocb_enc_two:
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+
+ call __ocb_encrypt4
+
+ movdqa xmm15,xmm11
+ movups XMMWORD[rsi],xmm2
+ movups XMMWORD[16+rsi],xmm3
+
+ jmp NEAR $L$ocb_enc_done
+
+ALIGN 16
+$L$ocb_enc_three:
+ pxor xmm5,xmm5
+
+ call __ocb_encrypt4
+
+ movdqa xmm15,xmm12
+ movups XMMWORD[rsi],xmm2
+ movups XMMWORD[16+rsi],xmm3
+ movups XMMWORD[32+rsi],xmm4
+
+ jmp NEAR $L$ocb_enc_done
+
+ALIGN 16
+$L$ocb_enc_four:
+ call __ocb_encrypt4
+
+ movdqa xmm15,xmm13
+ movups XMMWORD[rsi],xmm2
+ movups XMMWORD[16+rsi],xmm3
+ movups XMMWORD[32+rsi],xmm4
+ movups XMMWORD[48+rsi],xmm5
+
+$L$ocb_enc_done:
+ pxor xmm15,xmm0
+ movdqu XMMWORD[rbp],xmm8
+ movdqu XMMWORD[r9],xmm15
+
+ xorps xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ movaps xmm6,XMMWORD[rsp]
+ movaps XMMWORD[rsp],xmm0
+ movaps xmm7,XMMWORD[16+rsp]
+ movaps XMMWORD[16+rsp],xmm0
+ movaps xmm8,XMMWORD[32+rsp]
+ movaps XMMWORD[32+rsp],xmm0
+ movaps xmm9,XMMWORD[48+rsp]
+ movaps XMMWORD[48+rsp],xmm0
+ movaps xmm10,XMMWORD[64+rsp]
+ movaps XMMWORD[64+rsp],xmm0
+ movaps xmm11,XMMWORD[80+rsp]
+ movaps XMMWORD[80+rsp],xmm0
+ movaps xmm12,XMMWORD[96+rsp]
+ movaps XMMWORD[96+rsp],xmm0
+ movaps xmm13,XMMWORD[112+rsp]
+ movaps XMMWORD[112+rsp],xmm0
+ movaps xmm14,XMMWORD[128+rsp]
+ movaps XMMWORD[128+rsp],xmm0
+ movaps xmm15,XMMWORD[144+rsp]
+ movaps XMMWORD[144+rsp],xmm0
+ lea rax,[((160+40))+rsp]
+$L$ocb_enc_pop:
+ mov r14,QWORD[((-40))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov rbp,QWORD[((-16))+rax]
+ mov rbx,QWORD[((-8))+rax]
+ lea rsp,[rax]
+$L$ocb_enc_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+$L$SEH_end_aesni_ocb_encrypt:
+
+
+ALIGN 32
+__ocb_encrypt6:
+ pxor xmm15,xmm9
+ movdqu xmm11,XMMWORD[r12*1+rbx]
+ movdqa xmm12,xmm10
+ movdqu xmm13,XMMWORD[r13*1+rbx]
+ movdqa xmm14,xmm10
+ pxor xmm10,xmm15
+ movdqu xmm15,XMMWORD[r14*1+rbx]
+ pxor xmm11,xmm10
+ pxor xmm8,xmm2
+ pxor xmm2,xmm10
+ pxor xmm12,xmm11
+ pxor xmm8,xmm3
+ pxor xmm3,xmm11
+ pxor xmm13,xmm12
+ pxor xmm8,xmm4
+ pxor xmm4,xmm12
+ pxor xmm14,xmm13
+ pxor xmm8,xmm5
+ pxor xmm5,xmm13
+ pxor xmm15,xmm14
+ pxor xmm8,xmm6
+ pxor xmm6,xmm14
+ pxor xmm8,xmm7
+ pxor xmm7,xmm15
+ movups xmm0,XMMWORD[32+r11]
+
+ lea r12,[1+r8]
+ lea r13,[3+r8]
+ lea r14,[5+r8]
+ add r8,6
+ pxor xmm10,xmm9
+ bsf r12,r12
+ bsf r13,r13
+ bsf r14,r14
+
+DB 102,15,56,220,209
+DB 102,15,56,220,217
+DB 102,15,56,220,225
+DB 102,15,56,220,233
+ pxor xmm11,xmm9
+ pxor xmm12,xmm9
+DB 102,15,56,220,241
+ pxor xmm13,xmm9
+ pxor xmm14,xmm9
+DB 102,15,56,220,249
+ movups xmm1,XMMWORD[48+r11]
+ pxor xmm15,xmm9
+
+DB 102,15,56,220,208
+DB 102,15,56,220,216
+DB 102,15,56,220,224
+DB 102,15,56,220,232
+DB 102,15,56,220,240
+DB 102,15,56,220,248
+ movups xmm0,XMMWORD[64+r11]
+ shl r12,4
+ shl r13,4
+ jmp NEAR $L$ocb_enc_loop6
+
+ALIGN 32
+$L$ocb_enc_loop6:
+DB 102,15,56,220,209
+DB 102,15,56,220,217
+DB 102,15,56,220,225
+DB 102,15,56,220,233
+DB 102,15,56,220,241
+DB 102,15,56,220,249
+ movups xmm1,XMMWORD[rax*1+rcx]
+ add rax,32
+
+DB 102,15,56,220,208
+DB 102,15,56,220,216
+DB 102,15,56,220,224
+DB 102,15,56,220,232
+DB 102,15,56,220,240
+DB 102,15,56,220,248
+ movups xmm0,XMMWORD[((-16))+rax*1+rcx]
+ jnz NEAR $L$ocb_enc_loop6
+
+DB 102,15,56,220,209
+DB 102,15,56,220,217
+DB 102,15,56,220,225
+DB 102,15,56,220,233
+DB 102,15,56,220,241
+DB 102,15,56,220,249
+ movups xmm1,XMMWORD[16+r11]
+ shl r14,4
+
+DB 102,65,15,56,221,210
+ movdqu xmm10,XMMWORD[rbx]
+ mov rax,r10
+DB 102,65,15,56,221,219
+DB 102,65,15,56,221,228
+DB 102,65,15,56,221,237
+DB 102,65,15,56,221,246
+DB 102,65,15,56,221,255
+ DB 0F3h,0C3h ;repret
+
+
+
+ALIGN 32
+__ocb_encrypt4:
+ pxor xmm15,xmm9
+ movdqu xmm11,XMMWORD[r12*1+rbx]
+ movdqa xmm12,xmm10
+ movdqu xmm13,XMMWORD[r13*1+rbx]
+ pxor xmm10,xmm15
+ pxor xmm11,xmm10
+ pxor xmm8,xmm2
+ pxor xmm2,xmm10
+ pxor xmm12,xmm11
+ pxor xmm8,xmm3
+ pxor xmm3,xmm11
+ pxor xmm13,xmm12
+ pxor xmm8,xmm4
+ pxor xmm4,xmm12
+ pxor xmm8,xmm5
+ pxor xmm5,xmm13
+ movups xmm0,XMMWORD[32+r11]
+
+ pxor xmm10,xmm9
+ pxor xmm11,xmm9
+ pxor xmm12,xmm9
+ pxor xmm13,xmm9
+
+DB 102,15,56,220,209
+DB 102,15,56,220,217
+DB 102,15,56,220,225
+DB 102,15,56,220,233
+ movups xmm1,XMMWORD[48+r11]
+
+DB 102,15,56,220,208
+DB 102,15,56,220,216
+DB 102,15,56,220,224
+DB 102,15,56,220,232
+ movups xmm0,XMMWORD[64+r11]
+ jmp NEAR $L$ocb_enc_loop4
+
+ALIGN 32
+$L$ocb_enc_loop4:
+DB 102,15,56,220,209
+DB 102,15,56,220,217
+DB 102,15,56,220,225
+DB 102,15,56,220,233
+ movups xmm1,XMMWORD[rax*1+rcx]
+ add rax,32
+
+DB 102,15,56,220,208
+DB 102,15,56,220,216
+DB 102,15,56,220,224
+DB 102,15,56,220,232
+ movups xmm0,XMMWORD[((-16))+rax*1+rcx]
+ jnz NEAR $L$ocb_enc_loop4
+
+DB 102,15,56,220,209
+DB 102,15,56,220,217
+DB 102,15,56,220,225
+DB 102,15,56,220,233
+ movups xmm1,XMMWORD[16+r11]
+ mov rax,r10
+
+DB 102,65,15,56,221,210
+DB 102,65,15,56,221,219
+DB 102,65,15,56,221,228
+DB 102,65,15,56,221,237
+ DB 0F3h,0C3h ;repret
+
+
+
+ALIGN 32
+__ocb_encrypt1:
+ pxor xmm7,xmm15
+ pxor xmm7,xmm9
+ pxor xmm8,xmm2
+ pxor xmm2,xmm7
+ movups xmm0,XMMWORD[32+r11]
+
+DB 102,15,56,220,209
+ movups xmm1,XMMWORD[48+r11]
+ pxor xmm7,xmm9
+
+DB 102,15,56,220,208
+ movups xmm0,XMMWORD[64+r11]
+ jmp NEAR $L$ocb_enc_loop1
+
+ALIGN 32
+$L$ocb_enc_loop1:
+DB 102,15,56,220,209
+ movups xmm1,XMMWORD[rax*1+rcx]
+ add rax,32
+
+DB 102,15,56,220,208
+ movups xmm0,XMMWORD[((-16))+rax*1+rcx]
+ jnz NEAR $L$ocb_enc_loop1
+
+DB 102,15,56,220,209
+ movups xmm1,XMMWORD[16+r11]
+ mov rax,r10
+
+DB 102,15,56,221,215
+ DB 0F3h,0C3h ;repret
+
+
+global aesni_ocb_decrypt
+
+ALIGN 32
+aesni_ocb_decrypt:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_aesni_ocb_decrypt:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+ mov rcx,r9
+ mov r8,QWORD[40+rsp]
+ mov r9,QWORD[48+rsp]
+
+
+ lea rax,[rsp]
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ lea rsp,[((-160))+rsp]
+ movaps XMMWORD[rsp],xmm6
+ movaps XMMWORD[16+rsp],xmm7
+ movaps XMMWORD[32+rsp],xmm8
+ movaps XMMWORD[48+rsp],xmm9
+ movaps XMMWORD[64+rsp],xmm10
+ movaps XMMWORD[80+rsp],xmm11
+ movaps XMMWORD[96+rsp],xmm12
+ movaps XMMWORD[112+rsp],xmm13
+ movaps XMMWORD[128+rsp],xmm14
+ movaps XMMWORD[144+rsp],xmm15
+$L$ocb_dec_body:
+ mov rbx,QWORD[56+rax]
+ mov rbp,QWORD[((56+8))+rax]
+
+ mov r10d,DWORD[240+rcx]
+ mov r11,rcx
+ shl r10d,4
+ movups xmm9,XMMWORD[rcx]
+ movups xmm1,XMMWORD[16+r10*1+rcx]
+
+ movdqu xmm15,XMMWORD[r9]
+ pxor xmm9,xmm1
+ pxor xmm15,xmm1
+
+ mov eax,16+32
+ lea rcx,[32+r10*1+r11]
+ movups xmm1,XMMWORD[16+r11]
+ sub rax,r10
+ mov r10,rax
+
+ movdqu xmm10,XMMWORD[rbx]
+ movdqu xmm8,XMMWORD[rbp]
+
+ test r8,1
+ jnz NEAR $L$ocb_dec_odd
+
+ bsf r12,r8
+ add r8,1
+ shl r12,4
+ movdqu xmm7,XMMWORD[r12*1+rbx]
+ movdqu xmm2,XMMWORD[rdi]
+ lea rdi,[16+rdi]
+
+ call __ocb_decrypt1
+
+ movdqa xmm15,xmm7
+ movups XMMWORD[rsi],xmm2
+ xorps xmm8,xmm2
+ lea rsi,[16+rsi]
+ sub rdx,1
+ jz NEAR $L$ocb_dec_done
+
+$L$ocb_dec_odd:
+ lea r12,[1+r8]
+ lea r13,[3+r8]
+ lea r14,[5+r8]
+ lea r8,[6+r8]
+ bsf r12,r12
+ bsf r13,r13
+ bsf r14,r14
+ shl r12,4
+ shl r13,4
+ shl r14,4
+
+ sub rdx,6
+ jc NEAR $L$ocb_dec_short
+ jmp NEAR $L$ocb_dec_grandloop
+
+ALIGN 32
+$L$ocb_dec_grandloop:
+ movdqu xmm2,XMMWORD[rdi]
+ movdqu xmm3,XMMWORD[16+rdi]
+ movdqu xmm4,XMMWORD[32+rdi]
+ movdqu xmm5,XMMWORD[48+rdi]
+ movdqu xmm6,XMMWORD[64+rdi]
+ movdqu xmm7,XMMWORD[80+rdi]
+ lea rdi,[96+rdi]
+
+ call __ocb_decrypt6
+
+ movups XMMWORD[rsi],xmm2
+ pxor xmm8,xmm2
+ movups XMMWORD[16+rsi],xmm3
+ pxor xmm8,xmm3
+ movups XMMWORD[32+rsi],xmm4
+ pxor xmm8,xmm4
+ movups XMMWORD[48+rsi],xmm5
+ pxor xmm8,xmm5
+ movups XMMWORD[64+rsi],xmm6
+ pxor xmm8,xmm6
+ movups XMMWORD[80+rsi],xmm7
+ pxor xmm8,xmm7
+ lea rsi,[96+rsi]
+ sub rdx,6
+ jnc NEAR $L$ocb_dec_grandloop
+
+$L$ocb_dec_short:
+ add rdx,6
+ jz NEAR $L$ocb_dec_done
+
+ movdqu xmm2,XMMWORD[rdi]
+ cmp rdx,2
+ jb NEAR $L$ocb_dec_one
+ movdqu xmm3,XMMWORD[16+rdi]
+ je NEAR $L$ocb_dec_two
+
+ movdqu xmm4,XMMWORD[32+rdi]
+ cmp rdx,4
+ jb NEAR $L$ocb_dec_three
+ movdqu xmm5,XMMWORD[48+rdi]
+ je NEAR $L$ocb_dec_four
+
+ movdqu xmm6,XMMWORD[64+rdi]
+ pxor xmm7,xmm7
+
+ call __ocb_decrypt6
+
+ movdqa xmm15,xmm14
+ movups XMMWORD[rsi],xmm2
+ pxor xmm8,xmm2
+ movups XMMWORD[16+rsi],xmm3
+ pxor xmm8,xmm3
+ movups XMMWORD[32+rsi],xmm4
+ pxor xmm8,xmm4
+ movups XMMWORD[48+rsi],xmm5
+ pxor xmm8,xmm5
+ movups XMMWORD[64+rsi],xmm6
+ pxor xmm8,xmm6
+
+ jmp NEAR $L$ocb_dec_done
+
+ALIGN 16
+$L$ocb_dec_one:
+ movdqa xmm7,xmm10
+
+ call __ocb_decrypt1
+
+ movdqa xmm15,xmm7
+ movups XMMWORD[rsi],xmm2
+ xorps xmm8,xmm2
+ jmp NEAR $L$ocb_dec_done
+
+ALIGN 16
+$L$ocb_dec_two:
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+
+ call __ocb_decrypt4
+
+ movdqa xmm15,xmm11
+ movups XMMWORD[rsi],xmm2
+ xorps xmm8,xmm2
+ movups XMMWORD[16+rsi],xmm3
+ xorps xmm8,xmm3
+
+ jmp NEAR $L$ocb_dec_done
+
+ALIGN 16
+$L$ocb_dec_three:
+ pxor xmm5,xmm5
+
+ call __ocb_decrypt4
+
+ movdqa xmm15,xmm12
+ movups XMMWORD[rsi],xmm2
+ xorps xmm8,xmm2
+ movups XMMWORD[16+rsi],xmm3
+ xorps xmm8,xmm3
+ movups XMMWORD[32+rsi],xmm4
+ xorps xmm8,xmm4
+
+ jmp NEAR $L$ocb_dec_done
+
+ALIGN 16
+$L$ocb_dec_four:
+ call __ocb_decrypt4
+
+ movdqa xmm15,xmm13
+ movups XMMWORD[rsi],xmm2
+ pxor xmm8,xmm2
+ movups XMMWORD[16+rsi],xmm3
+ pxor xmm8,xmm3
+ movups XMMWORD[32+rsi],xmm4
+ pxor xmm8,xmm4
+ movups XMMWORD[48+rsi],xmm5
+ pxor xmm8,xmm5
+
+$L$ocb_dec_done:
+ pxor xmm15,xmm0
+ movdqu XMMWORD[rbp],xmm8
+ movdqu XMMWORD[r9],xmm15
+
+ xorps xmm0,xmm0
+ pxor xmm1,xmm1
+ pxor xmm2,xmm2
+ pxor xmm3,xmm3
+ pxor xmm4,xmm4
+ pxor xmm5,xmm5
+ movaps xmm6,XMMWORD[rsp]
+ movaps XMMWORD[rsp],xmm0
+ movaps xmm7,XMMWORD[16+rsp]
+ movaps XMMWORD[16+rsp],xmm0
+ movaps xmm8,XMMWORD[32+rsp]
+ movaps XMMWORD[32+rsp],xmm0
+ movaps xmm9,XMMWORD[48+rsp]
+ movaps XMMWORD[48+rsp],xmm0
+ movaps xmm10,XMMWORD[64+rsp]
+ movaps XMMWORD[64+rsp],xmm0
+ movaps xmm11,XMMWORD[80+rsp]
+ movaps XMMWORD[80+rsp],xmm0
+ movaps xmm12,XMMWORD[96+rsp]
+ movaps XMMWORD[96+rsp],xmm0
+ movaps xmm13,XMMWORD[112+rsp]
+ movaps XMMWORD[112+rsp],xmm0
+ movaps xmm14,XMMWORD[128+rsp]
+ movaps XMMWORD[128+rsp],xmm0
+ movaps xmm15,XMMWORD[144+rsp]
+ movaps XMMWORD[144+rsp],xmm0
+ lea rax,[((160+40))+rsp]
+$L$ocb_dec_pop:
+ mov r14,QWORD[((-40))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov rbp,QWORD[((-16))+rax]
+ mov rbx,QWORD[((-8))+rax]
+ lea rsp,[rax]
+$L$ocb_dec_epilogue:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+$L$SEH_end_aesni_ocb_decrypt:
+
+
+ALIGN 32
+__ocb_decrypt6:
+ pxor xmm15,xmm9
+ movdqu xmm11,XMMWORD[r12*1+rbx]
+ movdqa xmm12,xmm10
+ movdqu xmm13,XMMWORD[r13*1+rbx]
+ movdqa xmm14,xmm10
+ pxor xmm10,xmm15
+ movdqu xmm15,XMMWORD[r14*1+rbx]
+ pxor xmm11,xmm10
+ pxor xmm2,xmm10
+ pxor xmm12,xmm11
+ pxor xmm3,xmm11
+ pxor xmm13,xmm12
+ pxor xmm4,xmm12
+ pxor xmm14,xmm13
+ pxor xmm5,xmm13
+ pxor xmm15,xmm14
+ pxor xmm6,xmm14
+ pxor xmm7,xmm15
+ movups xmm0,XMMWORD[32+r11]
+
+ lea r12,[1+r8]
+ lea r13,[3+r8]
+ lea r14,[5+r8]
+ add r8,6
+ pxor xmm10,xmm9
+ bsf r12,r12
+ bsf r13,r13
+ bsf r14,r14
+
+DB 102,15,56,222,209
+DB 102,15,56,222,217
+DB 102,15,56,222,225
+DB 102,15,56,222,233
+ pxor xmm11,xmm9
+ pxor xmm12,xmm9
+DB 102,15,56,222,241
+ pxor xmm13,xmm9
+ pxor xmm14,xmm9
+DB 102,15,56,222,249
+ movups xmm1,XMMWORD[48+r11]
+ pxor xmm15,xmm9
+
+DB 102,15,56,222,208
+DB 102,15,56,222,216
+DB 102,15,56,222,224
+DB 102,15,56,222,232
+DB 102,15,56,222,240
+DB 102,15,56,222,248
+ movups xmm0,XMMWORD[64+r11]
+ shl r12,4
+ shl r13,4
+ jmp NEAR $L$ocb_dec_loop6
+
+ALIGN 32
+$L$ocb_dec_loop6:
+DB 102,15,56,222,209
+DB 102,15,56,222,217
+DB 102,15,56,222,225
+DB 102,15,56,222,233
+DB 102,15,56,222,241
+DB 102,15,56,222,249
+ movups xmm1,XMMWORD[rax*1+rcx]
+ add rax,32
+
+DB 102,15,56,222,208
+DB 102,15,56,222,216
+DB 102,15,56,222,224
+DB 102,15,56,222,232
+DB 102,15,56,222,240
+DB 102,15,56,222,248
+ movups xmm0,XMMWORD[((-16))+rax*1+rcx]
+ jnz NEAR $L$ocb_dec_loop6
+
+DB 102,15,56,222,209
+DB 102,15,56,222,217
+DB 102,15,56,222,225
+DB 102,15,56,222,233
+DB 102,15,56,222,241
+DB 102,15,56,222,249
+ movups xmm1,XMMWORD[16+r11]
+ shl r14,4
+
+DB 102,65,15,56,223,210
+ movdqu xmm10,XMMWORD[rbx]
+ mov rax,r10
+DB 102,65,15,56,223,219
+DB 102,65,15,56,223,228
+DB 102,65,15,56,223,237
+DB 102,65,15,56,223,246
+DB 102,65,15,56,223,255
+ DB 0F3h,0C3h ;repret
+
+
+
+ALIGN 32
+__ocb_decrypt4:
+ pxor xmm15,xmm9
+ movdqu xmm11,XMMWORD[r12*1+rbx]
+ movdqa xmm12,xmm10
+ movdqu xmm13,XMMWORD[r13*1+rbx]
+ pxor xmm10,xmm15
+ pxor xmm11,xmm10
+ pxor xmm2,xmm10
+ pxor xmm12,xmm11
+ pxor xmm3,xmm11
+ pxor xmm13,xmm12
+ pxor xmm4,xmm12
+ pxor xmm5,xmm13
+ movups xmm0,XMMWORD[32+r11]
+
+ pxor xmm10,xmm9
+ pxor xmm11,xmm9
+ pxor xmm12,xmm9
+ pxor xmm13,xmm9
+
+DB 102,15,56,222,209
+DB 102,15,56,222,217
+DB 102,15,56,222,225
+DB 102,15,56,222,233
+ movups xmm1,XMMWORD[48+r11]
+
+DB 102,15,56,222,208
+DB 102,15,56,222,216
+DB 102,15,56,222,224
+DB 102,15,56,222,232
+ movups xmm0,XMMWORD[64+r11]
+ jmp NEAR $L$ocb_dec_loop4
+
+ALIGN 32
+$L$ocb_dec_loop4:
+DB 102,15,56,222,209
+DB 102,15,56,222,217
+DB 102,15,56,222,225
+DB 102,15,56,222,233
+ movups xmm1,XMMWORD[rax*1+rcx]
+ add rax,32
+
+DB 102,15,56,222,208
+DB 102,15,56,222,216
+DB 102,15,56,222,224
+DB 102,15,56,222,232
+ movups xmm0,XMMWORD[((-16))+rax*1+rcx]
+ jnz NEAR $L$ocb_dec_loop4
+
+DB 102,15,56,222,209
+DB 102,15,56,222,217
+DB 102,15,56,222,225
+DB 102,15,56,222,233
+ movups xmm1,XMMWORD[16+r11]
+ mov rax,r10
+
+DB 102,65,15,56,223,210
+DB 102,65,15,56,223,219
+DB 102,65,15,56,223,228
+DB 102,65,15,56,223,237
+ DB 0F3h,0C3h ;repret
+
+
+
+ALIGN 32
+__ocb_decrypt1:
+ pxor xmm7,xmm15
+ pxor xmm7,xmm9
+ pxor xmm2,xmm7
+ movups xmm0,XMMWORD[32+r11]
+
+DB 102,15,56,222,209
+ movups xmm1,XMMWORD[48+r11]
+ pxor xmm7,xmm9
+
+DB 102,15,56,222,208
+ movups xmm0,XMMWORD[64+r11]
+ jmp NEAR $L$ocb_dec_loop1
+
+ALIGN 32
+$L$ocb_dec_loop1:
+DB 102,15,56,222,209
+ movups xmm1,XMMWORD[rax*1+rcx]
+ add rax,32
+
+DB 102,15,56,222,208
+ movups xmm0,XMMWORD[((-16))+rax*1+rcx]
+ jnz NEAR $L$ocb_dec_loop1
+
+DB 102,15,56,222,209
+ movups xmm1,XMMWORD[16+r11]
+ mov rax,r10
+
+DB 102,15,56,223,215
+ DB 0F3h,0C3h ;repret
+
global aesni_cbc_encrypt
ALIGN 16
@@ -2837,7 +3722,7 @@ DB 102,15,56,223,209
jmp NEAR $L$cbc_ret
ALIGN 16
$L$cbc_decrypt_bulk:
- lea rax,[rsp]
+ lea r11,[rsp]
push rbp
sub rsp,176
and rsp,-16
@@ -2852,7 +3737,7 @@ $L$cbc_decrypt_bulk:
movaps XMMWORD[144+rsp],xmm14
movaps XMMWORD[160+rsp],xmm15
$L$cbc_decrypt_body:
- lea rbp,[((-8))+rax]
+ mov rbp,rcx
movups xmm10,XMMWORD[r8]
mov eax,r10d
cmp rdx,0x50
@@ -2892,7 +3777,7 @@ $L$cbc_dec_loop8_enter:
pxor xmm3,xmm0
movups xmm1,XMMWORD[((16-112))+rcx]
pxor xmm4,xmm0
- xor r11,r11
+ mov rbp,-1
cmp rdx,0x70
pxor xmm5,xmm0
pxor xmm6,xmm0
@@ -2908,10 +3793,10 @@ DB 102,15,56,222,233
DB 102,15,56,222,241
DB 102,15,56,222,249
DB 102,68,15,56,222,193
- setnc r11b
- shl r11,7
+ adc rbp,0
+ and rbp,128
DB 102,68,15,56,222,201
- add r11,rdi
+ add rbp,rdi
movups xmm1,XMMWORD[((48-112))+rcx]
DB 102,15,56,222,208
DB 102,15,56,222,216
@@ -3049,18 +3934,18 @@ DB 102,65,15,56,223,219
movdqu xmm0,XMMWORD[112+rdi]
DB 102,65,15,56,223,228
lea rdi,[128+rdi]
- movdqu xmm11,XMMWORD[r11]
+ movdqu xmm11,XMMWORD[rbp]
DB 102,65,15,56,223,237
DB 102,65,15,56,223,246
- movdqu xmm12,XMMWORD[16+r11]
- movdqu xmm13,XMMWORD[32+r11]
+ movdqu xmm12,XMMWORD[16+rbp]
+ movdqu xmm13,XMMWORD[32+rbp]
DB 102,65,15,56,223,255
DB 102,68,15,56,223,193
- movdqu xmm14,XMMWORD[48+r11]
- movdqu xmm15,XMMWORD[64+r11]
+ movdqu xmm14,XMMWORD[48+rbp]
+ movdqu xmm15,XMMWORD[64+rbp]
DB 102,69,15,56,223,202
movdqa xmm10,xmm0
- movdqu xmm1,XMMWORD[80+r11]
+ movdqu xmm1,XMMWORD[80+rbp]
movups xmm0,XMMWORD[((-112))+rcx]
movups XMMWORD[rsi],xmm2
@@ -3179,7 +4064,7 @@ $L$cbc_dec_loop6_enter:
pxor xmm5,xmm13
movdqu XMMWORD[32+rsi],xmm4
pxor xmm6,xmm14
- mov rcx,r11
+ mov rcx,rbp
movdqu XMMWORD[48+rsi],xmm5
pxor xmm7,xmm15
mov eax,r10d
@@ -3348,8 +4233,8 @@ $L$cbc_dec_ret:
movaps XMMWORD[144+rsp],xmm0
movaps xmm15,XMMWORD[160+rsp]
movaps XMMWORD[160+rsp],xmm0
- lea rsp,[rbp]
- pop rbp
+ mov rbp,QWORD[((-8))+r11]
+ lea rsp,[r11]
$L$cbc_ret:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
@@ -3865,13 +4750,75 @@ ctr_xts_se_handler:
cmp rbx,r10
jae NEAR $L$common_seh_tail
- mov rax,QWORD[160+r8]
- lea rsi,[((-160))+rax]
+ mov rax,QWORD[208+r8]
+
+ lea rsi,[((-168))+rax]
+ lea rdi,[512+r8]
+ mov ecx,20
+ DD 0xa548f3fc
+
+ mov rbp,QWORD[((-8))+rax]
+ mov QWORD[160+r8],rbp
+ jmp NEAR $L$common_seh_tail
+
+
+
+ALIGN 16
+ocb_se_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ mov r10d,DWORD[8+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$ocb_no_xmm
+
+ mov rax,QWORD[152+r8]
+
+ lea rsi,[rax]
lea rdi,[512+r8]
mov ecx,20
DD 0xa548f3fc
+ lea rax,[((160+40))+rax]
- jmp NEAR $L$common_rbp_tail
+$L$ocb_no_xmm:
+ mov rbx,QWORD[((-8))+rax]
+ mov rbp,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+
+ mov QWORD[144+r8],rbx
+ mov QWORD[160+r8],rbp
+ mov QWORD[216+r8],r12
+ mov QWORD[224+r8],r13
+ mov QWORD[232+r8],r14
+
+ jmp NEAR $L$common_seh_tail
ALIGN 16
@@ -3894,9 +4841,13 @@ cbc_se_handler:
cmp rbx,r10
jb NEAR $L$common_seh_tail
+ mov rax,QWORD[120+r8]
+
lea r10,[$L$cbc_decrypt_body]
cmp rbx,r10
- jb NEAR $L$restore_cbc_rax
+ jb NEAR $L$common_seh_tail
+
+ mov rax,QWORD[152+r8]
lea r10,[$L$cbc_ret]
cmp rbx,r10
@@ -3907,15 +4858,10 @@ cbc_se_handler:
mov ecx,20
DD 0xa548f3fc
-$L$common_rbp_tail:
- mov rax,QWORD[160+r8]
- mov rbp,QWORD[rax]
- lea rax,[8+rax]
- mov QWORD[160+r8],rbp
- jmp NEAR $L$common_seh_tail
+ mov rax,QWORD[208+r8]
-$L$restore_cbc_rax:
- mov rax,QWORD[120+r8]
+ mov rbp,QWORD[((-8))+rax]
+ mov QWORD[160+r8],rbp
$L$common_seh_tail:
mov rdi,QWORD[8+rax]
@@ -3982,6 +4928,14 @@ ALIGN 4
DD $L$SEH_begin_aesni_xts_decrypt wrt ..imagebase
DD $L$SEH_end_aesni_xts_decrypt wrt ..imagebase
DD $L$SEH_info_xts_dec wrt ..imagebase
+
+ DD $L$SEH_begin_aesni_ocb_encrypt wrt ..imagebase
+ DD $L$SEH_end_aesni_ocb_encrypt wrt ..imagebase
+ DD $L$SEH_info_ocb_enc wrt ..imagebase
+
+ DD $L$SEH_begin_aesni_ocb_decrypt wrt ..imagebase
+ DD $L$SEH_end_aesni_ocb_decrypt wrt ..imagebase
+ DD $L$SEH_info_ocb_dec wrt ..imagebase
DD $L$SEH_begin_aesni_cbc_encrypt wrt ..imagebase
DD $L$SEH_end_aesni_cbc_encrypt wrt ..imagebase
DD $L$SEH_info_cbc wrt ..imagebase
@@ -4019,6 +4973,18 @@ $L$SEH_info_xts_dec:
DB 9,0,0,0
DD ctr_xts_se_handler wrt ..imagebase
DD $L$xts_dec_body wrt ..imagebase,$L$xts_dec_epilogue wrt ..imagebase
+$L$SEH_info_ocb_enc:
+DB 9,0,0,0
+ DD ocb_se_handler wrt ..imagebase
+ DD $L$ocb_enc_body wrt ..imagebase,$L$ocb_enc_epilogue wrt ..imagebase
+ DD $L$ocb_enc_pop wrt ..imagebase
+ DD 0
+$L$SEH_info_ocb_dec:
+DB 9,0,0,0
+ DD ocb_se_handler wrt ..imagebase
+ DD $L$ocb_dec_body wrt ..imagebase,$L$ocb_dec_epilogue wrt ..imagebase
+ DD $L$ocb_dec_pop wrt ..imagebase
+ DD 0
$L$SEH_info_cbc:
DB 9,0,0,0
DD cbc_se_handler wrt ..imagebase
diff --git a/win-x86_64/crypto/aes/bsaes-x86_64.asm b/win-x86_64/crypto/aes/bsaes-x86_64.asm
index 6d75248d..9c6d1293 100644
--- a/win-x86_64/crypto/aes/bsaes-x86_64.asm
+++ b/win-x86_64/crypto/aes/bsaes-x86_64.asm
@@ -1319,7 +1319,7 @@ $L$cbc_dec_bzero:
cmp rbp,rax
ja NEAR $L$cbc_dec_bzero
- lea rsp,[rbp]
+ lea rax,[120+rbp]
movaps xmm6,XMMWORD[64+rbp]
movaps xmm7,XMMWORD[80+rbp]
movaps xmm8,XMMWORD[96+rbp]
@@ -1330,15 +1330,15 @@ $L$cbc_dec_bzero:
movaps xmm13,XMMWORD[176+rbp]
movaps xmm14,XMMWORD[192+rbp]
movaps xmm15,XMMWORD[208+rbp]
- lea rsp,[160+rbp]
- mov r15,QWORD[72+rsp]
- mov r14,QWORD[80+rsp]
- mov r13,QWORD[88+rsp]
- mov r12,QWORD[96+rsp]
- mov rbx,QWORD[104+rsp]
- mov rax,QWORD[112+rsp]
- lea rsp,[120+rsp]
- mov rbp,rax
+ lea rax,[160+rax]
+$L$cbc_dec_tail:
+ mov r15,QWORD[((-48))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov rbx,QWORD[((-16))+rax]
+ mov rbp,QWORD[((-8))+rax]
+ lea rsp,[rax]
$L$cbc_dec_epilogue:
DB 0F3h,0C3h ;repret
@@ -1543,7 +1543,7 @@ $L$ctr_enc_bzero:
cmp rbp,rax
ja NEAR $L$ctr_enc_bzero
- lea rsp,[rbp]
+ lea rax,[120+rbp]
movaps xmm6,XMMWORD[64+rbp]
movaps xmm7,XMMWORD[80+rbp]
movaps xmm8,XMMWORD[96+rbp]
@@ -1554,15 +1554,15 @@ $L$ctr_enc_bzero:
movaps xmm13,XMMWORD[176+rbp]
movaps xmm14,XMMWORD[192+rbp]
movaps xmm15,XMMWORD[208+rbp]
- lea rsp,[160+rbp]
- mov r15,QWORD[72+rsp]
- mov r14,QWORD[80+rsp]
- mov r13,QWORD[88+rsp]
- mov r12,QWORD[96+rsp]
- mov rbx,QWORD[104+rsp]
- mov rax,QWORD[112+rsp]
- lea rsp,[120+rsp]
- mov rbp,rax
+ lea rax,[160+rax]
+$L$ctr_enc_tail:
+ mov r15,QWORD[((-48))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov rbx,QWORD[((-16))+rax]
+ mov rbp,QWORD[((-8))+rax]
+ lea rsp,[rax]
$L$ctr_enc_epilogue:
DB 0F3h,0C3h ;repret
@@ -2019,7 +2019,7 @@ $L$xts_enc_bzero:
cmp rbp,rax
ja NEAR $L$xts_enc_bzero
- lea rsp,[rbp]
+ lea rax,[120+rbp]
movaps xmm6,XMMWORD[64+rbp]
movaps xmm7,XMMWORD[80+rbp]
movaps xmm8,XMMWORD[96+rbp]
@@ -2030,15 +2030,15 @@ $L$xts_enc_bzero:
movaps xmm13,XMMWORD[176+rbp]
movaps xmm14,XMMWORD[192+rbp]
movaps xmm15,XMMWORD[208+rbp]
- lea rsp,[160+rbp]
- mov r15,QWORD[72+rsp]
- mov r14,QWORD[80+rsp]
- mov r13,QWORD[88+rsp]
- mov r12,QWORD[96+rsp]
- mov rbx,QWORD[104+rsp]
- mov rax,QWORD[112+rsp]
- lea rsp,[120+rsp]
- mov rbp,rax
+ lea rax,[160+rax]
+$L$xts_enc_tail:
+ mov r15,QWORD[((-48))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov rbx,QWORD[((-16))+rax]
+ mov rbp,QWORD[((-8))+rax]
+ lea rsp,[rax]
$L$xts_enc_epilogue:
DB 0F3h,0C3h ;repret
@@ -2522,7 +2522,7 @@ $L$xts_dec_bzero:
cmp rbp,rax
ja NEAR $L$xts_dec_bzero
- lea rsp,[rbp]
+ lea rax,[120+rbp]
movaps xmm6,XMMWORD[64+rbp]
movaps xmm7,XMMWORD[80+rbp]
movaps xmm8,XMMWORD[96+rbp]
@@ -2533,15 +2533,15 @@ $L$xts_dec_bzero:
movaps xmm13,XMMWORD[176+rbp]
movaps xmm14,XMMWORD[192+rbp]
movaps xmm15,XMMWORD[208+rbp]
- lea rsp,[160+rbp]
- mov r15,QWORD[72+rsp]
- mov r14,QWORD[80+rsp]
- mov r13,QWORD[88+rsp]
- mov r12,QWORD[96+rsp]
- mov rbx,QWORD[104+rsp]
- mov rax,QWORD[112+rsp]
- lea rsp,[120+rsp]
- mov rbp,rax
+ lea rax,[160+rax]
+$L$xts_dec_tail:
+ mov r15,QWORD[((-48))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov rbx,QWORD[((-16))+rax]
+ mov rbp,QWORD[((-8))+rax]
+ lea rsp,[rax]
$L$xts_dec_epilogue:
DB 0F3h,0C3h ;repret
@@ -2628,30 +2628,33 @@ se_handler:
mov r10d,DWORD[r11]
lea r10,[r10*1+rsi]
cmp rbx,r10
- jb NEAR $L$in_prologue
-
- mov rax,QWORD[152+r8]
+ jbe NEAR $L$in_prologue
mov r10d,DWORD[4+r11]
lea r10,[r10*1+rsi]
cmp rbx,r10
jae NEAR $L$in_prologue
+ mov r10d,DWORD[8+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$in_tail
+
mov rax,QWORD[160+r8]
lea rsi,[64+rax]
lea rdi,[512+r8]
mov ecx,20
DD 0xa548f3fc
- lea rax,[160+rax]
-
- mov rbp,QWORD[112+rax]
- mov rbx,QWORD[104+rax]
- mov r12,QWORD[96+rax]
- mov r13,QWORD[88+rax]
- mov r14,QWORD[80+rax]
- mov r15,QWORD[72+rax]
- lea rax,[120+rax]
+ lea rax,[((160+120))+rax]
+
+$L$in_tail:
+ mov rbp,QWORD[((-48))+rax]
+ mov rbx,QWORD[((-40))+rax]
+ mov r12,QWORD[((-32))+rax]
+ mov r13,QWORD[((-24))+rax]
+ mov r14,QWORD[((-16))+rax]
+ mov r15,QWORD[((-8))+rax]
mov QWORD[144+r8],rbx
mov QWORD[160+r8],rbp
mov QWORD[216+r8],r12
@@ -2719,15 +2722,23 @@ $L$cbc_dec_info:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$cbc_dec_body wrt ..imagebase,$L$cbc_dec_epilogue wrt ..imagebase
+ DD $L$cbc_dec_tail wrt ..imagebase
+ DD 0
$L$ctr_enc_info:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$ctr_enc_body wrt ..imagebase,$L$ctr_enc_epilogue wrt ..imagebase
+ DD $L$ctr_enc_tail wrt ..imagebase
+ DD 0
$L$xts_enc_info:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$xts_enc_body wrt ..imagebase,$L$xts_enc_epilogue wrt ..imagebase
+ DD $L$xts_enc_tail wrt ..imagebase
+ DD 0
$L$xts_dec_info:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$xts_dec_body wrt ..imagebase,$L$xts_dec_epilogue wrt ..imagebase
+ DD $L$xts_dec_tail wrt ..imagebase
+ DD 0
diff --git a/win-x86_64/crypto/bn/x86_64-mont.asm b/win-x86_64/crypto/bn/x86_64-mont.asm
index 4d8e1cb7..1a9da512 100644
--- a/win-x86_64/crypto/bn/x86_64-mont.asm
+++ b/win-x86_64/crypto/bn/x86_64-mont.asm
@@ -23,6 +23,10 @@ $L$SEH_begin_bn_mul_mont:
mov r9,QWORD[48+rsp]
+
+ mov r9d,r9d
+ mov rax,rsp
+
test r9d,3
jnz NEAR $L$mul_enter
cmp r9d,8
@@ -36,20 +40,50 @@ $L$SEH_begin_bn_mul_mont:
ALIGN 16
$L$mul_enter:
push rbx
+
push rbp
+
push r12
+
push r13
+
push r14
+
push r15
- mov r9d,r9d
- lea r10,[2+r9]
+
+ neg r9
mov r11,rsp
- neg r10
- lea rsp,[r10*8+rsp]
- and rsp,-1024
+ lea r10,[((-16))+r9*8+rsp]
+ neg r9
+ and r10,-1024
+
+
+
+
+
+
+
+
+
+ sub r11,r10
+ and r11,-4096
+ lea rsp,[r11*1+r10]
+ mov r11,QWORD[rsp]
+ cmp rsp,r10
+ ja NEAR $L$mul_page_walk
+ jmp NEAR $L$mul_page_walk_done
+
+ALIGN 16
+$L$mul_page_walk:
+ lea rsp,[((-4096))+rsp]
+ mov r11,QWORD[rsp]
+ cmp rsp,r10
+ ja NEAR $L$mul_page_walk
+$L$mul_page_walk_done:
+
+ mov QWORD[8+r9*8+rsp],rax
- mov QWORD[8+r9*8+rsp],r11
$L$mul_body:
mov r12,rdx
mov r8,QWORD[r8]
@@ -201,33 +235,43 @@ $L$sub: sbb rax,QWORD[r14*8+rcx]
sbb rax,0
xor r14,r14
+ and rsi,rax
+ not rax
+ mov rcx,rdi
+ and rcx,rax
mov r15,r9
+ or rsi,rcx
ALIGN 16
$L$copy:
- mov rsi,QWORD[r14*8+rsp]
- mov rcx,QWORD[r14*8+rdi]
- xor rsi,rcx
- and rsi,rax
- xor rsi,rcx
+ mov rax,QWORD[r14*8+rsi]
mov QWORD[r14*8+rsp],r14
- mov QWORD[r14*8+rdi],rsi
+ mov QWORD[r14*8+rdi],rax
lea r14,[1+r14]
sub r15,1
jnz NEAR $L$copy
mov rsi,QWORD[8+r9*8+rsp]
+
mov rax,1
- mov r15,QWORD[rsi]
- mov r14,QWORD[8+rsi]
- mov r13,QWORD[16+rsi]
- mov r12,QWORD[24+rsi]
- mov rbp,QWORD[32+rsi]
- mov rbx,QWORD[40+rsi]
- lea rsp,[48+rsi]
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
$L$mul_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
+
$L$SEH_end_bn_mul_mont:
ALIGN 16
@@ -244,22 +288,47 @@ $L$SEH_begin_bn_mul4x_mont:
mov r9,QWORD[48+rsp]
+
+ mov r9d,r9d
+ mov rax,rsp
+
$L$mul4x_enter:
push rbx
+
push rbp
+
push r12
+
push r13
+
push r14
+
push r15
- mov r9d,r9d
- lea r10,[4+r9]
+
+ neg r9
mov r11,rsp
- neg r10
- lea rsp,[r10*8+rsp]
- and rsp,-1024
+ lea r10,[((-32))+r9*8+rsp]
+ neg r9
+ and r10,-1024
+
+ sub r11,r10
+ and r11,-4096
+ lea rsp,[r11*1+r10]
+ mov r11,QWORD[rsp]
+ cmp rsp,r10
+ ja NEAR $L$mul4x_page_walk
+ jmp NEAR $L$mul4x_page_walk_done
+
+$L$mul4x_page_walk:
+ lea rsp,[((-4096))+rsp]
+ mov r11,QWORD[rsp]
+ cmp rsp,r10
+ ja NEAR $L$mul4x_page_walk
+$L$mul4x_page_walk_done:
+
+ mov QWORD[8+r9*8+rsp],rax
- mov QWORD[8+r9*8+rsp],r11
$L$mul4x_body:
mov QWORD[16+r9*8+rsp],rdi
mov r12,rdx
@@ -559,9 +628,11 @@ $L$inner4x:
cmp r14,r9
jb NEAR $L$outer4x
mov rdi,QWORD[16+r9*8+rsp]
+ lea r15,[((-4))+r9]
mov rax,QWORD[rsp]
+ pxor xmm0,xmm0
mov rdx,QWORD[8+rsp]
- shr r9,2
+ shr r15,2
lea rsi,[rsp]
xor r14,r14
@@ -569,7 +640,6 @@ $L$inner4x:
mov rbx,QWORD[16+rsi]
mov rbp,QWORD[24+rsi]
sbb rdx,QWORD[8+rcx]
- lea r15,[((-1))+r9]
jmp NEAR $L$sub4x
ALIGN 16
$L$sub4x:
@@ -597,49 +667,57 @@ $L$sub4x:
mov QWORD[16+r14*8+rdi],rbx
sbb rax,0
-DB 66h, 48h, 0fh, 6eh, 0c0h
- punpcklqdq xmm0,xmm0
mov QWORD[24+r14*8+rdi],rbp
xor r14,r14
-
- mov r15,r9
- pxor xmm5,xmm5
+ and rsi,rax
+ not rax
+ mov rcx,rdi
+ and rcx,rax
+ lea r15,[((-4))+r9]
+ or rsi,rcx
+ shr r15,2
+
+ movdqu xmm1,XMMWORD[rsi]
+ movdqa XMMWORD[rsp],xmm0
+ movdqu XMMWORD[rdi],xmm1
jmp NEAR $L$copy4x
ALIGN 16
$L$copy4x:
- movdqu xmm2,XMMWORD[r14*1+rsp]
- movdqu xmm4,XMMWORD[16+r14*1+rsp]
- movdqu xmm1,XMMWORD[r14*1+rdi]
- movdqu xmm3,XMMWORD[16+r14*1+rdi]
- pxor xmm2,xmm1
- pxor xmm4,xmm3
- pand xmm2,xmm0
- pand xmm4,xmm0
- pxor xmm2,xmm1
- pxor xmm4,xmm3
- movdqu XMMWORD[r14*1+rdi],xmm2
- movdqu XMMWORD[16+r14*1+rdi],xmm4
- movdqa XMMWORD[r14*1+rsp],xmm5
- movdqa XMMWORD[16+r14*1+rsp],xmm5
-
+ movdqu xmm2,XMMWORD[16+r14*1+rsi]
+ movdqu xmm1,XMMWORD[32+r14*1+rsi]
+ movdqa XMMWORD[16+r14*1+rsp],xmm0
+ movdqu XMMWORD[16+r14*1+rdi],xmm2
+ movdqa XMMWORD[32+r14*1+rsp],xmm0
+ movdqu XMMWORD[32+r14*1+rdi],xmm1
lea r14,[32+r14]
dec r15
jnz NEAR $L$copy4x
- shl r9,2
+ movdqu xmm2,XMMWORD[16+r14*1+rsi]
+ movdqa XMMWORD[16+r14*1+rsp],xmm0
+ movdqu XMMWORD[16+r14*1+rdi],xmm2
mov rsi,QWORD[8+r9*8+rsp]
+
mov rax,1
- mov r15,QWORD[rsi]
- mov r14,QWORD[8+rsi]
- mov r13,QWORD[16+rsi]
- mov r12,QWORD[24+rsi]
- mov rbp,QWORD[32+rsi]
- mov rbx,QWORD[40+rsi]
- lea rsp,[48+rsi]
+ mov r15,QWORD[((-48))+rsi]
+
+ mov r14,QWORD[((-40))+rsi]
+
+ mov r13,QWORD[((-32))+rsi]
+
+ mov r12,QWORD[((-24))+rsi]
+
+ mov rbp,QWORD[((-16))+rsi]
+
+ mov rbx,QWORD[((-8))+rsi]
+
+ lea rsp,[rsi]
+
$L$mul4x_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
+
$L$SEH_end_bn_mul4x_mont:
EXTERN bn_sqr8x_internal
@@ -658,15 +736,24 @@ $L$SEH_begin_bn_sqr8x_mont:
mov r9,QWORD[48+rsp]
-$L$sqr8x_enter:
+
mov rax,rsp
+
+$L$sqr8x_enter:
push rbx
+
push rbp
+
push r12
+
push r13
+
push r14
+
push r15
+$L$sqr8x_prologue:
+
mov r10d,r9d
shl r9d,3
shl r10,3+2
@@ -678,30 +765,49 @@ $L$sqr8x_enter:
lea r11,[((-64))+r9*2+rsp]
+ mov rbp,rsp
mov r8,QWORD[r8]
sub r11,rsi
and r11,4095
cmp r10,r11
jb NEAR $L$sqr8x_sp_alt
- sub rsp,r11
- lea rsp,[((-64))+r9*2+rsp]
+ sub rbp,r11
+ lea rbp,[((-64))+r9*2+rbp]
jmp NEAR $L$sqr8x_sp_done
ALIGN 32
$L$sqr8x_sp_alt:
lea r10,[((4096-64))+r9*2]
- lea rsp,[((-64))+r9*2+rsp]
+ lea rbp,[((-64))+r9*2+rbp]
sub r11,r10
mov r10,0
cmovc r11,r10
- sub rsp,r11
+ sub rbp,r11
$L$sqr8x_sp_done:
- and rsp,-64
+ and rbp,-64
+ mov r11,rsp
+ sub r11,rbp
+ and r11,-4096
+ lea rsp,[rbp*1+r11]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$sqr8x_page_walk
+ jmp NEAR $L$sqr8x_page_walk_done
+
+ALIGN 16
+$L$sqr8x_page_walk:
+ lea rsp,[((-4096))+rsp]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$sqr8x_page_walk
+$L$sqr8x_page_walk_done:
+
mov r10,r9
neg r9
mov QWORD[32+rsp],r8
mov QWORD[40+rsp],rax
+
$L$sqr8x_body:
DB 102,72,15,110,209
@@ -748,6 +854,7 @@ DB 102,72,15,110,200
pxor xmm0,xmm0
pshufd xmm1,xmm1,0
mov rsi,QWORD[40+rsp]
+
jmp NEAR $L$sqr8x_cond_copy
ALIGN 32
@@ -777,16 +884,24 @@ $L$sqr8x_cond_copy:
mov rax,1
mov r15,QWORD[((-48))+rsi]
+
mov r14,QWORD[((-40))+rsi]
+
mov r13,QWORD[((-32))+rsi]
+
mov r12,QWORD[((-24))+rsi]
+
mov rbp,QWORD[((-16))+rsi]
+
mov rbx,QWORD[((-8))+rsi]
+
lea rsp,[rsi]
+
$L$sqr8x_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
+
$L$SEH_end_bn_sqr8x_mont:
DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
DB 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
@@ -829,22 +944,8 @@ mul_handler:
mov r10,QWORD[192+r8]
mov rax,QWORD[8+r10*8+rax]
- lea rax,[48+rax]
- mov rbx,QWORD[((-8))+rax]
- mov rbp,QWORD[((-16))+rax]
- mov r12,QWORD[((-24))+rax]
- mov r13,QWORD[((-32))+rax]
- mov r14,QWORD[((-40))+rax]
- mov r15,QWORD[((-48))+rax]
- mov QWORD[144+r8],rbx
- mov QWORD[160+r8],rbp
- mov QWORD[216+r8],r12
- mov QWORD[224+r8],r13
- mov QWORD[232+r8],r14
- mov QWORD[240+r8],r15
-
- jmp NEAR $L$common_seh_tail
+ jmp NEAR $L$common_pop_regs
@@ -872,15 +973,21 @@ sqr_handler:
cmp rbx,r10
jb NEAR $L$common_seh_tail
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_pop_regs
+
mov rax,QWORD[152+r8]
- mov r10d,DWORD[4+r11]
+ mov r10d,DWORD[8+r11]
lea r10,[r10*1+rsi]
cmp rbx,r10
jae NEAR $L$common_seh_tail
mov rax,QWORD[40+rax]
+$L$common_pop_regs:
mov rbx,QWORD[((-8))+rax]
mov rbp,QWORD[((-16))+rax]
mov r12,QWORD[((-24))+rax]
@@ -960,4 +1067,5 @@ DB 9,0,0,0
$L$SEH_info_bn_sqr8x_mont:
DB 9,0,0,0
DD sqr_handler wrt ..imagebase
- DD $L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase
+ DD $L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase
+ALIGN 8
diff --git a/win-x86_64/crypto/bn/x86_64-mont5.asm b/win-x86_64/crypto/bn/x86_64-mont5.asm
index 58f19ac2..b3306410 100644
--- a/win-x86_64/crypto/bn/x86_64-mont5.asm
+++ b/win-x86_64/crypto/bn/x86_64-mont5.asm
@@ -23,30 +23,64 @@ $L$SEH_begin_bn_mul_mont_gather5:
mov r9,QWORD[48+rsp]
+
+ mov r9d,r9d
+ mov rax,rsp
+
test r9d,7
jnz NEAR $L$mul_enter
jmp NEAR $L$mul4x_enter
ALIGN 16
$L$mul_enter:
- mov r9d,r9d
- mov rax,rsp
movd xmm5,DWORD[56+rsp]
- lea r10,[$L$inc]
push rbx
+
push rbp
+
push r12
+
push r13
+
push r14
+
push r15
- lea r11,[2+r9]
- neg r11
- lea rsp,[((-264))+r11*8+rsp]
- and rsp,-1024
+ neg r9
+ mov r11,rsp
+ lea r10,[((-280))+r9*8+rsp]
+ neg r9
+ and r10,-1024
+
+
+
+
+
+
+
+
+
+ sub r11,r10
+ and r11,-4096
+ lea rsp,[r11*1+r10]
+ mov r11,QWORD[rsp]
+ cmp rsp,r10
+ ja NEAR $L$mul_page_walk
+ jmp NEAR $L$mul_page_walk_done
+
+$L$mul_page_walk:
+ lea rsp,[((-4096))+rsp]
+ mov r11,QWORD[rsp]
+ cmp rsp,r10
+ ja NEAR $L$mul_page_walk
+$L$mul_page_walk_done:
+
+ lea r10,[$L$inc]
mov QWORD[8+r9*8+rsp],rax
+
$L$mul_body:
+
lea r12,[128+rdx]
movdqa xmm0,XMMWORD[r10]
movdqa xmm1,XMMWORD[16+r10]
@@ -385,34 +419,44 @@ $L$sub: sbb rax,QWORD[r14*8+rcx]
sbb rax,0
xor r14,r14
+ and rsi,rax
+ not rax
+ mov rcx,rdi
+ and rcx,rax
mov r15,r9
+ or rsi,rcx
ALIGN 16
$L$copy:
- mov rsi,QWORD[r14*8+rsp]
- mov rcx,QWORD[r14*8+rdi]
- xor rsi,rcx
- and rsi,rax
- xor rsi,rcx
+ mov rax,QWORD[r14*8+rsi]
mov QWORD[r14*8+rsp],r14
- mov QWORD[r14*8+rdi],rsi
+ mov QWORD[r14*8+rdi],rax
lea r14,[1+r14]
sub r15,1
jnz NEAR $L$copy
mov rsi,QWORD[8+r9*8+rsp]
+
mov rax,1
mov r15,QWORD[((-48))+rsi]
+
mov r14,QWORD[((-40))+rsi]
+
mov r13,QWORD[((-32))+rsi]
+
mov r12,QWORD[((-24))+rsi]
+
mov rbp,QWORD[((-16))+rsi]
+
mov rbx,QWORD[((-8))+rsi]
+
lea rsp,[rsi]
+
$L$mul_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
+
$L$SEH_end_bn_mul_mont_gather5:
ALIGN 32
@@ -429,16 +473,25 @@ $L$SEH_begin_bn_mul4x_mont_gather5:
mov r9,QWORD[48+rsp]
-$L$mul4x_enter:
+
DB 0x67
mov rax,rsp
+
+$L$mul4x_enter:
push rbx
+
push rbp
+
push r12
+
push r13
+
push r14
+
push r15
+$L$mul4x_prologue:
+
DB 0x67
shl r9d,3
lea r10,[r9*2+r9]
@@ -454,45 +507,72 @@ DB 0x67
lea r11,[((-320))+r9*2+rsp]
+ mov rbp,rsp
sub r11,rdi
and r11,4095
cmp r10,r11
jb NEAR $L$mul4xsp_alt
- sub rsp,r11
- lea rsp,[((-320))+r9*2+rsp]
+ sub rbp,r11
+ lea rbp,[((-320))+r9*2+rbp]
jmp NEAR $L$mul4xsp_done
ALIGN 32
$L$mul4xsp_alt:
lea r10,[((4096-320))+r9*2]
- lea rsp,[((-320))+r9*2+rsp]
+ lea rbp,[((-320))+r9*2+rbp]
sub r11,r10
mov r10,0
cmovc r11,r10
- sub rsp,r11
+ sub rbp,r11
$L$mul4xsp_done:
- and rsp,-64
+ and rbp,-64
+ mov r11,rsp
+ sub r11,rbp
+ and r11,-4096
+ lea rsp,[rbp*1+r11]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$mul4x_page_walk
+ jmp NEAR $L$mul4x_page_walk_done
+
+$L$mul4x_page_walk:
+ lea rsp,[((-4096))+rsp]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$mul4x_page_walk
+$L$mul4x_page_walk_done:
+
neg r9
mov QWORD[40+rsp],rax
+
$L$mul4x_body:
call mul4x_internal
mov rsi,QWORD[40+rsp]
+
mov rax,1
mov r15,QWORD[((-48))+rsi]
+
mov r14,QWORD[((-40))+rsi]
+
mov r13,QWORD[((-32))+rsi]
+
mov r12,QWORD[((-24))+rsi]
+
mov rbp,QWORD[((-16))+rsi]
+
mov rbx,QWORD[((-8))+rsi]
+
lea rsp,[rsi]
+
$L$mul4x_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
+
$L$SEH_end_bn_mul4x_mont_gather5:
@@ -1036,14 +1116,23 @@ $L$SEH_begin_bn_power5:
mov r9,QWORD[48+rsp]
+
mov rax,rsp
+
push rbx
+
push rbp
+
push r12
+
push r13
+
push r14
+
push r15
+$L$power5_prologue:
+
shl r9d,3
lea r10d,[r9*2+r9]
neg r9
@@ -1057,24 +1146,41 @@ $L$SEH_begin_bn_power5:
lea r11,[((-320))+r9*2+rsp]
+ mov rbp,rsp
sub r11,rdi
and r11,4095
cmp r10,r11
jb NEAR $L$pwr_sp_alt
- sub rsp,r11
- lea rsp,[((-320))+r9*2+rsp]
+ sub rbp,r11
+ lea rbp,[((-320))+r9*2+rbp]
jmp NEAR $L$pwr_sp_done
ALIGN 32
$L$pwr_sp_alt:
lea r10,[((4096-320))+r9*2]
- lea rsp,[((-320))+r9*2+rsp]
+ lea rbp,[((-320))+r9*2+rbp]
sub r11,r10
mov r10,0
cmovc r11,r10
- sub rsp,r11
+ sub rbp,r11
$L$pwr_sp_done:
- and rsp,-64
+ and rbp,-64
+ mov r11,rsp
+ sub r11,rbp
+ and r11,-4096
+ lea rsp,[rbp*1+r11]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$pwr_page_walk
+ jmp NEAR $L$pwr_page_walk_done
+
+$L$pwr_page_walk:
+ lea rsp,[((-4096))+rsp]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$pwr_page_walk
+$L$pwr_page_walk_done:
+
mov r10,r9
neg r9
@@ -1089,6 +1195,7 @@ $L$pwr_sp_done:
mov QWORD[32+rsp],r8
mov QWORD[40+rsp],rax
+
$L$power5_body:
DB 102,72,15,110,207
DB 102,72,15,110,209
@@ -1115,18 +1222,27 @@ DB 102,72,15,126,226
call mul4x_internal
mov rsi,QWORD[40+rsp]
+
mov rax,1
mov r15,QWORD[((-48))+rsi]
+
mov r14,QWORD[((-40))+rsi]
+
mov r13,QWORD[((-32))+rsi]
+
mov r12,QWORD[((-24))+rsi]
+
mov rbp,QWORD[((-16))+rsi]
+
mov rbx,QWORD[((-8))+rsi]
+
lea rsp,[rsi]
+
$L$power5_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
+
$L$SEH_end_bn_power5:
global bn_sqr8x_internal
@@ -1989,15 +2105,24 @@ $L$SEH_begin_bn_from_mont8x:
mov r9,QWORD[48+rsp]
+
DB 0x67
mov rax,rsp
+
push rbx
+
push rbp
+
push r12
+
push r13
+
push r14
+
push r15
+$L$from_prologue:
+
shl r9d,3
lea r10,[r9*2+r9]
neg r9
@@ -2011,24 +2136,41 @@ DB 0x67
lea r11,[((-320))+r9*2+rsp]
+ mov rbp,rsp
sub r11,rdi
and r11,4095
cmp r10,r11
jb NEAR $L$from_sp_alt
- sub rsp,r11
- lea rsp,[((-320))+r9*2+rsp]
+ sub rbp,r11
+ lea rbp,[((-320))+r9*2+rbp]
jmp NEAR $L$from_sp_done
ALIGN 32
$L$from_sp_alt:
lea r10,[((4096-320))+r9*2]
- lea rsp,[((-320))+r9*2+rsp]
+ lea rbp,[((-320))+r9*2+rbp]
sub r11,r10
mov r10,0
cmovc r11,r10
- sub rsp,r11
+ sub rbp,r11
$L$from_sp_done:
- and rsp,-64
+ and rbp,-64
+ mov r11,rsp
+ sub r11,rbp
+ and r11,-4096
+ lea rsp,[rbp*1+r11]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$from_page_walk
+ jmp NEAR $L$from_page_walk_done
+
+$L$from_page_walk:
+ lea rsp,[((-4096))+rsp]
+ mov r10,QWORD[rsp]
+ cmp rsp,rbp
+ ja NEAR $L$from_page_walk
+$L$from_page_walk_done:
+
mov r10,r9
neg r9
@@ -2043,6 +2185,7 @@ $L$from_sp_done:
mov QWORD[32+rsp],r8
mov QWORD[40+rsp],rax
+
$L$from_body:
mov r11,r9
lea rax,[48+rsp]
@@ -2078,11 +2221,12 @@ DB 102,73,15,110,218
pxor xmm0,xmm0
lea rax,[48+rsp]
- mov rsi,QWORD[40+rsp]
jmp NEAR $L$from_mont_zero
ALIGN 32
$L$from_mont_zero:
+ mov rsi,QWORD[40+rsp]
+
movdqa XMMWORD[rax],xmm0
movdqa XMMWORD[16+rax],xmm0
movdqa XMMWORD[32+rax],xmm0
@@ -2093,16 +2237,24 @@ $L$from_mont_zero:
mov rax,1
mov r15,QWORD[((-48))+rsi]
+
mov r14,QWORD[((-40))+rsi]
+
mov r13,QWORD[((-32))+rsi]
+
mov r12,QWORD[((-24))+rsi]
+
mov rbp,QWORD[((-16))+rsi]
+
mov rbx,QWORD[((-8))+rsi]
+
lea rsp,[rsi]
+
$L$from_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
+
$L$SEH_end_bn_from_mont8x:
global bn_scatter5
@@ -2321,9 +2473,14 @@ mul_handler:
cmp rbx,r10
jb NEAR $L$common_seh_tail
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_pop_regs
+
mov rax,QWORD[152+r8]
- mov r10d,DWORD[4+r11]
+ mov r10d,DWORD[8+r11]
lea r10,[r10*1+rsi]
cmp rbx,r10
jae NEAR $L$common_seh_tail
@@ -2335,11 +2492,11 @@ mul_handler:
mov r10,QWORD[192+r8]
mov rax,QWORD[8+r10*8+rax]
- jmp NEAR $L$body_proceed
+ jmp NEAR $L$common_pop_regs
$L$body_40:
mov rax,QWORD[40+rax]
-$L$body_proceed:
+$L$common_pop_regs:
mov rbx,QWORD[((-8))+rax]
mov rbp,QWORD[((-16))+rax]
mov r12,QWORD[((-24))+rax]
@@ -2419,22 +2576,22 @@ ALIGN 8
$L$SEH_info_bn_mul_mont_gather5:
DB 9,0,0,0
DD mul_handler wrt ..imagebase
- DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
+ DD $L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
ALIGN 8
$L$SEH_info_bn_mul4x_mont_gather5:
DB 9,0,0,0
DD mul_handler wrt ..imagebase
- DD $L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
+ DD $L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
ALIGN 8
$L$SEH_info_bn_power5:
DB 9,0,0,0
DD mul_handler wrt ..imagebase
- DD $L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase
+ DD $L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase
ALIGN 8
$L$SEH_info_bn_from_mont8x:
DB 9,0,0,0
DD mul_handler wrt ..imagebase
- DD $L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase
+ DD $L$from_prologue wrt ..imagebase,$L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase
ALIGN 8
$L$SEH_info_bn_gather5:
DB 0x01,0x0b,0x03,0x0a
diff --git a/win-x86_64/crypto/chacha/chacha-x86_64.asm b/win-x86_64/crypto/chacha/chacha-x86_64.asm
index afebd2e0..cb362468 100644
--- a/win-x86_64/crypto/chacha/chacha-x86_64.asm
+++ b/win-x86_64/crypto/chacha/chacha-x86_64.asm
@@ -27,6 +27,15 @@ DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
$L$sigma:
DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
DB 0
+ALIGN 64
+$L$zeroz:
+ DD 0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
+$L$fourz:
+ DD 4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
+$L$incz:
+ DD 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+$L$sixteen:
+ DD 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
@@ -59,6 +68,7 @@ $L$SEH_begin_ChaCha20_ctr32:
push r14
push r15
sub rsp,64+24
+$L$ctr32_body:
movdqu xmm1,XMMWORD[rcx]
@@ -296,13 +306,14 @@ $L$oop_tail:
jnz NEAR $L$oop_tail
$L$done:
- add rsp,64+24
- pop r15
- pop r14
- pop r13
- pop r12
- pop rbp
- pop rbx
+ lea rsi,[((64+24+48))+rsp]
+ mov r15,QWORD[((-48))+rsi]
+ mov r14,QWORD[((-40))+rsi]
+ mov r13,QWORD[((-32))+rsi]
+ mov r12,QWORD[((-24))+rsi]
+ mov rbp,QWORD[((-16))+rsi]
+ mov rbx,QWORD[((-8))+rsi]
+ lea rsp,[rsi]
$L$no_data:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
@@ -323,20 +334,15 @@ $L$SEH_begin_ChaCha20_ssse3:
$L$ChaCha20_ssse3:
+ mov r9,rsp
cmp rdx,128
ja NEAR $L$ChaCha20_4x
$L$do_sse3_after_all:
- push rbx
- push rbp
- push r12
- push r13
- push r14
- push r15
-
- sub rsp,64+72
- movaps XMMWORD[(64+32)+rsp],xmm6
- movaps XMMWORD[(64+48)+rsp],xmm7
+ sub rsp,64+40
+ movaps XMMWORD[(-40)+r9],xmm6
+ movaps XMMWORD[(-24)+r9],xmm7
+$L$ssse3_body:
movdqa xmm0,XMMWORD[$L$sigma]
movdqu xmm1,XMMWORD[rcx]
movdqu xmm2,XMMWORD[16+rcx]
@@ -348,7 +354,7 @@ $L$do_sse3_after_all:
movdqa XMMWORD[16+rsp],xmm1
movdqa XMMWORD[32+rsp],xmm2
movdqa XMMWORD[48+rsp],xmm3
- mov ebp,10
+ mov r8,10
jmp NEAR $L$oop_ssse3
ALIGN 32
@@ -358,7 +364,7 @@ $L$oop_outer_ssse3:
movdqa xmm1,XMMWORD[16+rsp]
movdqa xmm2,XMMWORD[32+rsp]
paddd xmm3,XMMWORD[48+rsp]
- mov ebp,10
+ mov r8,10
movdqa XMMWORD[48+rsp],xmm3
jmp NEAR $L$oop_ssse3
@@ -407,7 +413,7 @@ DB 102,15,56,0,223
pshufd xmm2,xmm2,78
pshufd xmm1,xmm1,147
pshufd xmm3,xmm3,57
- dec ebp
+ dec r8
jnz NEAR $L$oop_ssse3
paddd xmm0,XMMWORD[rsp]
paddd xmm1,XMMWORD[16+rsp]
@@ -444,27 +450,22 @@ $L$tail_ssse3:
movdqa XMMWORD[16+rsp],xmm1
movdqa XMMWORD[32+rsp],xmm2
movdqa XMMWORD[48+rsp],xmm3
- xor rbx,rbx
+ xor r8,r8
$L$oop_tail_ssse3:
- movzx eax,BYTE[rbx*1+rsi]
- movzx ecx,BYTE[rbx*1+rsp]
- lea rbx,[1+rbx]
+ movzx eax,BYTE[r8*1+rsi]
+ movzx ecx,BYTE[r8*1+rsp]
+ lea r8,[1+r8]
xor eax,ecx
- mov BYTE[((-1))+rbx*1+rdi],al
+ mov BYTE[((-1))+r8*1+rdi],al
dec rdx
jnz NEAR $L$oop_tail_ssse3
$L$done_ssse3:
- movaps xmm6,XMMWORD[((64+32))+rsp]
- movaps xmm7,XMMWORD[((64+48))+rsp]
- add rsp,64+72
- pop r15
- pop r14
- pop r13
- pop r12
- pop rbp
- pop rbx
+ movaps xmm6,XMMWORD[((-40))+r9]
+ movaps xmm7,XMMWORD[((-24))+r9]
+ lea rsp,[r9]
+$L$ssse3_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
@@ -484,6 +485,7 @@ $L$SEH_begin_ChaCha20_4x:
$L$ChaCha20_4x:
+ mov r9,rsp
mov r11,r10
shr r10,32
test r10,32
@@ -496,18 +498,18 @@ $L$ChaCha20_4x:
je NEAR $L$do_sse3_after_all
$L$proceed4x:
- lea r11,[((-120))+rsp]
- sub rsp,0x148+160
- movaps XMMWORD[(-48)+r11],xmm6
- movaps XMMWORD[(-32)+r11],xmm7
- movaps XMMWORD[(-16)+r11],xmm8
- movaps XMMWORD[r11],xmm9
- movaps XMMWORD[16+r11],xmm10
- movaps XMMWORD[32+r11],xmm11
- movaps XMMWORD[48+r11],xmm12
- movaps XMMWORD[64+r11],xmm13
- movaps XMMWORD[80+r11],xmm14
- movaps XMMWORD[96+r11],xmm15
+ sub rsp,0x140+168
+ movaps XMMWORD[(-168)+r9],xmm6
+ movaps XMMWORD[(-152)+r9],xmm7
+ movaps XMMWORD[(-136)+r9],xmm8
+ movaps XMMWORD[(-120)+r9],xmm9
+ movaps XMMWORD[(-104)+r9],xmm10
+ movaps XMMWORD[(-88)+r9],xmm11
+ movaps XMMWORD[(-72)+r9],xmm12
+ movaps XMMWORD[(-56)+r9],xmm13
+ movaps XMMWORD[(-40)+r9],xmm14
+ movaps XMMWORD[(-24)+r9],xmm15
+$L$4x_body:
movdqa xmm11,XMMWORD[$L$sigma]
movdqu xmm15,XMMWORD[rcx]
movdqu xmm7,XMMWORD[16+rcx]
@@ -1034,18 +1036,18 @@ $L$oop_tail4x:
jnz NEAR $L$oop_tail4x
$L$done4x:
- lea r11,[((320+48))+rsp]
- movaps xmm6,XMMWORD[((-48))+r11]
- movaps xmm7,XMMWORD[((-32))+r11]
- movaps xmm8,XMMWORD[((-16))+r11]
- movaps xmm9,XMMWORD[r11]
- movaps xmm10,XMMWORD[16+r11]
- movaps xmm11,XMMWORD[32+r11]
- movaps xmm12,XMMWORD[48+r11]
- movaps xmm13,XMMWORD[64+r11]
- movaps xmm14,XMMWORD[80+r11]
- movaps xmm15,XMMWORD[96+r11]
- add rsp,0x148+160
+ movaps xmm6,XMMWORD[((-168))+r9]
+ movaps xmm7,XMMWORD[((-152))+r9]
+ movaps xmm8,XMMWORD[((-136))+r9]
+ movaps xmm9,XMMWORD[((-120))+r9]
+ movaps xmm10,XMMWORD[((-104))+r9]
+ movaps xmm11,XMMWORD[((-88))+r9]
+ movaps xmm12,XMMWORD[((-72))+r9]
+ movaps xmm13,XMMWORD[((-56))+r9]
+ movaps xmm14,XMMWORD[((-40))+r9]
+ movaps xmm15,XMMWORD[((-24))+r9]
+ lea rsp,[r9]
+$L$4x_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
@@ -1065,22 +1067,21 @@ $L$SEH_begin_ChaCha20_8x:
$L$ChaCha20_8x:
- mov r10,rsp
- sub rsp,0x280+176
+ mov r9,rsp
+ sub rsp,0x280+168
and rsp,-32
- lea r11,[((656+48))+rsp]
- movaps XMMWORD[(-48)+r11],xmm6
- movaps XMMWORD[(-32)+r11],xmm7
- movaps XMMWORD[(-16)+r11],xmm8
- movaps XMMWORD[r11],xmm9
- movaps XMMWORD[16+r11],xmm10
- movaps XMMWORD[32+r11],xmm11
- movaps XMMWORD[48+r11],xmm12
- movaps XMMWORD[64+r11],xmm13
- movaps XMMWORD[80+r11],xmm14
- movaps XMMWORD[96+r11],xmm15
+ movaps XMMWORD[(-168)+r9],xmm6
+ movaps XMMWORD[(-152)+r9],xmm7
+ movaps XMMWORD[(-136)+r9],xmm8
+ movaps XMMWORD[(-120)+r9],xmm9
+ movaps XMMWORD[(-104)+r9],xmm10
+ movaps XMMWORD[(-88)+r9],xmm11
+ movaps XMMWORD[(-72)+r9],xmm12
+ movaps XMMWORD[(-56)+r9],xmm13
+ movaps XMMWORD[(-40)+r9],xmm14
+ movaps XMMWORD[(-24)+r9],xmm15
+$L$8x_body:
vzeroupper
- mov QWORD[640+rsp],r10
@@ -1671,19 +1672,220 @@ $L$oop_tail8x:
$L$done8x:
vzeroall
- lea r11,[((656+48))+rsp]
- movaps xmm6,XMMWORD[((-48))+r11]
- movaps xmm7,XMMWORD[((-32))+r11]
- movaps xmm8,XMMWORD[((-16))+r11]
- movaps xmm9,XMMWORD[r11]
- movaps xmm10,XMMWORD[16+r11]
- movaps xmm11,XMMWORD[32+r11]
- movaps xmm12,XMMWORD[48+r11]
- movaps xmm13,XMMWORD[64+r11]
- movaps xmm14,XMMWORD[80+r11]
- movaps xmm15,XMMWORD[96+r11]
- mov rsp,QWORD[640+rsp]
+ movaps xmm6,XMMWORD[((-168))+r9]
+ movaps xmm7,XMMWORD[((-152))+r9]
+ movaps xmm8,XMMWORD[((-136))+r9]
+ movaps xmm9,XMMWORD[((-120))+r9]
+ movaps xmm10,XMMWORD[((-104))+r9]
+ movaps xmm11,XMMWORD[((-88))+r9]
+ movaps xmm12,XMMWORD[((-72))+r9]
+ movaps xmm13,XMMWORD[((-56))+r9]
+ movaps xmm14,XMMWORD[((-40))+r9]
+ movaps xmm15,XMMWORD[((-24))+r9]
+ lea rsp,[r9]
+$L$8x_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
DB 0F3h,0C3h ;repret
$L$SEH_end_ChaCha20_8x:
+EXTERN __imp_RtlVirtualUnwind
+
+ALIGN 16
+se_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ lea r10,[$L$ctr32_body]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov rax,QWORD[152+r8]
+
+ lea r10,[$L$no_data]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ lea rax,[((64+24+48))+rax]
+
+ mov rbx,QWORD[((-8))+rax]
+ mov rbp,QWORD[((-16))+rax]
+ mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r15,QWORD[((-48))+rax]
+ mov QWORD[144+r8],rbx
+ mov QWORD[160+r8],rbp
+ mov QWORD[216+r8],r12
+ mov QWORD[224+r8],r13
+ mov QWORD[232+r8],r14
+ mov QWORD[240+r8],r15
+
+$L$common_seh_tail:
+ mov rdi,QWORD[8+rax]
+ mov rsi,QWORD[16+rax]
+ mov QWORD[152+r8],rax
+ mov QWORD[168+r8],rsi
+ mov QWORD[176+r8],rdi
+
+ mov rdi,QWORD[40+r9]
+ mov rsi,r8
+ mov ecx,154
+ DD 0xa548f3fc
+
+ mov rsi,r9
+ xor rcx,rcx
+ mov rdx,QWORD[8+rsi]
+ mov r8,QWORD[rsi]
+ mov r9,QWORD[16+rsi]
+ mov r10,QWORD[40+rsi]
+ lea r11,[56+rsi]
+ lea r12,[24+rsi]
+ mov QWORD[32+rsp],r10
+ mov QWORD[40+rsp],r11
+ mov QWORD[48+rsp],r12
+ mov QWORD[56+rsp],rcx
+ call QWORD[__imp_RtlVirtualUnwind]
+
+ mov eax,1
+ add rsp,64
+ popfq
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ pop rdi
+ pop rsi
+ DB 0F3h,0C3h ;repret
+
+
+
+ALIGN 16
+ssse3_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov rax,QWORD[192+r8]
+
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ lea rsi,[((-40))+rax]
+ lea rdi,[512+r8]
+ mov ecx,4
+ DD 0xa548f3fc
+
+ jmp NEAR $L$common_seh_tail
+
+
+
+ALIGN 16
+full_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ mov rsi,QWORD[8+r9]
+ mov r11,QWORD[56+r9]
+
+ mov r10d,DWORD[r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jb NEAR $L$common_seh_tail
+
+ mov rax,QWORD[192+r8]
+
+ mov r10d,DWORD[4+r11]
+ lea r10,[r10*1+rsi]
+ cmp rbx,r10
+ jae NEAR $L$common_seh_tail
+
+ lea rsi,[((-168))+rax]
+ lea rdi,[512+r8]
+ mov ecx,20
+ DD 0xa548f3fc
+
+ jmp NEAR $L$common_seh_tail
+
+
+section .pdata rdata align=4
+ALIGN 4
+ DD $L$SEH_begin_ChaCha20_ctr32 wrt ..imagebase
+ DD $L$SEH_end_ChaCha20_ctr32 wrt ..imagebase
+ DD $L$SEH_info_ChaCha20_ctr32 wrt ..imagebase
+
+ DD $L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase
+ DD $L$SEH_end_ChaCha20_ssse3 wrt ..imagebase
+ DD $L$SEH_info_ChaCha20_ssse3 wrt ..imagebase
+
+ DD $L$SEH_begin_ChaCha20_4x wrt ..imagebase
+ DD $L$SEH_end_ChaCha20_4x wrt ..imagebase
+ DD $L$SEH_info_ChaCha20_4x wrt ..imagebase
+ DD $L$SEH_begin_ChaCha20_8x wrt ..imagebase
+ DD $L$SEH_end_ChaCha20_8x wrt ..imagebase
+ DD $L$SEH_info_ChaCha20_8x wrt ..imagebase
+section .xdata rdata align=8
+ALIGN 8
+$L$SEH_info_ChaCha20_ctr32:
+DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+
+$L$SEH_info_ChaCha20_ssse3:
+DB 9,0,0,0
+ DD ssse3_handler wrt ..imagebase
+ DD $L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase
+
+$L$SEH_info_ChaCha20_4x:
+DB 9,0,0,0
+ DD full_handler wrt ..imagebase
+ DD $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase
+$L$SEH_info_ChaCha20_8x:
+DB 9,0,0,0
+ DD full_handler wrt ..imagebase
+ DD $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase
diff --git a/win-x86_64/crypto/modes/ghash-x86_64.asm b/win-x86_64/crypto/modes/ghash-x86_64.asm
index e5204bf8..b01f98c9 100644
--- a/win-x86_64/crypto/modes/ghash-x86_64.asm
+++ b/win-x86_64/crypto/modes/ghash-x86_64.asm
@@ -21,6 +21,10 @@ $L$SEH_begin_gcm_gmult_4bit:
push rbx
push rbp
push r12
+ push r13
+ push r14
+ push r15
+ sub rsp,280
$L$gmult_prologue:
movzx r8,BYTE[15+rdi]
@@ -97,8 +101,9 @@ $L$break1:
mov QWORD[8+rdi],r8
mov QWORD[rdi],r9
- mov rbx,QWORD[16+rsp]
- lea rsp,[24+rsp]
+ lea rsi,[((280+48))+rsp]
+ mov rbx,QWORD[((-8))+rsi]
+ lea rsp,[rsi]
$L$gmult_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
@@ -669,14 +674,14 @@ $L$outer_loop:
mov QWORD[8+rdi],r8
mov QWORD[rdi],r9
- lea rsi,[280+rsp]
- mov r15,QWORD[rsi]
- mov r14,QWORD[8+rsi]
- mov r13,QWORD[16+rsi]
- mov r12,QWORD[24+rsi]
- mov rbp,QWORD[32+rsi]
- mov rbx,QWORD[40+rsi]
- lea rsp,[48+rsi]
+ lea rsi,[((280+48))+rsp]
+ mov r15,QWORD[((-48))+rsi]
+ mov r14,QWORD[((-40))+rsi]
+ mov r13,QWORD[((-32))+rsi]
+ mov r12,QWORD[((-24))+rsi]
+ mov rbp,QWORD[((-16))+rsi]
+ mov rbx,QWORD[((-8))+rsi]
+ lea rsp,[rsi]
$L$ghash_epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
@@ -1916,14 +1921,20 @@ se_handler:
cmp rbx,r10
jae NEAR $L$in_prologue
- lea rax,[24+rax]
+ lea rax,[((48+280))+rax]
mov rbx,QWORD[((-8))+rax]
mov rbp,QWORD[((-16))+rax]
mov r12,QWORD[((-24))+rax]
+ mov r13,QWORD[((-32))+rax]
+ mov r14,QWORD[((-40))+rax]
+ mov r15,QWORD[((-48))+rax]
mov QWORD[144+r8],rbx
mov QWORD[160+r8],rbp
mov QWORD[216+r8],r12
+ mov QWORD[224+r8],r13
+ mov QWORD[232+r8],r14
+ mov QWORD[240+r8],r15
$L$in_prologue:
mov rdi,QWORD[8+rax]
diff --git a/win-x86_64/crypto/sha/sha1-x86_64.asm b/win-x86_64/crypto/sha/sha1-x86_64.asm
index 168f78db..54845743 100644
--- a/win-x86_64/crypto/sha/sha1-x86_64.asm
+++ b/win-x86_64/crypto/sha/sha1-x86_64.asm
@@ -1263,21 +1263,20 @@ $L$SEH_begin_sha1_block_data_order_ssse3:
_ssse3_shortcut:
- mov rax,rsp
+ mov r11,rsp
push rbx
push rbp
push r12
push r13
push r14
lea rsp,[((-160))+rsp]
- movaps XMMWORD[(-40-96)+rax],xmm6
- movaps XMMWORD[(-40-80)+rax],xmm7
- movaps XMMWORD[(-40-64)+rax],xmm8
- movaps XMMWORD[(-40-48)+rax],xmm9
- movaps XMMWORD[(-40-32)+rax],xmm10
- movaps XMMWORD[(-40-16)+rax],xmm11
+ movaps XMMWORD[(-40-96)+r11],xmm6
+ movaps XMMWORD[(-40-80)+r11],xmm7
+ movaps XMMWORD[(-40-64)+r11],xmm8
+ movaps XMMWORD[(-40-48)+r11],xmm9
+ movaps XMMWORD[(-40-32)+r11],xmm10
+ movaps XMMWORD[(-40-16)+r11],xmm11
$L$prologue_ssse3:
- mov r14,rax
and rsp,-64
mov r8,rdi
mov r9,rsi
@@ -1285,7 +1284,7 @@ $L$prologue_ssse3:
shl r10,6
add r10,r9
- lea r11,[((K_XX_XX+64))]
+ lea r14,[((K_XX_XX+64))]
mov eax,DWORD[r8]
mov ebx,DWORD[4+r8]
@@ -1297,8 +1296,8 @@ $L$prologue_ssse3:
xor edi,edx
and esi,edi
- movdqa xmm6,XMMWORD[64+r11]
- movdqa xmm9,XMMWORD[((-64))+r11]
+ movdqa xmm6,XMMWORD[64+r14]
+ movdqa xmm9,XMMWORD[((-64))+r14]
movdqu xmm0,XMMWORD[r9]
movdqu xmm1,XMMWORD[16+r9]
movdqu xmm2,XMMWORD[32+r9]
@@ -1374,7 +1373,7 @@ $L$oop_ssse3:
pslld xmm9,2
pxor xmm4,xmm10
xor edx,ebp
- movdqa xmm10,XMMWORD[((-64))+r11]
+ movdqa xmm10,XMMWORD[((-64))+r14]
rol ecx,5
add ebx,edi
and esi,edx
@@ -1435,7 +1434,7 @@ $L$oop_ssse3:
pslld xmm10,2
pxor xmm5,xmm8
xor ebp,eax
- movdqa xmm8,XMMWORD[((-32))+r11]
+ movdqa xmm8,XMMWORD[((-32))+r14]
rol edx,5
add ecx,edi
and esi,ebp
@@ -1496,7 +1495,7 @@ $L$oop_ssse3:
pslld xmm8,2
pxor xmm6,xmm9
xor eax,ebx
- movdqa xmm9,XMMWORD[((-32))+r11]
+ movdqa xmm9,XMMWORD[((-32))+r14]
rol ebp,5
add edx,edi
and esi,eax
@@ -1557,7 +1556,7 @@ $L$oop_ssse3:
pslld xmm9,2
pxor xmm7,xmm10
xor ebx,ecx
- movdqa xmm10,XMMWORD[((-32))+r11]
+ movdqa xmm10,XMMWORD[((-32))+r14]
rol eax,5
add ebp,edi
and esi,ebx
@@ -1668,7 +1667,7 @@ $L$oop_ssse3:
pxor xmm2,xmm3
add eax,esi
xor edi,edx
- movdqa xmm10,XMMWORD[r11]
+ movdqa xmm10,XMMWORD[r14]
ror ecx,7
paddd xmm9,xmm1
add eax,ebx
@@ -1903,7 +1902,7 @@ $L$oop_ssse3:
pxor xmm7,xmm0
rol ebx,5
add eax,esi
- movdqa xmm9,XMMWORD[32+r11]
+ movdqa xmm9,XMMWORD[32+r14]
xor edi,ecx
paddd xmm8,xmm6
xor ecx,edx
@@ -2194,8 +2193,8 @@ $L$oop_ssse3:
add ecx,edx
cmp r9,r10
je NEAR $L$done_ssse3
- movdqa xmm6,XMMWORD[64+r11]
- movdqa xmm9,XMMWORD[((-64))+r11]
+ movdqa xmm6,XMMWORD[64+r14]
+ movdqa xmm9,XMMWORD[((-64))+r14]
movdqu xmm0,XMMWORD[r9]
movdqu xmm1,XMMWORD[16+r9]
movdqu xmm2,XMMWORD[32+r9]
@@ -2432,19 +2431,18 @@ $L$done_ssse3:
mov DWORD[8+r8],ecx
mov DWORD[12+r8],edx
mov DWORD[16+r8],ebp
- movaps xmm6,XMMWORD[((-40-96))+r14]
- movaps xmm7,XMMWORD[((-40-80))+r14]
- movaps xmm8,XMMWORD[((-40-64))+r14]
- movaps xmm9,XMMWORD[((-40-48))+r14]
- movaps xmm10,XMMWORD[((-40-32))+r14]
- movaps xmm11,XMMWORD[((-40-16))+r14]
- lea rsi,[r14]
- mov r14,QWORD[((-40))+rsi]
- mov r13,QWORD[((-32))+rsi]
- mov r12,QWORD[((-24))+rsi]
- mov rbp,QWORD[((-16))+rsi]
- mov rbx,QWORD[((-8))+rsi]
- lea rsp,[rsi]
+ movaps xmm6,XMMWORD[((-40-96))+r11]
+ movaps xmm7,XMMWORD[((-40-80))+r11]
+ movaps xmm8,XMMWORD[((-40-64))+r11]
+ movaps xmm9,XMMWORD[((-40-48))+r11]
+ movaps xmm10,XMMWORD[((-40-32))+r11]
+ movaps xmm11,XMMWORD[((-40-16))+r11]
+ mov r14,QWORD[((-40))+r11]
+ mov r13,QWORD[((-32))+r11]
+ mov r12,QWORD[((-24))+r11]
+ mov rbp,QWORD[((-16))+r11]
+ mov rbx,QWORD[((-8))+r11]
+ lea rsp,[r11]
$L$epilogue_ssse3:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
@@ -2463,7 +2461,7 @@ $L$SEH_begin_sha1_block_data_order_avx:
_avx_shortcut:
- mov rax,rsp
+ mov r11,rsp
push rbx
push rbp
push r12
@@ -2471,14 +2469,13 @@ _avx_shortcut:
push r14
lea rsp,[((-160))+rsp]
vzeroupper
- vmovaps XMMWORD[(-40-96)+rax],xmm6
- vmovaps XMMWORD[(-40-80)+rax],xmm7
- vmovaps XMMWORD[(-40-64)+rax],xmm8
- vmovaps XMMWORD[(-40-48)+rax],xmm9
- vmovaps XMMWORD[(-40-32)+rax],xmm10
- vmovaps XMMWORD[(-40-16)+rax],xmm11
+ vmovaps XMMWORD[(-40-96)+r11],xmm6
+ vmovaps XMMWORD[(-40-80)+r11],xmm7
+ vmovaps XMMWORD[(-40-64)+r11],xmm8
+ vmovaps XMMWORD[(-40-48)+r11],xmm9
+ vmovaps XMMWORD[(-40-32)+r11],xmm10
+ vmovaps XMMWORD[(-40-16)+r11],xmm11
$L$prologue_avx:
- mov r14,rax
and rsp,-64
mov r8,rdi
mov r9,rsi
@@ -2486,7 +2483,7 @@ $L$prologue_avx:
shl r10,6
add r10,r9
- lea r11,[((K_XX_XX+64))]
+ lea r14,[((K_XX_XX+64))]
mov eax,DWORD[r8]
mov ebx,DWORD[4+r8]
@@ -2498,8 +2495,8 @@ $L$prologue_avx:
xor edi,edx
and esi,edi
- vmovdqa xmm6,XMMWORD[64+r11]
- vmovdqa xmm11,XMMWORD[((-64))+r11]
+ vmovdqa xmm6,XMMWORD[64+r14]
+ vmovdqa xmm11,XMMWORD[((-64))+r14]
vmovdqu xmm0,XMMWORD[r9]
vmovdqu xmm1,XMMWORD[16+r9]
vmovdqu xmm2,XMMWORD[32+r9]
@@ -2624,7 +2621,7 @@ $L$oop_avx:
vpxor xmm5,xmm5,xmm10
xor ebp,eax
shld edx,edx,5
- vmovdqa xmm11,XMMWORD[((-32))+r11]
+ vmovdqa xmm11,XMMWORD[((-32))+r14]
add ecx,edi
and esi,ebp
xor ebp,eax
@@ -2837,7 +2834,7 @@ $L$oop_avx:
add eax,esi
xor edi,edx
vpaddd xmm9,xmm11,xmm1
- vmovdqa xmm11,XMMWORD[r11]
+ vmovdqa xmm11,XMMWORD[r14]
shrd ecx,ecx,7
add eax,ebx
vpxor xmm2,xmm2,xmm8
@@ -3056,7 +3053,7 @@ $L$oop_avx:
mov edi,ebx
xor esi,edx
vpaddd xmm9,xmm11,xmm6
- vmovdqa xmm11,XMMWORD[32+r11]
+ vmovdqa xmm11,XMMWORD[32+r14]
shld ebx,ebx,5
add eax,esi
vpxor xmm7,xmm7,xmm8
@@ -3335,8 +3332,8 @@ $L$oop_avx:
add ecx,edx
cmp r9,r10
je NEAR $L$done_avx
- vmovdqa xmm6,XMMWORD[64+r11]
- vmovdqa xmm11,XMMWORD[((-64))+r11]
+ vmovdqa xmm6,XMMWORD[64+r14]
+ vmovdqa xmm11,XMMWORD[((-64))+r14]
vmovdqu xmm0,XMMWORD[r9]
vmovdqu xmm1,XMMWORD[16+r9]
vmovdqu xmm2,XMMWORD[32+r9]
@@ -3572,19 +3569,18 @@ $L$done_avx:
mov DWORD[8+r8],ecx
mov DWORD[12+r8],edx
mov DWORD[16+r8],ebp
- movaps xmm6,XMMWORD[((-40-96))+r14]
- movaps xmm7,XMMWORD[((-40-80))+r14]
- movaps xmm8,XMMWORD[((-40-64))+r14]
- movaps xmm9,XMMWORD[((-40-48))+r14]
- movaps xmm10,XMMWORD[((-40-32))+r14]
- movaps xmm11,XMMWORD[((-40-16))+r14]
- lea rsi,[r14]
- mov r14,QWORD[((-40))+rsi]
- mov r13,QWORD[((-32))+rsi]
- mov r12,QWORD[((-24))+rsi]
- mov rbp,QWORD[((-16))+rsi]
- mov rbx,QWORD[((-8))+rsi]
- lea rsp,[rsi]
+ movaps xmm6,XMMWORD[((-40-96))+r11]
+ movaps xmm7,XMMWORD[((-40-80))+r11]
+ movaps xmm8,XMMWORD[((-40-64))+r11]
+ movaps xmm9,XMMWORD[((-40-48))+r11]
+ movaps xmm10,XMMWORD[((-40-32))+r11]
+ movaps xmm11,XMMWORD[((-40-16))+r11]
+ mov r14,QWORD[((-40))+r11]
+ mov r13,QWORD[((-32))+r11]
+ mov r12,QWORD[((-24))+r11]
+ mov rbp,QWORD[((-16))+r11]
+ mov rbx,QWORD[((-8))+r11]
+ lea rsp,[r11]
$L$epilogue_avx:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
@@ -3677,15 +3673,13 @@ ssse3_handler:
cmp rbx,r10
jb NEAR $L$common_seh_tail
- mov rax,QWORD[152+r8]
+ mov rax,QWORD[208+r8]
mov r10d,DWORD[4+r11]
lea r10,[r10*1+rsi]
cmp rbx,r10
jae NEAR $L$common_seh_tail
- mov rax,QWORD[232+r8]
-
lea rsi,[((-40-96))+rax]
lea rdi,[512+r8]
mov ecx,12
diff --git a/win-x86_64/crypto/sha/sha256-x86_64.asm b/win-x86_64/crypto/sha/sha256-x86_64.asm
index efaf9b55..6e3d1541 100644
--- a/win-x86_64/crypto/sha/sha256-x86_64.asm
+++ b/win-x86_64/crypto/sha/sha256-x86_64.asm
@@ -30,13 +30,13 @@ $L$SEH_begin_sha256_block_data_order:
je NEAR $L$avx_shortcut
test r10d,512
jnz NEAR $L$ssse3_shortcut
+ mov rax,rsp
push rbx
push rbp
push r12
push r13
push r14
push r15
- mov r11,rsp
shl rdx,4
sub rsp,16*4+4*8
lea rdx,[rdx*4+rsi]
@@ -44,7 +44,7 @@ $L$SEH_begin_sha256_block_data_order:
mov QWORD[((64+0))+rsp],rdi
mov QWORD[((64+8))+rsp],rsi
mov QWORD[((64+16))+rsp],rdx
- mov QWORD[((64+24))+rsp],r11
+ mov QWORD[((64+24))+rsp],rax
$L$prologue:
mov eax,DWORD[rdi]
@@ -1709,13 +1709,13 @@ $L$rounds_16_xx:
jb NEAR $L$loop
mov rsi,QWORD[((64+24))+rsp]
- mov r15,QWORD[rsi]
- mov r14,QWORD[8+rsi]
- mov r13,QWORD[16+rsi]
- mov r12,QWORD[24+rsi]
- mov rbp,QWORD[32+rsi]
- mov rbx,QWORD[40+rsi]
- lea rsp,[48+rsi]
+ mov r15,QWORD[((-48))+rsi]
+ mov r14,QWORD[((-40))+rsi]
+ mov r13,QWORD[((-32))+rsi]
+ mov r12,QWORD[((-24))+rsi]
+ mov rbp,QWORD[((-16))+rsi]
+ mov rbx,QWORD[((-8))+rsi]
+ lea rsp,[rsi]
$L$epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
@@ -1781,13 +1781,13 @@ $L$SEH_begin_sha256_block_data_order_ssse3:
$L$ssse3_shortcut:
+ mov rax,rsp
push rbx
push rbp
push r12
push r13
push r14
push r15
- mov r11,rsp
shl rdx,4
sub rsp,160
lea rdx,[rdx*4+rsi]
@@ -1795,7 +1795,7 @@ $L$ssse3_shortcut:
mov QWORD[((64+0))+rsp],rdi
mov QWORD[((64+8))+rsp],rsi
mov QWORD[((64+16))+rsp],rdx
- mov QWORD[((64+24))+rsp],r11
+ mov QWORD[((64+24))+rsp],rax
movaps XMMWORD[(64+32)+rsp],xmm6
movaps XMMWORD[(64+48)+rsp],xmm7
movaps XMMWORD[(64+64)+rsp],xmm8
@@ -2870,13 +2870,13 @@ DB 102,15,58,15,249,4
movaps xmm7,XMMWORD[((64+48))+rsp]
movaps xmm8,XMMWORD[((64+64))+rsp]
movaps xmm9,XMMWORD[((64+80))+rsp]
- mov r15,QWORD[rsi]
- mov r14,QWORD[8+rsi]
- mov r13,QWORD[16+rsi]
- mov r12,QWORD[24+rsi]
- mov rbp,QWORD[32+rsi]
- mov rbx,QWORD[40+rsi]
- lea rsp,[48+rsi]
+ mov r15,QWORD[((-48))+rsi]
+ mov r14,QWORD[((-40))+rsi]
+ mov r13,QWORD[((-32))+rsi]
+ mov r12,QWORD[((-24))+rsi]
+ mov rbp,QWORD[((-16))+rsi]
+ mov rbx,QWORD[((-8))+rsi]
+ lea rsp,[rsi]
$L$epilogue_ssse3:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
@@ -2895,13 +2895,13 @@ $L$SEH_begin_sha256_block_data_order_avx:
$L$avx_shortcut:
+ mov rax,rsp
push rbx
push rbp
push r12
push r13
push r14
push r15
- mov r11,rsp
shl rdx,4
sub rsp,160
lea rdx,[rdx*4+rsi]
@@ -2909,7 +2909,7 @@ $L$avx_shortcut:
mov QWORD[((64+0))+rsp],rdi
mov QWORD[((64+8))+rsp],rsi
mov QWORD[((64+16))+rsp],rdx
- mov QWORD[((64+24))+rsp],r11
+ mov QWORD[((64+24))+rsp],rax
movaps XMMWORD[(64+32)+rsp],xmm6
movaps XMMWORD[(64+48)+rsp],xmm7
movaps XMMWORD[(64+64)+rsp],xmm8
@@ -3946,13 +3946,13 @@ $L$avx_00_47:
movaps xmm7,XMMWORD[((64+48))+rsp]
movaps xmm8,XMMWORD[((64+64))+rsp]
movaps xmm9,XMMWORD[((64+80))+rsp]
- mov r15,QWORD[rsi]
- mov r14,QWORD[8+rsi]
- mov r13,QWORD[16+rsi]
- mov r12,QWORD[24+rsi]
- mov rbp,QWORD[32+rsi]
- mov rbx,QWORD[40+rsi]
- lea rsp,[48+rsi]
+ mov r15,QWORD[((-48))+rsi]
+ mov r14,QWORD[((-40))+rsi]
+ mov r13,QWORD[((-32))+rsi]
+ mov r12,QWORD[((-24))+rsi]
+ mov rbp,QWORD[((-16))+rsi]
+ mov rbx,QWORD[((-8))+rsi]
+ lea rsp,[rsi]
$L$epilogue_avx:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
@@ -3992,7 +3992,6 @@ se_handler:
jae NEAR $L$in_prologue
mov rsi,rax
mov rax,QWORD[((64+24))+rax]
- lea rax,[48+rax]
mov rbx,QWORD[((-8))+rax]
mov rbp,QWORD[((-16))+rax]
diff --git a/win-x86_64/crypto/sha/sha512-x86_64.asm b/win-x86_64/crypto/sha/sha512-x86_64.asm
index 71449cd2..d0d7a43f 100644
--- a/win-x86_64/crypto/sha/sha512-x86_64.asm
+++ b/win-x86_64/crypto/sha/sha512-x86_64.asm
@@ -30,13 +30,13 @@ $L$SEH_begin_sha512_block_data_order:
or r10d,r9d
cmp r10d,1342177792
je NEAR $L$avx_shortcut
+ mov rax,rsp
push rbx
push rbp
push r12
push r13
push r14
push r15
- mov r11,rsp
shl rdx,4
sub rsp,16*8+4*8
lea rdx,[rdx*8+rsi]
@@ -44,7 +44,7 @@ $L$SEH_begin_sha512_block_data_order:
mov QWORD[((128+0))+rsp],rdi
mov QWORD[((128+8))+rsp],rsi
mov QWORD[((128+16))+rsp],rdx
- mov QWORD[((128+24))+rsp],r11
+ mov QWORD[((128+24))+rsp],rax
$L$prologue:
mov rax,QWORD[rdi]
@@ -1709,13 +1709,13 @@ $L$rounds_16_xx:
jb NEAR $L$loop
mov rsi,QWORD[((128+24))+rsp]
- mov r15,QWORD[rsi]
- mov r14,QWORD[8+rsi]
- mov r13,QWORD[16+rsi]
- mov r12,QWORD[24+rsi]
- mov rbp,QWORD[32+rsi]
- mov rbx,QWORD[40+rsi]
- lea rsp,[48+rsi]
+ mov r15,QWORD[((-48))+rsi]
+ mov r14,QWORD[((-40))+rsi]
+ mov r13,QWORD[((-32))+rsi]
+ mov r12,QWORD[((-24))+rsi]
+ mov rbp,QWORD[((-16))+rsi]
+ mov rbx,QWORD[((-8))+rsi]
+ lea rsp,[rsi]
$L$epilogue:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
@@ -1825,13 +1825,13 @@ $L$SEH_begin_sha512_block_data_order_xop:
$L$xop_shortcut:
+ mov rax,rsp
push rbx
push rbp
push r12
push r13
push r14
push r15
- mov r11,rsp
shl rdx,4
sub rsp,256
lea rdx,[rdx*8+rsi]
@@ -1839,7 +1839,7 @@ $L$xop_shortcut:
mov QWORD[((128+0))+rsp],rdi
mov QWORD[((128+8))+rsp],rsi
mov QWORD[((128+16))+rsp],rdx
- mov QWORD[((128+24))+rsp],r11
+ mov QWORD[((128+24))+rsp],rax
movaps XMMWORD[(128+32)+rsp],xmm6
movaps XMMWORD[(128+48)+rsp],xmm7
movaps XMMWORD[(128+64)+rsp],xmm8
@@ -2906,13 +2906,13 @@ DB 143,72,120,195,203,42
movaps xmm9,XMMWORD[((128+80))+rsp]
movaps xmm10,XMMWORD[((128+96))+rsp]
movaps xmm11,XMMWORD[((128+112))+rsp]
- mov r15,QWORD[rsi]
- mov r14,QWORD[8+rsi]
- mov r13,QWORD[16+rsi]
- mov r12,QWORD[24+rsi]
- mov rbp,QWORD[32+rsi]
- mov rbx,QWORD[40+rsi]
- lea rsp,[48+rsi]
+ mov r15,QWORD[((-48))+rsi]
+ mov r14,QWORD[((-40))+rsi]
+ mov r13,QWORD[((-32))+rsi]
+ mov r12,QWORD[((-24))+rsi]
+ mov rbp,QWORD[((-16))+rsi]
+ mov rbx,QWORD[((-8))+rsi]
+ lea rsp,[rsi]
$L$epilogue_xop:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
@@ -2931,13 +2931,13 @@ $L$SEH_begin_sha512_block_data_order_avx:
$L$avx_shortcut:
+ mov rax,rsp
push rbx
push rbp
push r12
push r13
push r14
push r15
- mov r11,rsp
shl rdx,4
sub rsp,256
lea rdx,[rdx*8+rsi]
@@ -2945,7 +2945,7 @@ $L$avx_shortcut:
mov QWORD[((128+0))+rsp],rdi
mov QWORD[((128+8))+rsp],rsi
mov QWORD[((128+16))+rsp],rdx
- mov QWORD[((128+24))+rsp],r11
+ mov QWORD[((128+24))+rsp],rax
movaps XMMWORD[(128+32)+rsp],xmm6
movaps XMMWORD[(128+48)+rsp],xmm7
movaps XMMWORD[(128+64)+rsp],xmm8
@@ -4076,13 +4076,13 @@ $L$avx_00_47:
movaps xmm9,XMMWORD[((128+80))+rsp]
movaps xmm10,XMMWORD[((128+96))+rsp]
movaps xmm11,XMMWORD[((128+112))+rsp]
- mov r15,QWORD[rsi]
- mov r14,QWORD[8+rsi]
- mov r13,QWORD[16+rsi]
- mov r12,QWORD[24+rsi]
- mov rbp,QWORD[32+rsi]
- mov rbx,QWORD[40+rsi]
- lea rsp,[48+rsi]
+ mov r15,QWORD[((-48))+rsi]
+ mov r14,QWORD[((-40))+rsi]
+ mov r13,QWORD[((-32))+rsi]
+ mov r12,QWORD[((-24))+rsi]
+ mov rbp,QWORD[((-16))+rsi]
+ mov rbx,QWORD[((-8))+rsi]
+ lea rsp,[rsi]
$L$epilogue_avx:
mov rdi,QWORD[8+rsp] ;WIN64 epilogue
mov rsi,QWORD[16+rsp]
@@ -4122,7 +4122,6 @@ se_handler:
jae NEAR $L$in_prologue
mov rsi,rax
mov rax,QWORD[((128+24))+rax]
- lea rax,[48+rax]
mov rbx,QWORD[((-8))+rax]
mov rbp,QWORD[((-16))+rax]