author     Adam Vartanian <flooey@google.com>   2018-08-10 14:55:24 +0100
committer  Adam Vartanian <flooey@google.com>   2018-08-10 15:32:39 +0100
commit     bfcf3a72c0bcb62cfde80e932db0668a7f96c0f8 (patch)
tree       3ab110046e45c25f8f69df137f8a9d4a52bf60b5 /linux-x86_64
parent     6120b2b9dfbe5b19fe2d0f69a4b8418ad5f7fe08 (diff)
download   boringssl-bfcf3a72c0bcb62cfde80e932db0668a7f96c0f8.tar.gz
external/boringssl: Sync to 8625ec4b436ccb4098ed4aac10891eff8372be41.
This includes the following changes:
https://boringssl.googlesource.com/boringssl/+log/c596415ec62b501523d80f9afa26b135406da6bf..8625ec4b436ccb4098ed4aac10891eff8372be41

Test: cts -m CtsLibcoreTestCases
Change-Id: I47a45e6b6f46b19fcbcb6c917895867d56dcd2ca
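
The bulk of this sync renames the AES assembly entry points: the table-based
fallback moves from asm_AES_* to aes_nohw_*, and the AES-NI routines from
aesni_* to aes_hw_*. Below is a minimal sketch of what the renamed symbols
look like from C, assuming the prototypes in BoringSSL's
crypto/fipsmodule/aes/internal.h at this revision (the authoritative
declarations live there). Both families share one ABI, so callers select an
implementation by CPU capability rather than by signature:

    #include <stdint.h>
    #include <openssl/aes.h>

    /* Sketch only; see crypto/fipsmodule/aes/internal.h for the real
     * declarations. Formerly asm_AES_encrypt / asm_AES_decrypt. */
    void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
    void aes_nohw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);

    /* AES-NI implementations, formerly aesni_encrypt / aesni_decrypt. */
    void aes_hw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
    void aes_hw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);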
Diffstat (limited to 'linux-x86_64')
-rw-r--r--  linux-x86_64/crypto/fipsmodule/aes-x86_64.S      |   56
-rw-r--r--  linux-x86_64/crypto/fipsmodule/aesni-x86_64.S    |  130
-rw-r--r--  linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S    |   34
-rw-r--r--  linux-x86_64/crypto/fipsmodule/ghash-x86_64.S    |   41
-rw-r--r--  linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S | 1914
-rw-r--r--  linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S   |    4
-rw-r--r--  linux-x86_64/crypto/fipsmodule/rsaz-avx2.S       |    5
-rw-r--r--  linux-x86_64/crypto/fipsmodule/sha1-x86_64.S     |   44
-rw-r--r--  linux-x86_64/crypto/fipsmodule/sha256-x86_64.S   |   66
-rw-r--r--  linux-x86_64/crypto/fipsmodule/sha512-x86_64.S   |   66
-rw-r--r--  linux-x86_64/crypto/fipsmodule/x86_64-mont.S     |  454
-rw-r--r--  linux-x86_64/crypto/fipsmodule/x86_64-mont5.S    | 1380
12 files changed, 4028 insertions, 166 deletions
diff --git a/linux-x86_64/crypto/fipsmodule/aes-x86_64.S b/linux-x86_64/crypto/fipsmodule/aes-x86_64.S
index 0928c170..65bcbf48 100644
--- a/linux-x86_64/crypto/fipsmodule/aes-x86_64.S
+++ b/linux-x86_64/crypto/fipsmodule/aes-x86_64.S
@@ -327,11 +327,11 @@ _x86_64_AES_encrypt_compact:
.byte 0xf3,0xc3
.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
.align 16
-.globl asm_AES_encrypt
-.hidden asm_AES_encrypt
-.type asm_AES_encrypt,@function
-.hidden asm_AES_encrypt
-asm_AES_encrypt:
+.globl aes_nohw_encrypt
+.hidden aes_nohw_encrypt
+.type aes_nohw_encrypt,@function
+.hidden aes_nohw_encrypt
+aes_nohw_encrypt:
.cfi_startproc
movq %rsp,%rax
.cfi_def_cfa_register %rax
@@ -409,7 +409,7 @@ asm_AES_encrypt:
.Lenc_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
-.size asm_AES_encrypt,.-asm_AES_encrypt
+.size aes_nohw_encrypt,.-aes_nohw_encrypt
.type _x86_64_AES_decrypt,@function
.align 16
_x86_64_AES_decrypt:
@@ -791,11 +791,11 @@ _x86_64_AES_decrypt_compact:
.byte 0xf3,0xc3
.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
.align 16
-.globl asm_AES_decrypt
-.hidden asm_AES_decrypt
-.type asm_AES_decrypt,@function
-.hidden asm_AES_decrypt
-asm_AES_decrypt:
+.globl aes_nohw_decrypt
+.hidden aes_nohw_decrypt
+.type aes_nohw_decrypt,@function
+.hidden aes_nohw_decrypt
+aes_nohw_decrypt:
.cfi_startproc
movq %rsp,%rax
.cfi_def_cfa_register %rax
@@ -875,12 +875,12 @@ asm_AES_decrypt:
.Ldec_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
-.size asm_AES_decrypt,.-asm_AES_decrypt
+.size aes_nohw_decrypt,.-aes_nohw_decrypt
.align 16
-.globl asm_AES_set_encrypt_key
-.hidden asm_AES_set_encrypt_key
-.type asm_AES_set_encrypt_key,@function
-asm_AES_set_encrypt_key:
+.globl aes_nohw_set_encrypt_key
+.hidden aes_nohw_set_encrypt_key
+.type aes_nohw_set_encrypt_key,@function
+aes_nohw_set_encrypt_key:
.cfi_startproc
pushq %rbx
.cfi_adjust_cfa_offset 8
@@ -915,7 +915,7 @@ asm_AES_set_encrypt_key:
.Lenc_key_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
-.size asm_AES_set_encrypt_key,.-asm_AES_set_encrypt_key
+.size aes_nohw_set_encrypt_key,.-aes_nohw_set_encrypt_key
.type _x86_64_AES_set_encrypt_key,@function
.align 16
@@ -1157,10 +1157,10 @@ _x86_64_AES_set_encrypt_key:
.byte 0xf3,0xc3
.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
.align 16
-.globl asm_AES_set_decrypt_key
-.hidden asm_AES_set_decrypt_key
-.type asm_AES_set_decrypt_key,@function
-asm_AES_set_decrypt_key:
+.globl aes_nohw_set_decrypt_key
+.hidden aes_nohw_set_decrypt_key
+.type aes_nohw_set_decrypt_key,@function
+aes_nohw_set_decrypt_key:
.cfi_startproc
pushq %rbx
.cfi_adjust_cfa_offset 8
@@ -1365,15 +1365,15 @@ asm_AES_set_decrypt_key:
.Ldec_key_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
-.size asm_AES_set_decrypt_key,.-asm_AES_set_decrypt_key
+.size aes_nohw_set_decrypt_key,.-aes_nohw_set_decrypt_key
.align 16
-.globl asm_AES_cbc_encrypt
-.hidden asm_AES_cbc_encrypt
-.type asm_AES_cbc_encrypt,@function
+.globl aes_nohw_cbc_encrypt
+.hidden aes_nohw_cbc_encrypt
+.type aes_nohw_cbc_encrypt,@function
.extern OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P
-.hidden asm_AES_cbc_encrypt
-asm_AES_cbc_encrypt:
+.hidden aes_nohw_cbc_encrypt
+aes_nohw_cbc_encrypt:
.cfi_startproc
cmpq $0,%rdx
je .Lcbc_epilogue
@@ -1850,7 +1850,7 @@ asm_AES_cbc_encrypt:
.Lcbc_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
-.size asm_AES_cbc_encrypt,.-asm_AES_cbc_encrypt
+.size aes_nohw_cbc_encrypt,.-aes_nohw_cbc_encrypt
.align 64
.LAES_Te:
.long 0xa56363c6,0xa56363c6
diff --git a/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S b/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S
index 53d594ec..fabf92be 100644
--- a/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S
+++ b/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S
@@ -2,11 +2,11 @@
.text
.extern OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P
-.globl aesni_encrypt
-.hidden aesni_encrypt
-.type aesni_encrypt,@function
+.globl aes_hw_encrypt
+.hidden aes_hw_encrypt
+.type aes_hw_encrypt,@function
.align 16
-aesni_encrypt:
+aes_hw_encrypt:
movups (%rdi),%xmm2
movl 240(%rdx),%eax
movups (%rdx),%xmm0
@@ -25,13 +25,13 @@ aesni_encrypt:
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
.byte 0xf3,0xc3
-.size aesni_encrypt,.-aesni_encrypt
+.size aes_hw_encrypt,.-aes_hw_encrypt
-.globl aesni_decrypt
-.hidden aesni_decrypt
-.type aesni_decrypt,@function
+.globl aes_hw_decrypt
+.hidden aes_hw_decrypt
+.type aes_hw_decrypt,@function
.align 16
-aesni_decrypt:
+aes_hw_decrypt:
movups (%rdi),%xmm2
movl 240(%rdx),%eax
movups (%rdx),%xmm0
@@ -50,7 +50,7 @@ aesni_decrypt:
movups %xmm2,(%rsi)
pxor %xmm2,%xmm2
.byte 0xf3,0xc3
-.size aesni_decrypt, .-aesni_decrypt
+.size aes_hw_decrypt, .-aes_hw_decrypt
.type _aesni_encrypt2,@function
.align 16
_aesni_encrypt2:
@@ -493,11 +493,11 @@ _aesni_decrypt8:
.byte 102,68,15,56,223,200
.byte 0xf3,0xc3
.size _aesni_decrypt8,.-_aesni_decrypt8
-.globl aesni_ecb_encrypt
-.hidden aesni_ecb_encrypt
-.type aesni_ecb_encrypt,@function
+.globl aes_hw_ecb_encrypt
+.hidden aes_hw_ecb_encrypt
+.type aes_hw_ecb_encrypt,@function
.align 16
-aesni_ecb_encrypt:
+aes_hw_ecb_encrypt:
andq $-16,%rdx
jz .Lecb_ret
@@ -835,12 +835,12 @@ aesni_ecb_encrypt:
xorps %xmm0,%xmm0
pxor %xmm1,%xmm1
.byte 0xf3,0xc3
-.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
-.globl aesni_ccm64_encrypt_blocks
-.hidden aesni_ccm64_encrypt_blocks
-.type aesni_ccm64_encrypt_blocks,@function
+.size aes_hw_ecb_encrypt,.-aes_hw_ecb_encrypt
+.globl aes_hw_ccm64_encrypt_blocks
+.hidden aes_hw_ccm64_encrypt_blocks
+.type aes_hw_ccm64_encrypt_blocks,@function
.align 16
-aesni_ccm64_encrypt_blocks:
+aes_hw_ccm64_encrypt_blocks:
movl 240(%rcx),%eax
movdqu (%r8),%xmm6
movdqa .Lincrement64(%rip),%xmm9
@@ -899,12 +899,12 @@ aesni_ccm64_encrypt_blocks:
pxor %xmm8,%xmm8
pxor %xmm6,%xmm6
.byte 0xf3,0xc3
-.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
-.globl aesni_ccm64_decrypt_blocks
-.hidden aesni_ccm64_decrypt_blocks
-.type aesni_ccm64_decrypt_blocks,@function
+.size aes_hw_ccm64_encrypt_blocks,.-aes_hw_ccm64_encrypt_blocks
+.globl aes_hw_ccm64_decrypt_blocks
+.hidden aes_hw_ccm64_decrypt_blocks
+.type aes_hw_ccm64_decrypt_blocks,@function
.align 16
-aesni_ccm64_decrypt_blocks:
+aes_hw_ccm64_decrypt_blocks:
movl 240(%rcx),%eax
movups (%r8),%xmm6
movdqu (%r9),%xmm3
@@ -997,12 +997,12 @@ aesni_ccm64_decrypt_blocks:
pxor %xmm8,%xmm8
pxor %xmm6,%xmm6
.byte 0xf3,0xc3
-.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
-.globl aesni_ctr32_encrypt_blocks
-.hidden aesni_ctr32_encrypt_blocks
-.type aesni_ctr32_encrypt_blocks,@function
+.size aes_hw_ccm64_decrypt_blocks,.-aes_hw_ccm64_decrypt_blocks
+.globl aes_hw_ctr32_encrypt_blocks
+.hidden aes_hw_ctr32_encrypt_blocks
+.type aes_hw_ctr32_encrypt_blocks,@function
.align 16
-aesni_ctr32_encrypt_blocks:
+aes_hw_ctr32_encrypt_blocks:
.cfi_startproc
cmpq $1,%rdx
jne .Lctr32_bulk
@@ -1577,12 +1577,12 @@ aesni_ctr32_encrypt_blocks:
.Lctr32_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
-.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
-.globl aesni_xts_encrypt
-.hidden aesni_xts_encrypt
-.type aesni_xts_encrypt,@function
+.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
+.globl aes_hw_xts_encrypt
+.hidden aes_hw_xts_encrypt
+.type aes_hw_xts_encrypt,@function
.align 16
-aesni_xts_encrypt:
+aes_hw_xts_encrypt:
.cfi_startproc
leaq (%rsp),%r11
.cfi_def_cfa_register %r11
@@ -2048,12 +2048,12 @@ aesni_xts_encrypt:
.Lxts_enc_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
-.size aesni_xts_encrypt,.-aesni_xts_encrypt
-.globl aesni_xts_decrypt
-.hidden aesni_xts_decrypt
-.type aesni_xts_decrypt,@function
+.size aes_hw_xts_encrypt,.-aes_hw_xts_encrypt
+.globl aes_hw_xts_decrypt
+.hidden aes_hw_xts_decrypt
+.type aes_hw_xts_decrypt,@function
.align 16
-aesni_xts_decrypt:
+aes_hw_xts_decrypt:
.cfi_startproc
leaq (%rsp),%r11
.cfi_def_cfa_register %r11
@@ -2556,12 +2556,12 @@ aesni_xts_decrypt:
.Lxts_dec_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
-.size aesni_xts_decrypt,.-aesni_xts_decrypt
-.globl aesni_ocb_encrypt
-.hidden aesni_ocb_encrypt
-.type aesni_ocb_encrypt,@function
+.size aes_hw_xts_decrypt,.-aes_hw_xts_decrypt
+.globl aes_hw_ocb_encrypt
+.hidden aes_hw_ocb_encrypt
+.type aes_hw_ocb_encrypt,@function
.align 32
-aesni_ocb_encrypt:
+aes_hw_ocb_encrypt:
.cfi_startproc
leaq (%rsp),%rax
pushq %rbx
@@ -2771,7 +2771,7 @@ aesni_ocb_encrypt:
.Locb_enc_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
-.size aesni_ocb_encrypt,.-aesni_ocb_encrypt
+.size aes_hw_ocb_encrypt,.-aes_hw_ocb_encrypt
.type __ocb_encrypt6,@function
.align 32
@@ -2979,11 +2979,11 @@ __ocb_encrypt1:
.byte 0xf3,0xc3
.size __ocb_encrypt1,.-__ocb_encrypt1
-.globl aesni_ocb_decrypt
-.hidden aesni_ocb_decrypt
-.type aesni_ocb_decrypt,@function
+.globl aes_hw_ocb_decrypt
+.hidden aes_hw_ocb_decrypt
+.type aes_hw_ocb_decrypt,@function
.align 32
-aesni_ocb_decrypt:
+aes_hw_ocb_decrypt:
.cfi_startproc
leaq (%rsp),%rax
pushq %rbx
@@ -3215,7 +3215,7 @@ aesni_ocb_decrypt:
.Locb_dec_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
-.size aesni_ocb_decrypt,.-aesni_ocb_decrypt
+.size aes_hw_ocb_decrypt,.-aes_hw_ocb_decrypt
.type __ocb_decrypt6,@function
.align 32
@@ -3411,11 +3411,11 @@ __ocb_decrypt1:
.byte 102,15,56,223,215
.byte 0xf3,0xc3
.size __ocb_decrypt1,.-__ocb_decrypt1
-.globl aesni_cbc_encrypt
-.hidden aesni_cbc_encrypt
-.type aesni_cbc_encrypt,@function
+.globl aes_hw_cbc_encrypt
+.hidden aes_hw_cbc_encrypt
+.type aes_hw_cbc_encrypt,@function
.align 16
-aesni_cbc_encrypt:
+aes_hw_cbc_encrypt:
.cfi_startproc
testq %rdx,%rdx
jz .Lcbc_ret
@@ -4003,12 +4003,12 @@ aesni_cbc_encrypt:
.Lcbc_ret:
.byte 0xf3,0xc3
.cfi_endproc
-.size aesni_cbc_encrypt,.-aesni_cbc_encrypt
-.globl aesni_set_decrypt_key
-.hidden aesni_set_decrypt_key
-.type aesni_set_decrypt_key,@function
+.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
+.globl aes_hw_set_decrypt_key
+.hidden aes_hw_set_decrypt_key
+.type aes_hw_set_decrypt_key,@function
.align 16
-aesni_set_decrypt_key:
+aes_hw_set_decrypt_key:
.cfi_startproc
.byte 0x48,0x83,0xEC,0x08
.cfi_adjust_cfa_offset 8
@@ -4048,12 +4048,12 @@ aesni_set_decrypt_key:
.byte 0xf3,0xc3
.cfi_endproc
.LSEH_end_set_decrypt_key:
-.size aesni_set_decrypt_key,.-aesni_set_decrypt_key
-.globl aesni_set_encrypt_key
-.hidden aesni_set_encrypt_key
-.type aesni_set_encrypt_key,@function
+.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
+.globl aes_hw_set_encrypt_key
+.hidden aes_hw_set_encrypt_key
+.type aes_hw_set_encrypt_key,@function
.align 16
-aesni_set_encrypt_key:
+aes_hw_set_encrypt_key:
__aesni_set_encrypt_key:
.cfi_startproc
.byte 0x48,0x83,0xEC,0x08
@@ -4424,7 +4424,7 @@ __aesni_set_encrypt_key:
shufps $170,%xmm1,%xmm1
xorps %xmm1,%xmm2
.byte 0xf3,0xc3
-.size aesni_set_encrypt_key,.-aesni_set_encrypt_key
+.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
.align 64
.Lbswap_mask:
diff --git a/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S
index 7c293efe..eb5c8cbf 100644
--- a/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S
+++ b/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S
@@ -1,10 +1,10 @@
#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text
-.extern asm_AES_encrypt
-.hidden asm_AES_encrypt
-.extern asm_AES_decrypt
-.hidden asm_AES_decrypt
+.extern aes_nohw_encrypt
+.hidden aes_nohw_encrypt
+.extern aes_nohw_decrypt
+.hidden aes_nohw_decrypt
.type _bsaes_encrypt8,@function
.align 64
@@ -1065,8 +1065,8 @@ _bsaes_key_convert:
.byte 0xf3,0xc3
.size _bsaes_key_convert,.-_bsaes_key_convert
-.extern asm_AES_cbc_encrypt
-.hidden asm_AES_cbc_encrypt
+.extern aes_nohw_cbc_encrypt
+.hidden aes_nohw_cbc_encrypt
.globl bsaes_cbc_encrypt
.hidden bsaes_cbc_encrypt
.type bsaes_cbc_encrypt,@function
@@ -1074,9 +1074,9 @@ _bsaes_key_convert:
bsaes_cbc_encrypt:
.cfi_startproc
cmpl $0,%r9d
- jne asm_AES_cbc_encrypt
+ jne aes_nohw_cbc_encrypt
cmpq $128,%rdx
- jb asm_AES_cbc_encrypt
+ jb aes_nohw_cbc_encrypt
movq %rsp,%rax
.Lcbc_dec_prologue:
@@ -1304,7 +1304,7 @@ bsaes_cbc_encrypt:
leaq (%r12),%rdi
leaq 32(%rbp),%rsi
leaq (%r15),%rdx
- call asm_AES_decrypt
+ call aes_nohw_decrypt
pxor 32(%rbp),%xmm14
movdqu %xmm14,(%r13)
movdqa %xmm15,%xmm14
@@ -1520,7 +1520,7 @@ bsaes_ctr32_encrypt_blocks:
leaq 32(%rbp),%rdi
leaq 48(%rbp),%rsi
leaq (%r15),%rdx
- call asm_AES_encrypt
+ call aes_nohw_encrypt
movdqu (%r12),%xmm0
leaq 16(%r12),%r12
movl 44(%rbp),%eax
@@ -1602,7 +1602,7 @@ bsaes_xts_encrypt:
leaq (%r9),%rdi
leaq 32(%rbp),%rsi
leaq (%r8),%rdx
- call asm_AES_encrypt
+ call aes_nohw_encrypt
movl 240(%r15),%eax
movq %r14,%rbx
@@ -1972,7 +1972,7 @@ bsaes_xts_encrypt:
leaq 32(%rbp),%rdi
leaq 32(%rbp),%rsi
leaq (%r15),%rdx
- call asm_AES_encrypt
+ call aes_nohw_encrypt
pxor 32(%rbp),%xmm15
@@ -2005,7 +2005,7 @@ bsaes_xts_encrypt:
leaq 32(%rbp),%rsi
movdqa %xmm15,32(%rbp)
leaq (%r15),%rdx
- call asm_AES_encrypt
+ call aes_nohw_encrypt
pxor 32(%rbp),%xmm6
movdqu %xmm6,-16(%r13)
@@ -2077,7 +2077,7 @@ bsaes_xts_decrypt:
leaq (%r9),%rdi
leaq 32(%rbp),%rsi
leaq (%r8),%rdx
- call asm_AES_encrypt
+ call aes_nohw_encrypt
movl 240(%r15),%eax
movq %r14,%rbx
@@ -2454,7 +2454,7 @@ bsaes_xts_decrypt:
leaq 32(%rbp),%rdi
leaq 32(%rbp),%rsi
leaq (%r15),%rdx
- call asm_AES_decrypt
+ call aes_nohw_decrypt
pxor 32(%rbp),%xmm15
@@ -2485,7 +2485,7 @@ bsaes_xts_decrypt:
leaq 32(%rbp),%rsi
movdqa %xmm15,32(%rbp)
leaq (%r15),%rdx
- call asm_AES_decrypt
+ call aes_nohw_decrypt
pxor 32(%rbp),%xmm6
movq %r13,%rdx
movdqu %xmm6,(%r13)
@@ -2506,7 +2506,7 @@ bsaes_xts_decrypt:
leaq 32(%rbp),%rsi
movdqa %xmm15,32(%rbp)
leaq (%r15),%rdx
- call asm_AES_decrypt
+ call aes_nohw_decrypt
pxor 32(%rbp),%xmm5
movdqu %xmm5,(%r13)
diff --git a/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S b/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S
index 64ef2c2d..574d55a5 100644
--- a/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S
+++ b/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S
@@ -8,13 +8,27 @@
.type gcm_gmult_4bit,@function
.align 16
gcm_gmult_4bit:
+.cfi_startproc
pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
subq $280,%rsp
+.cfi_adjust_cfa_offset 280
.Lgmult_prologue:
movzbq 15(%rdi),%r8
@@ -92,23 +106,41 @@ gcm_gmult_4bit:
movq %r9,(%rdi)
leaq 280+48(%rsp),%rsi
+.cfi_def_cfa %rsi,8
movq -8(%rsi),%rbx
+.cfi_restore %rbx
leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lgmult_epilogue:
.byte 0xf3,0xc3
+.cfi_endproc
.size gcm_gmult_4bit,.-gcm_gmult_4bit
.globl gcm_ghash_4bit
.hidden gcm_ghash_4bit
.type gcm_ghash_4bit,@function
.align 16
gcm_ghash_4bit:
+.cfi_startproc
pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
subq $280,%rsp
+.cfi_adjust_cfa_offset 280
.Lghash_prologue:
movq %rdx,%r14
movq %rcx,%r15
@@ -654,15 +686,24 @@ gcm_ghash_4bit:
movq %r9,(%rdi)
leaq 280+48(%rsp),%rsi
+.cfi_def_cfa %rsi,8
movq -48(%rsi),%r15
+.cfi_restore %r15
movq -40(%rsi),%r14
+.cfi_restore %r14
movq -32(%rsi),%r13
+.cfi_restore %r13
movq -24(%rsi),%r12
+.cfi_restore %r12
movq -16(%rsi),%rbp
+.cfi_restore %rbp
movq -8(%rsi),%rbx
+.cfi_restore %rbx
leaq 0(%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lghash_epilogue:
.byte 0xf3,0xc3
+.cfi_endproc
.size gcm_ghash_4bit,.-gcm_ghash_4bit
.globl gcm_init_clmul
.hidden gcm_init_clmul
diff --git a/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S b/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S
index 7422d2ac..46a94838 100644
--- a/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S
+++ b/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S
@@ -93,6 +93,11 @@ ecp_nistz256_neg:
.align 32
ecp_nistz256_ord_mul_mont:
.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je .Lecp_nistz256_ord_mul_montx
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -422,6 +427,11 @@ ecp_nistz256_ord_mul_mont:
.align 32
ecp_nistz256_ord_sqr_mont:
.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je .Lecp_nistz256_ord_sqr_montx
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -709,6 +719,462 @@ ecp_nistz256_ord_sqr_mont:
.cfi_endproc
.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+.type ecp_nistz256_ord_mul_montx,@function
+.align 32
+ecp_nistz256_ord_mul_montx:
+.cfi_startproc
+.Lecp_nistz256_ord_mul_montx:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lord_mulx_body:
+
+ movq %rdx,%rbx
+ movq 0(%rdx),%rdx
+ movq 0(%rsi),%r9
+ movq 8(%rsi),%r10
+ movq 16(%rsi),%r11
+ movq 24(%rsi),%r12
+ leaq -128(%rsi),%rsi
+ leaq .Lord-128(%rip),%r14
+ movq .LordK(%rip),%r15
+
+
+ mulxq %r9,%r8,%r9
+ mulxq %r10,%rcx,%r10
+ mulxq %r11,%rbp,%r11
+ addq %rcx,%r9
+ mulxq %r12,%rcx,%r12
+ movq %r8,%rdx
+ mulxq %r15,%rdx,%rax
+ adcq %rbp,%r10
+ adcq %rcx,%r11
+ adcq $0,%r12
+
+
+ xorq %r13,%r13
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 8(%rbx),%rdx
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+ adcxq %r8,%r12
+ adoxq %r8,%r13
+ adcq $0,%r13
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r9,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ adcxq %r8,%r13
+ adoxq %r8,%r8
+ adcq $0,%r8
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 16(%rbx),%rdx
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcxq %r9,%r13
+ adoxq %r9,%r8
+ adcq $0,%r8
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r10,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ adcxq %r9,%r8
+ adoxq %r9,%r9
+ adcq $0,%r9
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 24(%rbx),%rdx
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+ adcxq %r10,%r8
+ adoxq %r10,%r9
+ adcq $0,%r9
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r11,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+
+ adcxq %r10,%r9
+ adoxq %r10,%r10
+ adcq $0,%r10
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ leaq 128(%r14),%r14
+ movq %r12,%rbx
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ movq %r13,%rdx
+ adcxq %r11,%r9
+ adoxq %r11,%r10
+ adcq $0,%r10
+
+
+
+ movq %r8,%rcx
+ subq 0(%r14),%r12
+ sbbq 8(%r14),%r13
+ sbbq 16(%r14),%r8
+ movq %r9,%rbp
+ sbbq 24(%r14),%r9
+ sbbq $0,%r10
+
+ cmovcq %rbx,%r12
+ cmovcq %rdx,%r13
+ cmovcq %rcx,%r8
+ cmovcq %rbp,%r9
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_mulx_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
+
+.type ecp_nistz256_ord_sqr_montx,@function
+.align 32
+ecp_nistz256_ord_sqr_montx:
+.cfi_startproc
+.Lecp_nistz256_ord_sqr_montx:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lord_sqrx_body:
+
+ movq %rdx,%rbx
+ movq 0(%rsi),%rdx
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r15
+ movq 24(%rsi),%r8
+ leaq .Lord(%rip),%rsi
+ jmp .Loop_ord_sqrx
+
+.align 32
+.Loop_ord_sqrx:
+ mulxq %r14,%r9,%r10
+ mulxq %r15,%rcx,%r11
+ movq %rdx,%rax
+.byte 102,73,15,110,206
+ mulxq %r8,%rbp,%r12
+ movq %r14,%rdx
+ addq %rcx,%r10
+.byte 102,73,15,110,215
+ adcq %rbp,%r11
+ adcq $0,%r12
+ xorq %r13,%r13
+
+ mulxq %r15,%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq %r8,%rcx,%rbp
+ movq %r15,%rdx
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcq $0,%r13
+
+ mulxq %r8,%rcx,%r14
+ movq %rax,%rdx
+.byte 102,73,15,110,216
+ xorq %r15,%r15
+ adcxq %r9,%r9
+ adoxq %rcx,%r13
+ adcxq %r10,%r10
+ adoxq %r15,%r14
+
+
+ mulxq %rdx,%r8,%rbp
+.byte 102,72,15,126,202
+ adcxq %r11,%r11
+ adoxq %rbp,%r9
+ adcxq %r12,%r12
+ mulxq %rdx,%rcx,%rax
+.byte 102,72,15,126,210
+ adcxq %r13,%r13
+ adoxq %rcx,%r10
+ adcxq %r14,%r14
+ mulxq %rdx,%rcx,%rbp
+.byte 0x67
+.byte 102,72,15,126,218
+ adoxq %rax,%r11
+ adcxq %r15,%r15
+ adoxq %rcx,%r12
+ adoxq %rbp,%r13
+ mulxq %rdx,%rcx,%rax
+ adoxq %rcx,%r14
+ adoxq %rax,%r15
+
+
+ movq %r8,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ xorq %rax,%rax
+ mulxq 0(%rsi),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ mulxq 8(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+ mulxq 16(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+ mulxq 24(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r8
+ adcxq %rax,%r8
+
+
+ movq %r9,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adoxq %rcx,%r9
+ adcxq %rbp,%r10
+ mulxq 8(%rsi),%rcx,%rbp
+ adoxq %rcx,%r10
+ adcxq %rbp,%r11
+ mulxq 16(%rsi),%rcx,%rbp
+ adoxq %rcx,%r11
+ adcxq %rbp,%r8
+ mulxq 24(%rsi),%rcx,%rbp
+ adoxq %rcx,%r8
+ adcxq %rbp,%r9
+ adoxq %rax,%r9
+
+
+ movq %r10,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+ mulxq 8(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r8
+ mulxq 16(%rsi),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ mulxq 24(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+ adcxq %rax,%r10
+
+
+ movq %r11,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adoxq %rcx,%r11
+ adcxq %rbp,%r8
+ mulxq 8(%rsi),%rcx,%rbp
+ adoxq %rcx,%r8
+ adcxq %rbp,%r9
+ mulxq 16(%rsi),%rcx,%rbp
+ adoxq %rcx,%r9
+ adcxq %rbp,%r10
+ mulxq 24(%rsi),%rcx,%rbp
+ adoxq %rcx,%r10
+ adcxq %rbp,%r11
+ adoxq %rax,%r11
+
+
+ addq %r8,%r12
+ adcq %r13,%r9
+ movq %r12,%rdx
+ adcq %r14,%r10
+ adcq %r15,%r11
+ movq %r9,%r14
+ adcq $0,%rax
+
+
+ subq 0(%rsi),%r12
+ movq %r10,%r15
+ sbbq 8(%rsi),%r9
+ sbbq 16(%rsi),%r10
+ movq %r11,%r8
+ sbbq 24(%rsi),%r11
+ sbbq $0,%rax
+
+ cmovncq %r12,%rdx
+ cmovncq %r9,%r14
+ cmovncq %r10,%r15
+ cmovncq %r11,%r8
+
+ decq %rbx
+ jnz .Loop_ord_sqrx
+
+ movq %rdx,0(%rdi)
+ movq %r14,8(%rdi)
+ pxor %xmm1,%xmm1
+ movq %r15,16(%rdi)
+ pxor %xmm2,%xmm2
+ movq %r8,24(%rdi)
+ pxor %xmm3,%xmm3
+
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_sqrx_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
+
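
Each of the new *_montx bodies above is reached through the capability test
added at the top of the corresponding entry point: load the quadword at byte
offset 8 of OPENSSL_ia32cap_P, mask its low half with 0x80100, and branch to
the MULX/ADX path only if both bits survive. 0x80100 is CPUID.(EAX=7).EBX
bit 8 (BMI2, providing mulx/shlx/shrx) plus bit 19 (ADX, providing
adcx/adox). A small C rendering of the predicate, for illustration only (the
helper name is invented):

    #include <stdint.h>

    /* Filled in by BoringSSL's CPUID probe at startup. */
    extern uint32_t OPENSSL_ia32cap_P[4];

    /* Mirrors "andl $0x80100,%ecx; cmpl $0x80100,%ecx; je .L..._montx". */
    static int have_bmi2_and_adx(void) {
      const uint32_t kBMI2 = 1u << 8;   /* CPUID.(EAX=7).EBX, bit 8  */
      const uint32_t kADX  = 1u << 19;  /* CPUID.(EAX=7).EBX, bit 19 */
      return (OPENSSL_ia32cap_P[2] & (kBMI2 | kADX)) == (kBMI2 | kADX);
    }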
@@ -720,6 +1186,9 @@ ecp_nistz256_ord_sqr_mont:
.align 32
ecp_nistz256_mul_mont:
.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
.Lmul_mont:
pushq %rbp
.cfi_adjust_cfa_offset 8
@@ -740,6 +1209,8 @@ ecp_nistz256_mul_mont:
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lmul_body:
+ cmpl $0x80100,%ecx
+ je .Lmul_montx
movq %rdx,%rbx
movq 0(%rdx),%rax
movq 0(%rsi),%r9
@@ -748,6 +1219,19 @@ ecp_nistz256_mul_mont:
movq 24(%rsi),%r12
call __ecp_nistz256_mul_montq
+ jmp .Lmul_mont_done
+
+.align 32
+.Lmul_montx:
+ movq %rdx,%rbx
+ movq 0(%rdx),%rdx
+ movq 0(%rsi),%r9
+ movq 8(%rsi),%r10
+ movq 16(%rsi),%r11
+ movq 24(%rsi),%r12
+ leaq -128(%rsi),%rsi
+
+ call __ecp_nistz256_mul_montx
.Lmul_mont_done:
movq 0(%rsp),%r15
.cfi_restore %r15
@@ -997,6 +1481,9 @@ __ecp_nistz256_mul_montq:
.align 32
ecp_nistz256_sqr_mont:
.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -1016,12 +1503,25 @@ ecp_nistz256_sqr_mont:
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lsqr_body:
+ cmpl $0x80100,%ecx
+ je .Lsqr_montx
movq 0(%rsi),%rax
movq 8(%rsi),%r14
movq 16(%rsi),%r15
movq 24(%rsi),%r8
call __ecp_nistz256_sqr_montq
+ jmp .Lsqr_mont_done
+
+.align 32
+.Lsqr_montx:
+ movq 0(%rsi),%rdx
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r15
+ movq 24(%rsi),%r8
+ leaq -128(%rsi),%rsi
+
+ call __ecp_nistz256_sqr_montx
.Lsqr_mont_done:
movq 0(%rsp),%r15
.cfi_restore %r15
@@ -1203,6 +1703,300 @@ __ecp_nistz256_sqr_montq:
.byte 0xf3,0xc3
.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
+.type __ecp_nistz256_mul_montx,@function
+.align 32
+__ecp_nistz256_mul_montx:
+
+
+ mulxq %r9,%r8,%r9
+ mulxq %r10,%rcx,%r10
+ movq $32,%r14
+ xorq %r13,%r13
+ mulxq %r11,%rbp,%r11
+ movq .Lpoly+24(%rip),%r15
+ adcq %rcx,%r9
+ mulxq %r12,%rcx,%r12
+ movq %r8,%rdx
+ adcq %rbp,%r10
+ shlxq %r14,%r8,%rbp
+ adcq %rcx,%r11
+ shrxq %r14,%r8,%rcx
+ adcq $0,%r12
+
+
+
+ addq %rbp,%r9
+ adcq %rcx,%r10
+
+ mulxq %r15,%rcx,%rbp
+ movq 8(%rbx),%rdx
+ adcq %rcx,%r11
+ adcq %rbp,%r12
+ adcq $0,%r13
+ xorq %r8,%r8
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r9,%rdx
+ adcxq %rcx,%r12
+ shlxq %r14,%r9,%rcx
+ adoxq %rbp,%r13
+ shrxq %r14,%r9,%rbp
+
+ adcxq %r8,%r13
+ adoxq %r8,%r8
+ adcq $0,%r8
+
+
+
+ addq %rcx,%r10
+ adcq %rbp,%r11
+
+ mulxq %r15,%rcx,%rbp
+ movq 16(%rbx),%rdx
+ adcq %rcx,%r12
+ adcq %rbp,%r13
+ adcq $0,%r8
+ xorq %r9,%r9
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r10,%rdx
+ adcxq %rcx,%r13
+ shlxq %r14,%r10,%rcx
+ adoxq %rbp,%r8
+ shrxq %r14,%r10,%rbp
+
+ adcxq %r9,%r8
+ adoxq %r9,%r9
+ adcq $0,%r9
+
+
+
+ addq %rcx,%r11
+ adcq %rbp,%r12
+
+ mulxq %r15,%rcx,%rbp
+ movq 24(%rbx),%rdx
+ adcq %rcx,%r13
+ adcq %rbp,%r8
+ adcq $0,%r9
+ xorq %r10,%r10
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r11,%rdx
+ adcxq %rcx,%r8
+ shlxq %r14,%r11,%rcx
+ adoxq %rbp,%r9
+ shrxq %r14,%r11,%rbp
+
+ adcxq %r10,%r9
+ adoxq %r10,%r10
+ adcq $0,%r10
+
+
+
+ addq %rcx,%r12
+ adcq %rbp,%r13
+
+ mulxq %r15,%rcx,%rbp
+ movq %r12,%rbx
+ movq .Lpoly+8(%rip),%r14
+ adcq %rcx,%r8
+ movq %r13,%rdx
+ adcq %rbp,%r9
+ adcq $0,%r10
+
+
+
+ xorl %eax,%eax
+ movq %r8,%rcx
+ sbbq $-1,%r12
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%rbp
+ sbbq %r15,%r9
+ sbbq $0,%r10
+
+ cmovcq %rbx,%r12
+ cmovcq %rdx,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %rbp,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3
+.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
+
+.type __ecp_nistz256_sqr_montx,@function
+.align 32
+__ecp_nistz256_sqr_montx:
+ mulxq %r14,%r9,%r10
+ mulxq %r15,%rcx,%r11
+ xorl %eax,%eax
+ adcq %rcx,%r10
+ mulxq %r8,%rbp,%r12
+ movq %r14,%rdx
+ adcq %rbp,%r11
+ adcq $0,%r12
+ xorq %r13,%r13
+
+
+ mulxq %r15,%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq %r8,%rcx,%rbp
+ movq %r15,%rdx
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcq $0,%r13
+
+
+ mulxq %r8,%rcx,%r14
+ movq 0+128(%rsi),%rdx
+ xorq %r15,%r15
+ adcxq %r9,%r9
+ adoxq %rcx,%r13
+ adcxq %r10,%r10
+ adoxq %r15,%r14
+
+ mulxq %rdx,%r8,%rbp
+ movq 8+128(%rsi),%rdx
+ adcxq %r11,%r11
+ adoxq %rbp,%r9
+ adcxq %r12,%r12
+ mulxq %rdx,%rcx,%rax
+ movq 16+128(%rsi),%rdx
+ adcxq %r13,%r13
+ adoxq %rcx,%r10
+ adcxq %r14,%r14
+.byte 0x67
+ mulxq %rdx,%rcx,%rbp
+ movq 24+128(%rsi),%rdx
+ adoxq %rax,%r11
+ adcxq %r15,%r15
+ adoxq %rcx,%r12
+ movq $32,%rsi
+ adoxq %rbp,%r13
+.byte 0x67,0x67
+ mulxq %rdx,%rcx,%rax
+ movq .Lpoly+24(%rip),%rdx
+ adoxq %rcx,%r14
+ shlxq %rsi,%r8,%rcx
+ adoxq %rax,%r15
+ shrxq %rsi,%r8,%rax
+ movq %rdx,%rbp
+
+
+ addq %rcx,%r9
+ adcq %rax,%r10
+
+ mulxq %r8,%rcx,%r8
+ adcq %rcx,%r11
+ shlxq %rsi,%r9,%rcx
+ adcq $0,%r8
+ shrxq %rsi,%r9,%rax
+
+
+ addq %rcx,%r10
+ adcq %rax,%r11
+
+ mulxq %r9,%rcx,%r9
+ adcq %rcx,%r8
+ shlxq %rsi,%r10,%rcx
+ adcq $0,%r9
+ shrxq %rsi,%r10,%rax
+
+
+ addq %rcx,%r11
+ adcq %rax,%r8
+
+ mulxq %r10,%rcx,%r10
+ adcq %rcx,%r9
+ shlxq %rsi,%r11,%rcx
+ adcq $0,%r10
+ shrxq %rsi,%r11,%rax
+
+
+ addq %rcx,%r8
+ adcq %rax,%r9
+
+ mulxq %r11,%rcx,%r11
+ adcq %rcx,%r10
+ adcq $0,%r11
+
+ xorq %rdx,%rdx
+ addq %r8,%r12
+ movq .Lpoly+8(%rip),%rsi
+ adcq %r9,%r13
+ movq %r12,%r8
+ adcq %r10,%r14
+ adcq %r11,%r15
+ movq %r13,%r9
+ adcq $0,%rdx
+
+ subq $-1,%r12
+ movq %r14,%r10
+ sbbq %rsi,%r13
+ sbbq $0,%r14
+ movq %r15,%r11
+ sbbq %rbp,%r15
+ sbbq $0,%rdx
+
+ cmovcq %r8,%r12
+ cmovcq %r9,%r13
+ movq %r12,0(%rdi)
+ cmovcq %r10,%r14
+ movq %r13,8(%rdi)
+ cmovcq %r11,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+
+ .byte 0xf3,0xc3
+.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
.globl ecp_nistz256_select_w5
@@ -1588,6 +2382,11 @@ __ecp_nistz256_mul_by_2q:
.align 32
ecp_nistz256_point_double:
.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je .Lpoint_doublex
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -1817,6 +2616,11 @@ ecp_nistz256_point_double:
.align 32
ecp_nistz256_point_add:
.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je .Lpoint_addx
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -2240,6 +3044,11 @@ ecp_nistz256_point_add:
.align 32
ecp_nistz256_point_add_affine:
.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je .Lpoint_add_affinex
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -2563,4 +3372,1109 @@ ecp_nistz256_point_add_affine:
.byte 0xf3,0xc3
.cfi_endproc
.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
+.type __ecp_nistz256_add_tox,@function
+.align 32
+__ecp_nistz256_add_tox:
+ xorq %r11,%r11
+ adcq 0(%rbx),%r12
+ adcq 8(%rbx),%r13
+ movq %r12,%rax
+ adcq 16(%rbx),%r8
+ adcq 24(%rbx),%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ xorq %r10,%r10
+ sbbq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3
+.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
+
+.type __ecp_nistz256_sub_fromx,@function
+.align 32
+__ecp_nistz256_sub_fromx:
+ xorq %r11,%r11
+ sbbq 0(%rbx),%r12
+ sbbq 8(%rbx),%r13
+ movq %r12,%rax
+ sbbq 16(%rbx),%r8
+ sbbq 24(%rbx),%r9
+ movq %r13,%rbp
+ sbbq $0,%r11
+
+ xorq %r10,%r10
+ adcq $-1,%r12
+ movq %r8,%rcx
+ adcq %r14,%r13
+ adcq $0,%r8
+ movq %r9,%r10
+ adcq %r15,%r9
+
+ btq $0,%r11
+ cmovncq %rax,%r12
+ cmovncq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovncq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovncq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3
+.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
+
+.type __ecp_nistz256_subx,@function
+.align 32
+__ecp_nistz256_subx:
+ xorq %r11,%r11
+ sbbq %r12,%rax
+ sbbq %r13,%rbp
+ movq %rax,%r12
+ sbbq %r8,%rcx
+ sbbq %r9,%r10
+ movq %rbp,%r13
+ sbbq $0,%r11
+
+ xorq %r9,%r9
+ adcq $-1,%rax
+ movq %rcx,%r8
+ adcq %r14,%rbp
+ adcq $0,%rcx
+ movq %r10,%r9
+ adcq %r15,%r10
+
+ btq $0,%r11
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ cmovcq %rcx,%r8
+ cmovcq %r10,%r9
+
+ .byte 0xf3,0xc3
+.size __ecp_nistz256_subx,.-__ecp_nistz256_subx
+
+.type __ecp_nistz256_mul_by_2x,@function
+.align 32
+__ecp_nistz256_mul_by_2x:
+ xorq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ xorq %r10,%r10
+ sbbq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3
+.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
+.type ecp_nistz256_point_doublex,@function
+.align 32
+ecp_nistz256_point_doublex:
+.cfi_startproc
+.Lpoint_doublex:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $160+8,%rsp
+.cfi_adjust_cfa_offset 32*5+8
+.Lpoint_doublex_body:
+
+.Lpoint_double_shortcutx:
+ movdqu 0(%rsi),%xmm0
+ movq %rsi,%rbx
+ movdqu 16(%rsi),%xmm1
+ movq 32+0(%rsi),%r12
+ movq 32+8(%rsi),%r13
+ movq 32+16(%rsi),%r8
+ movq 32+24(%rsi),%r9
+ movq .Lpoly+8(%rip),%r14
+ movq .Lpoly+24(%rip),%r15
+ movdqa %xmm0,96(%rsp)
+ movdqa %xmm1,96+16(%rsp)
+ leaq 32(%rdi),%r10
+ leaq 64(%rdi),%r11
+.byte 102,72,15,110,199
+.byte 102,73,15,110,202
+.byte 102,73,15,110,211
+
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x
+
+ movq 64+0(%rsi),%rdx
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ leaq 64-128(%rsi),%rsi
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 32(%rbx),%rdx
+ movq 64+0(%rbx),%r9
+ movq 64+8(%rbx),%r10
+ movq 64+16(%rbx),%r11
+ movq 64+24(%rbx),%r12
+ leaq 64-128(%rbx),%rsi
+ leaq 32(%rbx),%rbx
+.byte 102,72,15,126,215
+ call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_by_2x
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_tox
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+.byte 102,72,15,126,207
+ call __ecp_nistz256_sqr_montx
+ xorq %r9,%r9
+ movq %r12,%rax
+ addq $-1,%r12
+ movq %r13,%r10
+ adcq %rsi,%r13
+ movq %r14,%rcx
+ adcq $0,%r14
+ movq %r15,%r8
+ adcq %rbp,%r15
+ adcq $0,%r9
+ xorq %rsi,%rsi
+ testq $1,%rax
+
+ cmovzq %rax,%r12
+ cmovzq %r10,%r13
+ cmovzq %rcx,%r14
+ cmovzq %r8,%r15
+ cmovzq %rsi,%r9
+
+ movq %r13,%rax
+ shrq $1,%r12
+ shlq $63,%rax
+ movq %r14,%r10
+ shrq $1,%r13
+ orq %rax,%r12
+ shlq $63,%r10
+ movq %r15,%rcx
+ shrq $1,%r14
+ orq %r10,%r13
+ shlq $63,%rcx
+ movq %r12,0(%rdi)
+ shrq $1,%r15
+ movq %r13,8(%rdi)
+ shlq $63,%r9
+ orq %rcx,%r14
+ orq %r9,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ movq 64(%rsp),%rdx
+ leaq 64(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x
+
+ leaq 32(%rsp),%rbx
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_tox
+
+ movq 96(%rsp),%rdx
+ leaq 96(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x
+
+ movq 0+32(%rsp),%rdx
+ movq 8+32(%rsp),%r14
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r15
+ movq 24+32(%rsp),%r8
+.byte 102,72,15,126,199
+ call __ecp_nistz256_sqr_montx
+
+ leaq 128(%rsp),%rbx
+ movq %r14,%r8
+ movq %r15,%r9
+ movq %rsi,%r14
+ movq %rbp,%r15
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+0(%rsp),%rax
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_subx
+
+ movq 32(%rsp),%rdx
+ leaq 32(%rsp),%rbx
+ movq %r12,%r14
+ xorl %ecx,%ecx
+ movq %r12,0+0(%rsp)
+ movq %r13,%r10
+ movq %r13,0+8(%rsp)
+ cmovzq %r8,%r11
+ movq %r8,0+16(%rsp)
+ leaq 0-128(%rsp),%rsi
+ cmovzq %r9,%r12
+ movq %r9,0+24(%rsp)
+ movq %r14,%r9
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+.byte 102,72,15,126,203
+.byte 102,72,15,126,207
+ call __ecp_nistz256_sub_fromx
+
+ leaq 160+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpoint_doublex_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex
+.type ecp_nistz256_point_addx,@function
+.align 32
+ecp_nistz256_point_addx:
+.cfi_startproc
+.Lpoint_addx:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $576+8,%rsp
+.cfi_adjust_cfa_offset 32*18+8
+.Lpoint_addx_body:
+
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq %rsi,%rbx
+ movq %rdx,%rsi
+ movdqa %xmm0,384(%rsp)
+ movdqa %xmm1,384+16(%rsp)
+ movdqa %xmm2,416(%rsp)
+ movdqa %xmm3,416+16(%rsp)
+ movdqa %xmm4,448(%rsp)
+ movdqa %xmm5,448+16(%rsp)
+ por %xmm4,%xmm5
+
+ movdqu 0(%rsi),%xmm0
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rsi),%xmm3
+ movq 64+0(%rsi),%rdx
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,480(%rsp)
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,480+16(%rsp)
+ movdqu 64(%rsi),%xmm0
+ movdqu 80(%rsi),%xmm1
+ movdqa %xmm2,512(%rsp)
+ movdqa %xmm3,512+16(%rsp)
+ por %xmm4,%xmm5
+ pxor %xmm4,%xmm4
+ por %xmm0,%xmm1
+.byte 102,72,15,110,199
+
+ leaq 64-128(%rsi),%rsi
+ movq %rdx,544+0(%rsp)
+ movq %r14,544+8(%rsp)
+ movq %r15,544+16(%rsp)
+ movq %r8,544+24(%rsp)
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ pcmpeqd %xmm4,%xmm5
+ pshufd $0xb1,%xmm1,%xmm4
+ por %xmm1,%xmm4
+ pshufd $0,%xmm5,%xmm5
+ pshufd $0x1e,%xmm4,%xmm3
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ pcmpeqd %xmm3,%xmm4
+ pshufd $0,%xmm4,%xmm4
+ movq 64+0(%rbx),%rdx
+ movq 64+8(%rbx),%r14
+ movq 64+16(%rbx),%r15
+ movq 64+24(%rbx),%r8
+.byte 102,72,15,110,203
+
+ leaq 64-128(%rbx),%rsi
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 544(%rsp),%rdx
+ leaq 544(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 448(%rsp),%rdx
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 416(%rsp),%rdx
+ leaq 416(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq -128+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 512(%rsp),%rdx
+ leaq 512(%rsp),%rbx
+ movq 0+256(%rsp),%r9
+ movq 8+256(%rsp),%r10
+ leaq -128+256(%rsp),%rsi
+ movq 16+256(%rsp),%r11
+ movq 24+256(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 224(%rsp),%rbx
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ orq %r13,%r12
+ movdqa %xmm4,%xmm2
+ orq %r8,%r12
+ orq %r9,%r12
+ por %xmm5,%xmm2
+.byte 102,73,15,110,220
+
+ movq 384(%rsp),%rdx
+ leaq 384(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 480(%rsp),%rdx
+ leaq 480(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 160(%rsp),%rbx
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ orq %r13,%r12
+ orq %r8,%r12
+ orq %r9,%r12
+
+.byte 0x3e
+ jnz .Ladd_proceedx
+.byte 102,73,15,126,208
+.byte 102,73,15,126,217
+ testq %r8,%r8
+ jnz .Ladd_proceedx
+ testq %r9,%r9
+ jz .Ladd_doublex
+
+.byte 102,72,15,126,199
+ pxor %xmm0,%xmm0
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm0,16(%rdi)
+ movdqu %xmm0,32(%rdi)
+ movdqu %xmm0,48(%rdi)
+ movdqu %xmm0,64(%rdi)
+ movdqu %xmm0,80(%rdi)
+ jmp .Ladd_donex
+
+.align 32
+.Ladd_doublex:
+.byte 102,72,15,126,206
+.byte 102,72,15,126,199
+ addq $416,%rsp
+ jmp .Lpoint_double_shortcutx
+
+.align 32
+.Ladd_proceedx:
+ movq 0+64(%rsp),%rdx
+ movq 8+64(%rsp),%r14
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 448(%rsp),%rdx
+ leaq 448(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 544(%rsp),%rdx
+ leaq 544(%rsp),%rbx
+ movq 0+352(%rsp),%r9
+ movq 8+352(%rsp),%r10
+ leaq -128+352(%rsp),%rsi
+ movq 16+352(%rsp),%r11
+ movq 24+352(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 0(%rsp),%rdx
+ leaq 0(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 160(%rsp),%rdx
+ leaq 160(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+
+
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ leaq 96(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ movq 0(%rsi),%rax
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subx
+
+ leaq 128(%rsp),%rbx
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 192+0(%rsp),%rax
+ movq 192+8(%rsp),%rbp
+ movq 192+16(%rsp),%rcx
+ movq 192+24(%rsp),%r10
+ leaq 320(%rsp),%rdi
+
+ call __ecp_nistz256_subx
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 128(%rsp),%rdx
+ leaq 128(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq -128+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 320(%rsp),%rdx
+ leaq 320(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 256(%rsp),%rbx
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+.byte 102,72,15,126,199
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 352(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 352+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 544(%rsp),%xmm2
+ pand 544+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 480(%rsp),%xmm2
+ pand 480+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 320(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 320+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 512(%rsp),%xmm2
+ pand 512+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+.Ladd_donex:
+ leaq 576+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpoint_addx_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx
+.type ecp_nistz256_point_add_affinex,@function
+.align 32
+ecp_nistz256_point_add_affinex:
+.cfi_startproc
+.Lpoint_add_affinex:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $480+8,%rsp
+.cfi_adjust_cfa_offset 32*15+8
+.Ladd_affinex_body:
+
+ movdqu 0(%rsi),%xmm0
+ movq %rdx,%rbx
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq 64+0(%rsi),%rdx
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,320(%rsp)
+ movdqa %xmm1,320+16(%rsp)
+ movdqa %xmm2,352(%rsp)
+ movdqa %xmm3,352+16(%rsp)
+ movdqa %xmm4,384(%rsp)
+ movdqa %xmm5,384+16(%rsp)
+ por %xmm4,%xmm5
+
+ movdqu 0(%rbx),%xmm0
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rbx),%xmm1
+ movdqu 32(%rbx),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rbx),%xmm3
+ movdqa %xmm0,416(%rsp)
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,416+16(%rsp)
+ por %xmm0,%xmm1
+.byte 102,72,15,110,199
+ movdqa %xmm2,448(%rsp)
+ movdqa %xmm3,448+16(%rsp)
+ por %xmm2,%xmm3
+ por %xmm4,%xmm5
+ pxor %xmm4,%xmm4
+ por %xmm1,%xmm3
+
+ leaq 64-128(%rsi),%rsi
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ pcmpeqd %xmm4,%xmm5
+ pshufd $0xb1,%xmm3,%xmm4
+ movq 0(%rbx),%rdx
+
+ movq %r12,%r9
+ por %xmm3,%xmm4
+ pshufd $0,%xmm5,%xmm5
+ pshufd $0x1e,%xmm4,%xmm3
+ movq %r13,%r10
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ movq %r14,%r11
+ pcmpeqd %xmm3,%xmm4
+ pshufd $0,%xmm4,%xmm4
+
+ leaq 32-128(%rsp),%rsi
+ movq %r15,%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 320(%rsp),%rbx
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 384(%rsp),%rdx
+ leaq 384(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 384(%rsp),%rdx
+ leaq 384(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 448(%rsp),%rdx
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 352(%rsp),%rbx
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+64(%rsp),%rdx
+ movq 8+64(%rsp),%r14
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 0+96(%rsp),%rdx
+ movq 8+96(%rsp),%r14
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r15
+ movq 24+96(%rsp),%r8
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 128(%rsp),%rdx
+ leaq 128(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 320(%rsp),%rdx
+ leaq 320(%rsp),%rbx
+ movq 0+128(%rsp),%r9
+ movq 8+128(%rsp),%r10
+ leaq -128+128(%rsp),%rsi
+ movq 16+128(%rsp),%r11
+ movq 24+128(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+
+
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ leaq 192(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ movq 0(%rsi),%rax
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subx
+
+ leaq 160(%rsp),%rbx
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+0(%rsp),%rax
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 64(%rsp),%rdi
+
+ call __ecp_nistz256_subx
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 352(%rsp),%rdx
+ leaq 352(%rsp),%rbx
+ movq 0+160(%rsp),%r9
+ movq 8+160(%rsp),%r10
+ leaq -128+160(%rsp),%rsi
+ movq 16+160(%rsp),%r11
+ movq 24+160(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 96(%rsp),%rdx
+ leaq 96(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 32(%rsp),%rbx
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+.byte 102,72,15,126,199
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand .LONE_mont(%rip),%xmm2
+ pand .LONE_mont+16(%rip),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 224(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 224+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 320(%rsp),%xmm2
+ pand 320+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 256(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 256+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 352(%rsp),%xmm2
+ pand 352+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+ leaq 480+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Ladd_affinex_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex
#endif
diff --git a/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S b/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S
index 7c1eeb72..86fa94c0 100644
--- a/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S
+++ b/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S
@@ -9,6 +9,7 @@
.type CRYPTO_rdrand,@function
.align 16
CRYPTO_rdrand:
+.cfi_startproc
xorq %rax,%rax
@@ -17,6 +18,7 @@ CRYPTO_rdrand:
adcq %rax,%rax
movq %rcx,0(%rdi)
.byte 0xf3,0xc3
+.cfi_endproc
@@ -27,6 +29,7 @@ CRYPTO_rdrand:
.type CRYPTO_rdrand_multiple8_buf,@function
.align 16
CRYPTO_rdrand_multiple8_buf:
+.cfi_startproc
testq %rsi,%rsi
jz .Lout
movq $8,%rdx
@@ -45,4 +48,5 @@ CRYPTO_rdrand_multiple8_buf:
.Lerr:
xorq %rax,%rax
.byte 0xf3,0xc3
+.cfi_endproc
#endif
diff --git a/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S b/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S
index 89b81ed8..e6db7f6e 100644
--- a/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S
+++ b/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S
@@ -1724,6 +1724,11 @@ rsaz_1024_gather5_avx2:
rsaz_avx2_eligible:
leaq OPENSSL_ia32cap_P(%rip),%rax
movl 8(%rax),%eax
+ movl $524544,%ecx
+ movl $0,%edx
+ andl %eax,%ecx
+ cmpl $524544,%ecx
+ cmovel %edx,%eax
andl $32,%eax
shrl $5,%eax
.byte 0xf3,0xc3
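
This is the same BMI2+ADX test (524544 is 0x80100) applied to the third
32-bit word of OPENSSL_ia32cap_P: when both extensions are available,
rsaz_avx2_eligible now reports 0, presumably so the MULX/ADX Montgomery
code (note the greatly expanded x86_64-mont5.S in the diffstat) is chosen
over the AVX2 path. A hedged C equivalent of the updated return value:

    #include <stdint.h>

    extern uint32_t OPENSSL_ia32cap_P[4];

    /* Sketch: AVX2 (word 2, bit 5) only counts when BMI2+ADX are not
     * both present; mirrors the cmovel/andl/shrl sequence above. */
    static int rsaz_avx2_eligible_sketch(void) {
      uint32_t cap = OPENSSL_ia32cap_P[2];
      if ((cap & 0x80100u) == 0x80100u) {
        return 0;               /* prefer the MULX/ADX path */
      }
      return (cap >> 5) & 1;    /* AVX2 */
    }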
diff --git a/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S b/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S
index 7f924dcc..186ac61a 100644
--- a/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S
+++ b/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S
@@ -8,6 +8,7 @@
.type sha1_block_data_order,@function
.align 16
sha1_block_data_order:
+.cfi_startproc
leaq OPENSSL_ia32cap_P(%rip),%r10
movl 0(%r10),%r9d
movl 4(%r10),%r8d
@@ -24,17 +25,24 @@ sha1_block_data_order:
.align 16
.Lialu:
movq %rsp,%rax
+.cfi_def_cfa_register %rax
pushq %rbx
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_offset %r12,-32
pushq %r13
+.cfi_offset %r13,-40
pushq %r14
+.cfi_offset %r14,-48
movq %rdi,%r8
subq $72,%rsp
movq %rsi,%r9
andq $-64,%rsp
movq %rdx,%r10
movq %rax,64(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xc0,0x00,0x06,0x23,0x08
.Lprologue:
movl 0(%r8),%esi
@@ -1229,25 +1237,40 @@ sha1_block_data_order:
jnz .Lloop
movq 64(%rsp),%rsi
+.cfi_def_cfa %rsi,8
movq -40(%rsi),%r14
+.cfi_restore %r14
movq -32(%rsi),%r13
+.cfi_restore %r13
movq -24(%rsi),%r12
+.cfi_restore %r12
movq -16(%rsi),%rbp
+.cfi_restore %rbp
movq -8(%rsi),%rbx
+.cfi_restore %rbx
leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue:
.byte 0xf3,0xc3
+.cfi_endproc
.size sha1_block_data_order,.-sha1_block_data_order
.type sha1_block_data_order_ssse3,@function
.align 16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
+.cfi_startproc
movq %rsp,%r11
+.cfi_def_cfa_register %r11
pushq %rbx
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_offset %r12,-32
pushq %r13
+.cfi_offset %r13,-40
pushq %r14
+.cfi_offset %r14,-48
leaq -64(%rsp),%rsp
andq $-64,%rsp
movq %rdi,%r8
@@ -2404,24 +2427,38 @@ _ssse3_shortcut:
movl %edx,12(%r8)
movl %ebp,16(%r8)
movq -40(%r11),%r14
+.cfi_restore %r14
movq -32(%r11),%r13
+.cfi_restore %r13
movq -24(%r11),%r12
+.cfi_restore %r12
movq -16(%r11),%rbp
+.cfi_restore %rbp
movq -8(%r11),%rbx
+.cfi_restore %rbx
leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_ssse3:
.byte 0xf3,0xc3
+.cfi_endproc
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
.type sha1_block_data_order_avx,@function
.align 16
sha1_block_data_order_avx:
_avx_shortcut:
+.cfi_startproc
movq %rsp,%r11
+.cfi_def_cfa_register %r11
pushq %rbx
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_offset %r12,-32
pushq %r13
+.cfi_offset %r13,-40
pushq %r14
+.cfi_offset %r14,-48
leaq -64(%rsp),%rsp
vzeroupper
andq $-64,%rsp
@@ -3518,13 +3555,20 @@ _avx_shortcut:
movl %edx,12(%r8)
movl %ebp,16(%r8)
movq -40(%r11),%r14
+.cfi_restore %r14
movq -32(%r11),%r13
+.cfi_restore %r13
movq -24(%r11),%r12
+.cfi_restore %r12
movq -16(%r11),%rbp
+.cfi_restore %rbp
movq -8(%r11),%rbx
+.cfi_restore %rbx
leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_avx:
.byte 0xf3,0xc3
+.cfi_endproc
.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
.align 64
K_XX_XX:
diff --git a/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S b/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S
index 62534be4..175f218f 100644
--- a/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S
+++ b/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S
@@ -8,6 +8,7 @@
.type sha256_block_data_order,@function
.align 16
sha256_block_data_order:
+.cfi_startproc
leaq OPENSSL_ia32cap_P(%rip),%r11
movl 0(%r11),%r9d
movl 4(%r11),%r10d
@@ -20,12 +21,19 @@ sha256_block_data_order:
testl $512,%r10d
jnz .Lssse3_shortcut
movq %rsp,%rax
+.cfi_def_cfa_register %rax
pushq %rbx
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_offset %r12,-32
pushq %r13
+.cfi_offset %r13,-40
pushq %r14
+.cfi_offset %r14,-48
pushq %r15
+.cfi_offset %r15,-56
shlq $4,%rdx
subq $64+32,%rsp
leaq (%rsi,%rdx,4),%rdx
@@ -33,7 +41,8 @@ sha256_block_data_order:
movq %rdi,64+0(%rsp)
movq %rsi,64+8(%rsp)
movq %rdx,64+16(%rsp)
- movq %rax,64+24(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
.Lprologue:
movl 0(%rdi),%eax
@@ -1697,16 +1706,25 @@ sha256_block_data_order:
movl %r11d,28(%rdi)
jb .Lloop
- movq 64+24(%rsp),%rsi
+ movq 88(%rsp),%rsi
+.cfi_def_cfa %rsi,8
movq -48(%rsi),%r15
+.cfi_restore %r15
movq -40(%rsi),%r14
+.cfi_restore %r14
movq -32(%rsi),%r13
+.cfi_restore %r13
movq -24(%rsi),%r12
+.cfi_restore %r12
movq -16(%rsi),%rbp
+.cfi_restore %rbp
movq -8(%rsi),%rbx
+.cfi_restore %rbx
leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue:
.byte 0xf3,0xc3
+.cfi_endproc
.size sha256_block_data_order,.-sha256_block_data_order
.align 64
.type K256,@object
@@ -1754,14 +1772,22 @@ K256:
.type sha256_block_data_order_ssse3,@function
.align 64
sha256_block_data_order_ssse3:
+.cfi_startproc
.Lssse3_shortcut:
movq %rsp,%rax
+.cfi_def_cfa_register %rax
pushq %rbx
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_offset %r12,-32
pushq %r13
+.cfi_offset %r13,-40
pushq %r14
+.cfi_offset %r14,-48
pushq %r15
+.cfi_offset %r15,-56
shlq $4,%rdx
subq $96,%rsp
leaq (%rsi,%rdx,4),%rdx
@@ -1769,7 +1795,8 @@ sha256_block_data_order_ssse3:
movq %rdi,64+0(%rsp)
movq %rsi,64+8(%rsp)
movq %rdx,64+16(%rsp)
- movq %rax,64+24(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
.Lprologue_ssse3:
movl 0(%rdi),%eax
@@ -2835,28 +2862,45 @@ sha256_block_data_order_ssse3:
movl %r11d,28(%rdi)
jb .Lloop_ssse3
- movq 64+24(%rsp),%rsi
+ movq 88(%rsp),%rsi
+.cfi_def_cfa %rsi,8
movq -48(%rsi),%r15
+.cfi_restore %r15
movq -40(%rsi),%r14
+.cfi_restore %r14
movq -32(%rsi),%r13
+.cfi_restore %r13
movq -24(%rsi),%r12
+.cfi_restore %r12
movq -16(%rsi),%rbp
+.cfi_restore %rbp
movq -8(%rsi),%rbx
+.cfi_restore %rbx
leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_ssse3:
.byte 0xf3,0xc3
+.cfi_endproc
.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3
.type sha256_block_data_order_avx,@function
.align 64
sha256_block_data_order_avx:
+.cfi_startproc
.Lavx_shortcut:
movq %rsp,%rax
+.cfi_def_cfa_register %rax
pushq %rbx
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_offset %r12,-32
pushq %r13
+.cfi_offset %r13,-40
pushq %r14
+.cfi_offset %r14,-48
pushq %r15
+.cfi_offset %r15,-56
shlq $4,%rdx
subq $96,%rsp
leaq (%rsi,%rdx,4),%rdx
@@ -2864,7 +2908,8 @@ sha256_block_data_order_avx:
movq %rdi,64+0(%rsp)
movq %rsi,64+8(%rsp)
movq %rdx,64+16(%rsp)
- movq %rax,64+24(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
.Lprologue_avx:
vzeroupper
@@ -3891,16 +3936,25 @@ sha256_block_data_order_avx:
movl %r11d,28(%rdi)
jb .Lloop_avx
- movq 64+24(%rsp),%rsi
+ movq 88(%rsp),%rsi
+.cfi_def_cfa %rsi,8
vzeroupper
movq -48(%rsi),%r15
+.cfi_restore %r15
movq -40(%rsi),%r14
+.cfi_restore %r14
movq -32(%rsi),%r13
+.cfi_restore %r13
movq -24(%rsi),%r12
+.cfi_restore %r12
movq -16(%rsi),%rbp
+.cfi_restore %rbp
movq -8(%rsi),%rbx
+.cfi_restore %rbx
leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_avx:
.byte 0xf3,0xc3
+.cfi_endproc
.size sha256_block_data_order_avx,.-sha256_block_data_order_avx
#endif
diff --git a/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S b/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S
index 1f1793bb..be198bca 100644
--- a/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S
+++ b/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S
@@ -8,6 +8,7 @@
.type sha512_block_data_order,@function
.align 16
sha512_block_data_order:
+.cfi_startproc
leaq OPENSSL_ia32cap_P(%rip),%r11
movl 0(%r11),%r9d
movl 4(%r11),%r10d
@@ -20,12 +21,19 @@ sha512_block_data_order:
cmpl $1342177792,%r10d
je .Lavx_shortcut
movq %rsp,%rax
+.cfi_def_cfa_register %rax
pushq %rbx
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_offset %r12,-32
pushq %r13
+.cfi_offset %r13,-40
pushq %r14
+.cfi_offset %r14,-48
pushq %r15
+.cfi_offset %r15,-56
shlq $4,%rdx
subq $128+32,%rsp
leaq (%rsi,%rdx,8),%rdx
@@ -33,7 +41,8 @@ sha512_block_data_order:
movq %rdi,128+0(%rsp)
movq %rsi,128+8(%rsp)
movq %rdx,128+16(%rsp)
- movq %rax,128+24(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
.Lprologue:
movq 0(%rdi),%rax
@@ -1697,16 +1706,25 @@ sha512_block_data_order:
movq %r11,56(%rdi)
jb .Lloop
- movq 128+24(%rsp),%rsi
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
movq -48(%rsi),%r15
+.cfi_restore %r15
movq -40(%rsi),%r14
+.cfi_restore %r14
movq -32(%rsi),%r13
+.cfi_restore %r13
movq -24(%rsi),%r12
+.cfi_restore %r12
movq -16(%rsi),%rbp
+.cfi_restore %rbp
movq -8(%rsi),%rbx
+.cfi_restore %rbx
leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue:
.byte 0xf3,0xc3
+.cfi_endproc
.size sha512_block_data_order,.-sha512_block_data_order
.align 64
.type K512,@object
@@ -1798,14 +1816,22 @@ K512:
.type sha512_block_data_order_xop,@function
.align 64
sha512_block_data_order_xop:
+.cfi_startproc
.Lxop_shortcut:
movq %rsp,%rax
+.cfi_def_cfa_register %rax
pushq %rbx
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_offset %r12,-32
pushq %r13
+.cfi_offset %r13,-40
pushq %r14
+.cfi_offset %r14,-48
pushq %r15
+.cfi_offset %r15,-56
shlq $4,%rdx
subq $160,%rsp
leaq (%rsi,%rdx,8),%rdx
@@ -1813,7 +1839,8 @@ sha512_block_data_order_xop:
movq %rdi,128+0(%rsp)
movq %rsi,128+8(%rsp)
movq %rdx,128+16(%rsp)
- movq %rax,128+24(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
.Lprologue_xop:
vzeroupper
@@ -2866,29 +2893,46 @@ sha512_block_data_order_xop:
movq %r11,56(%rdi)
jb .Lloop_xop
- movq 128+24(%rsp),%rsi
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
vzeroupper
movq -48(%rsi),%r15
+.cfi_restore %r15
movq -40(%rsi),%r14
+.cfi_restore %r14
movq -32(%rsi),%r13
+.cfi_restore %r13
movq -24(%rsi),%r12
+.cfi_restore %r12
movq -16(%rsi),%rbp
+.cfi_restore %rbp
movq -8(%rsi),%rbx
+.cfi_restore %rbx
leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_xop:
.byte 0xf3,0xc3
+.cfi_endproc
.size sha512_block_data_order_xop,.-sha512_block_data_order_xop
.type sha512_block_data_order_avx,@function
.align 64
sha512_block_data_order_avx:
+.cfi_startproc
.Lavx_shortcut:
movq %rsp,%rax
+.cfi_def_cfa_register %rax
pushq %rbx
+.cfi_offset %rbx,-16
pushq %rbp
+.cfi_offset %rbp,-24
pushq %r12
+.cfi_offset %r12,-32
pushq %r13
+.cfi_offset %r13,-40
pushq %r14
+.cfi_offset %r14,-48
pushq %r15
+.cfi_offset %r15,-56
shlq $4,%rdx
subq $160,%rsp
leaq (%rsi,%rdx,8),%rdx
@@ -2896,7 +2940,8 @@ sha512_block_data_order_avx:
movq %rdi,128+0(%rsp)
movq %rsi,128+8(%rsp)
movq %rdx,128+16(%rsp)
- movq %rax,128+24(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
.Lprologue_avx:
vzeroupper
@@ -4013,16 +4058,25 @@ sha512_block_data_order_avx:
movq %r11,56(%rdi)
jb .Lloop_avx
- movq 128+24(%rsp),%rsi
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
vzeroupper
movq -48(%rsi),%r15
+.cfi_restore %r15
movq -40(%rsi),%r14
+.cfi_restore %r14
movq -32(%rsi),%r13
+.cfi_restore %r13
movq -24(%rsi),%r12
+.cfi_restore %r12
movq -16(%rsi),%rbp
+.cfi_restore %rbp
movq -8(%rsi),%rbx
+.cfi_restore %rbx
leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_avx:
.byte 0xf3,0xc3
+.cfi_endproc
.size sha512_block_data_order_avx,.-sha512_block_data_order_avx
#endif
diff --git a/linux-x86_64/crypto/fipsmodule/x86_64-mont.S b/linux-x86_64/crypto/fipsmodule/x86_64-mont.S
index 1f673ef8..3d867cd1 100644
--- a/linux-x86_64/crypto/fipsmodule/x86_64-mont.S
+++ b/linux-x86_64/crypto/fipsmodule/x86_64-mont.S
@@ -17,6 +17,8 @@ bn_mul_mont:
jnz .Lmul_enter
cmpl $8,%r9d
jb .Lmul_enter
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl 8(%r11),%r11d
cmpq %rsi,%rdx
jne .Lmul4x_enter
testl $7,%r9d
@@ -208,30 +210,30 @@ bn_mul_mont:
xorq %r14,%r14
movq (%rsp),%rax
- leaq (%rsp),%rsi
movq %r9,%r15
- jmp .Lsub
+
.align 16
.Lsub: sbbq (%rcx,%r14,8),%rax
movq %rax,(%rdi,%r14,8)
- movq 8(%rsi,%r14,8),%rax
+ movq 8(%rsp,%r14,8),%rax
leaq 1(%r14),%r14
decq %r15
jnz .Lsub
sbbq $0,%rax
+ movq $-1,%rbx
+ xorq %rax,%rbx
xorq %r14,%r14
- andq %rax,%rsi
- notq %rax
- movq %rdi,%rcx
- andq %rax,%rcx
movq %r9,%r15
- orq %rcx,%rsi
-.align 16
+
.Lcopy:
- movq (%rsi,%r14,8),%rax
- movq %r14,(%rsp,%r14,8)
- movq %rax,(%rdi,%r14,8)
+ movq (%rdi,%r14,8),%rcx
+ movq (%rsp,%r14,8),%rdx
+ andq %rbx,%rcx
+ andq %rax,%rdx
+ movq %r9,(%rsp,%r14,8)
+ orq %rcx,%rdx
+ movq %rdx,(%rdi,%r14,8)
leaq 1(%r14),%r14
subq $1,%r15
jnz .Lcopy
@@ -265,6 +267,9 @@ bn_mul4x_mont:
movq %rsp,%rax
.cfi_def_cfa_register %rax
.Lmul4x_enter:
+ andl $0x80100,%r11d
+ cmpl $0x80100,%r11d
+ je .Lmulx4x_enter
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
@@ -602,7 +607,6 @@ bn_mul4x_mont:
movq 16(%rsp,%r9,8),%rdi
leaq -4(%r9),%r15
movq 0(%rsp),%rax
- pxor %xmm0,%xmm0
movq 8(%rsp),%rdx
shrq $2,%r15
leaq (%rsp),%rsi
@@ -612,8 +616,7 @@ bn_mul4x_mont:
movq 16(%rsi),%rbx
movq 24(%rsi),%rbp
sbbq 8(%rcx),%rdx
- jmp .Lsub4x
-.align 16
+
.Lsub4x:
movq %rax,0(%rdi,%r14,8)
movq %rdx,8(%rdi,%r14,8)
@@ -640,34 +643,35 @@ bn_mul4x_mont:
sbbq $0,%rax
movq %rbp,24(%rdi,%r14,8)
- xorq %r14,%r14
- andq %rax,%rsi
- notq %rax
- movq %rdi,%rcx
- andq %rax,%rcx
- leaq -4(%r9),%r15
- orq %rcx,%rsi
+ pxor %xmm0,%xmm0
+.byte 102,72,15,110,224
+ pcmpeqd %xmm5,%xmm5
+ pshufd $0,%xmm4,%xmm4
+ movq %r9,%r15
+ pxor %xmm4,%xmm5
shrq $2,%r15
+ xorl %eax,%eax
- movdqu (%rsi),%xmm1
- movdqa %xmm0,(%rsp)
- movdqu %xmm1,(%rdi)
jmp .Lcopy4x
.align 16
.Lcopy4x:
- movdqu 16(%rsi,%r14,1),%xmm2
- movdqu 32(%rsi,%r14,1),%xmm1
- movdqa %xmm0,16(%rsp,%r14,1)
- movdqu %xmm2,16(%rdi,%r14,1)
- movdqa %xmm0,32(%rsp,%r14,1)
- movdqu %xmm1,32(%rdi,%r14,1)
- leaq 32(%r14),%r14
+ movdqa (%rsp,%rax,1),%xmm1
+ movdqu (%rdi,%rax,1),%xmm2
+ pand %xmm4,%xmm1
+ pand %xmm5,%xmm2
+ movdqa 16(%rsp,%rax,1),%xmm3
+ movdqa %xmm0,(%rsp,%rax,1)
+ por %xmm2,%xmm1
+ movdqu 16(%rdi,%rax,1),%xmm2
+ movdqu %xmm1,(%rdi,%rax,1)
+ pand %xmm4,%xmm3
+ pand %xmm5,%xmm2
+ movdqa %xmm0,16(%rsp,%rax,1)
+ por %xmm2,%xmm3
+ movdqu %xmm3,16(%rdi,%rax,1)
+ leaq 32(%rax),%rax
decq %r15
jnz .Lcopy4x
-
- movdqu 16(%rsi,%r14,1),%xmm2
- movdqa %xmm0,16(%rsp,%r14,1)
- movdqu %xmm2,16(%rdi,%r14,1)
movq 8(%rsp,%r9,8),%rsi
.cfi_def_cfa %rsi, 8
movq $1,%rax
@@ -689,6 +693,8 @@ bn_mul4x_mont:
.byte 0xf3,0xc3
.cfi_endproc
.size bn_mul4x_mont,.-bn_mul4x_mont
+.extern bn_sqrx8x_internal
+.hidden bn_sqrx8x_internal
.extern bn_sqr8x_internal
.hidden bn_sqr8x_internal
@@ -773,6 +779,26 @@ bn_sqr8x_mont:
pxor %xmm0,%xmm0
.byte 102,72,15,110,207
.byte 102,73,15,110,218
+ leaq OPENSSL_ia32cap_P(%rip),%rax
+ movl 8(%rax),%eax
+ andl $0x80100,%eax
+ cmpl $0x80100,%eax
+ jne .Lsqr8x_nox
+
+ call bn_sqrx8x_internal
+
+
+
+
+ leaq (%r8,%rcx,1),%rbx
+ movq %rcx,%r9
+ movq %rcx,%rdx
+.byte 102,72,15,126,207
+ sarq $3+2,%rcx
+ jmp .Lsqr8x_sub
+
+.align 32
+.Lsqr8x_nox:
call bn_sqr8x_internal
@@ -860,6 +886,362 @@ bn_sqr8x_mont:
.byte 0xf3,0xc3
.cfi_endproc
.size bn_sqr8x_mont,.-bn_sqr8x_mont
+.type bn_mulx4x_mont,@function
+.align 32
+bn_mulx4x_mont:
+.cfi_startproc
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lmulx4x_enter:
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lmulx4x_prologue:
+
+ shll $3,%r9d
+ xorq %r10,%r10
+ subq %r9,%r10
+ movq (%r8),%r8
+ leaq -72(%rsp,%r10,1),%rbp
+ andq $-128,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+ jmp .Lmulx4x_page_walk_done
+
+.align 16
+.Lmulx4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+
+ leaq (%rdx,%r9,1),%r10
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %r9,0(%rsp)
+ shrq $5,%r9
+ movq %r10,16(%rsp)
+ subq $1,%r9
+ movq %r8,24(%rsp)
+ movq %rdi,32(%rsp)
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+ movq %r9,48(%rsp)
+ jmp .Lmulx4x_body
+
+.align 32
+.Lmulx4x_body:
+ leaq 8(%rdx),%rdi
+ movq (%rdx),%rdx
+ leaq 64+32(%rsp),%rbx
+ movq %rdx,%r9
+
+ mulxq 0(%rsi),%r8,%rax
+ mulxq 8(%rsi),%r11,%r14
+ addq %rax,%r11
+ movq %rdi,8(%rsp)
+ mulxq 16(%rsi),%r12,%r13
+ adcq %r14,%r12
+ adcq $0,%r13
+
+ movq %r8,%rdi
+ imulq 24(%rsp),%r8
+ xorq %rbp,%rbp
+
+ mulxq 24(%rsi),%rax,%r14
+ movq %r8,%rdx
+ leaq 32(%rsi),%rsi
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%rdi
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
+ movq 48(%rsp),%rdi
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-24(%rbx)
+ adcxq %rax,%r12
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r12,-16(%rbx)
+
+ jmp .Lmulx4x_1st
+
+.align 32
+.Lmulx4x_1st:
+ adcxq %rbp,%r15
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+.byte 0x67,0x67
+ movq %r8,%rdx
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ movq %r11,-32(%rbx)
+ adoxq %r15,%r13
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz .Lmulx4x_1st
+
+ movq 0(%rsp),%rax
+ movq 8(%rsp),%rdi
+ adcq %rbp,%r15
+ addq %r15,%r14
+ sbbq %r15,%r15
+ movq %r14,-8(%rbx)
+ jmp .Lmulx4x_outer
+
+.align 32
+.Lmulx4x_outer:
+ movq (%rdi),%rdx
+ leaq 8(%rdi),%rdi
+ subq %rax,%rsi
+ movq %r15,(%rbx)
+ leaq 64+32(%rsp),%rbx
+ subq %rax,%rcx
+
+ mulxq 0(%rsi),%r8,%r11
+ xorl %ebp,%ebp
+ movq %rdx,%r9
+ mulxq 8(%rsi),%r14,%r12
+ adoxq -32(%rbx),%r8
+ adcxq %r14,%r11
+ mulxq 16(%rsi),%r15,%r13
+ adoxq -24(%rbx),%r11
+ adcxq %r15,%r12
+ adoxq -16(%rbx),%r12
+ adcxq %rbp,%r13
+ adoxq %rbp,%r13
+
+ movq %rdi,8(%rsp)
+ movq %r8,%r15
+ imulq 24(%rsp),%r8
+ xorl %ebp,%ebp
+
+ mulxq 24(%rsi),%rax,%r14
+ movq %r8,%rdx
+ adcxq %rax,%r13
+ adoxq -8(%rbx),%r13
+ adcxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ adoxq %rbp,%r14
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%r15
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+ mulxq 16(%rcx),%rax,%r12
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-24(%rbx)
+ leaq 32(%rcx),%rcx
+ adcxq %rax,%r12
+ adoxq %rbp,%r15
+ movq 48(%rsp),%rdi
+ movq %r12,-16(%rbx)
+
+ jmp .Lmulx4x_inner
+
+.align 32
+.Lmulx4x_inner:
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %rbp,%r15
+ adoxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq 0(%rbx),%r10
+ adoxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq 8(%rbx),%r11
+ adoxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+ movq %r8,%rdx
+ adcxq 16(%rbx),%r12
+ adoxq %rax,%r13
+ adcxq 24(%rbx),%r13
+ adoxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+ adcxq %rbp,%r14
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ adoxq %r15,%r13
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-32(%rbx)
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz .Lmulx4x_inner
+
+ movq 0(%rsp),%rax
+ movq 8(%rsp),%rdi
+ adcq %rbp,%r15
+ subq 0(%rbx),%rbp
+ adcq %r15,%r14
+ sbbq %r15,%r15
+ movq %r14,-8(%rbx)
+
+ cmpq 16(%rsp),%rdi
+ jne .Lmulx4x_outer
+
+ leaq 64(%rsp),%rbx
+ subq %rax,%rcx
+ negq %r15
+ movq %rax,%rdx
+ shrq $3+2,%rax
+ movq 32(%rsp),%rdi
+ jmp .Lmulx4x_sub
+
+.align 32
+.Lmulx4x_sub:
+ movq 0(%rbx),%r11
+ movq 8(%rbx),%r12
+ movq 16(%rbx),%r13
+ movq 24(%rbx),%r14
+ leaq 32(%rbx),%rbx
+ sbbq 0(%rcx),%r11
+ sbbq 8(%rcx),%r12
+ sbbq 16(%rcx),%r13
+ sbbq 24(%rcx),%r14
+ leaq 32(%rcx),%rcx
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+ movq %r13,16(%rdi)
+ movq %r14,24(%rdi)
+ leaq 32(%rdi),%rdi
+ decq %rax
+ jnz .Lmulx4x_sub
+
+ sbbq $0,%r15
+ leaq 64(%rsp),%rbx
+ subq %rdx,%rdi
+
+.byte 102,73,15,110,207
+ pxor %xmm0,%xmm0
+ pshufd $0,%xmm1,%xmm1
+ movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ jmp .Lmulx4x_cond_copy
+
+.align 32
+.Lmulx4x_cond_copy:
+ movdqa 0(%rbx),%xmm2
+ movdqa 16(%rbx),%xmm3
+ leaq 32(%rbx),%rbx
+ movdqu 0(%rdi),%xmm4
+ movdqu 16(%rdi),%xmm5
+ leaq 32(%rdi),%rdi
+ movdqa %xmm0,-32(%rbx)
+ movdqa %xmm0,-16(%rbx)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-32(%rdi)
+ movdqu %xmm5,-16(%rdi)
+ subq $32,%rdx
+ jnz .Lmulx4x_cond_copy
+
+ movq %rdx,(%rbx)
+
+ movq $1,%rax
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lmulx4x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_mulx4x_mont,.-bn_mulx4x_mont
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 16
#endif
diff --git a/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S b/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S
index 1ec58ca0..bc4e2747 100644
--- a/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S
+++ b/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S
@@ -15,6 +15,8 @@ bn_mul_mont_gather5:
.cfi_def_cfa_register %rax
testl $7,%r9d
jnz .Lmul_enter
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl 8(%r11),%r11d
jmp .Lmul4x_enter
.align 16
@@ -404,18 +406,19 @@ bn_mul_mont_gather5:
jnz .Lsub
sbbq $0,%rax
+ movq $-1,%rbx
+ xorq %rax,%rbx
xorq %r14,%r14
- andq %rax,%rsi
- notq %rax
- movq %rdi,%rcx
- andq %rax,%rcx
movq %r9,%r15
- orq %rcx,%rsi
-.align 16
+
.Lcopy:
- movq (%rsi,%r14,8),%rax
+ movq (%rdi,%r14,8),%rcx
+ movq (%rsp,%r14,8),%rdx
+ andq %rbx,%rcx
+ andq %rax,%rdx
movq %r14,(%rsp,%r14,8)
- movq %rax,(%rdi,%r14,8)
+ orq %rcx,%rdx
+ movq %rdx,(%rdi,%r14,8)
leaq 1(%r14),%r14
subq $1,%r15
jnz .Lcopy
@@ -450,6 +453,9 @@ bn_mul4x_mont_gather5:
movq %rsp,%rax
.cfi_def_cfa_register %rax
.Lmul4x_enter:
+ andl $0x80108,%r11d
+ cmpl $0x80108,%r11d
+ je .Lmulx4x_enter
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
@@ -1078,6 +1084,11 @@ bn_power5:
.cfi_startproc
movq %rsp,%rax
.cfi_def_cfa_register %rax
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl 8(%r11),%r11d
+ andl $0x80108,%r11d
+ cmpl $0x80108,%r11d
+ je .Lpowerx5_enter
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
@@ -2163,6 +2174,22 @@ bn_from_mont8x:
.byte 0x67
movq %rcx,%rbp
.byte 102,73,15,110,218
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl 8(%r11),%r11d
+ andl $0x80108,%r11d
+ cmpl $0x80108,%r11d
+ jne .Lfrom_mont_nox
+
+ leaq (%rax,%r9,1),%rdi
+ call __bn_sqrx8x_reduction
+ call __bn_postx4x_internal
+
+ pxor %xmm0,%xmm0
+ leaq 48(%rsp),%rax
+ jmp .Lfrom_mont_zero
+
+.align 32
+.Lfrom_mont_nox:
call __bn_sqr8x_reduction
call __bn_post4x_internal
@@ -2201,6 +2228,1343 @@ bn_from_mont8x:
.byte 0xf3,0xc3
.cfi_endproc
.size bn_from_mont8x,.-bn_from_mont8x
+.type bn_mulx4x_mont_gather5,@function
+.align 32
+bn_mulx4x_mont_gather5:
+.cfi_startproc
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lmulx4x_enter:
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lmulx4x_prologue:
+
+ shll $3,%r9d
+ leaq (%r9,%r9,2),%r10
+ negq %r9
+ movq (%r8),%r8
+
+
+
+
+
+
+
+
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ subq %rdi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb .Lmulx4xsp_alt
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
+ jmp .Lmulx4xsp_done
+
+.Lmulx4xsp_alt:
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+.Lmulx4xsp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+ jmp .Lmulx4x_page_walk_done
+
+.Lmulx4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lmulx4x_body:
+ call mulx4x_internal
+
+ movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq $1,%rax
+
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lmulx4x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
+
+.type mulx4x_internal,@function
+.align 32
+mulx4x_internal:
+ movq %r9,8(%rsp)
+ movq %r9,%r10
+ negq %r9
+ shlq $5,%r9
+ negq %r10
+ leaq 128(%rdx,%r9,1),%r13
+ shrq $5+5,%r9
+ movd 8(%rax),%xmm5
+ subq $1,%r9
+ leaq .Linc(%rip),%rax
+ movq %r13,16+8(%rsp)
+ movq %r9,24+8(%rsp)
+ movq %rdi,56+8(%rsp)
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 88-112(%rsp,%r10,1),%r10
+ leaq 128(%rdx),%rdi
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+.byte 0x67
+ movdqa %xmm1,%xmm2
+.byte 0x67
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+.byte 0x67
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+
+ pand 64(%rdi),%xmm0
+ pand 80(%rdi),%xmm1
+ pand 96(%rdi),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%rdi),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%rdi),%xmm4
+ movdqa -112(%rdi),%xmm5
+ movdqa -96(%rdi),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%rdi),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%rdi),%xmm4
+ movdqa -48(%rdi),%xmm5
+ movdqa -32(%rdi),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%rdi),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%rdi),%xmm4
+ movdqa 16(%rdi),%xmm5
+ movdqa 32(%rdi),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%rdi),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ pxor %xmm1,%xmm0
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ leaq 256(%rdi),%rdi
+.byte 102,72,15,126,194
+ leaq 64+32+8(%rsp),%rbx
+
+ movq %rdx,%r9
+ mulxq 0(%rsi),%r8,%rax
+ mulxq 8(%rsi),%r11,%r12
+ addq %rax,%r11
+ mulxq 16(%rsi),%rax,%r13
+ adcq %rax,%r12
+ adcq $0,%r13
+ mulxq 24(%rsi),%rax,%r14
+
+ movq %r8,%r15
+ imulq 32+8(%rsp),%r8
+ xorq %rbp,%rbp
+ movq %r8,%rdx
+
+ movq %rdi,8+8(%rsp)
+
+ leaq 32(%rsi),%rsi
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%r15
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+ mulxq 16(%rcx),%rax,%r12
+ movq 24+8(%rsp),%rdi
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-24(%rbx)
+ adcxq %rax,%r12
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r12,-16(%rbx)
+ jmp .Lmulx4x_1st
+
+.align 32
+.Lmulx4x_1st:
+ adcxq %rbp,%r15
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+.byte 0x67,0x67
+ movq %r8,%rdx
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ movq %r11,-32(%rbx)
+ adoxq %r15,%r13
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz .Lmulx4x_1st
+
+ movq 8(%rsp),%rax
+ adcq %rbp,%r15
+ leaq (%rsi,%rax,1),%rsi
+ addq %r15,%r14
+ movq 8+8(%rsp),%rdi
+ adcq %rbp,%rbp
+ movq %r14,-8(%rbx)
+ jmp .Lmulx4x_outer
+
+.align 32
+.Lmulx4x_outer:
+ leaq 16-256(%rbx),%r10
+ pxor %xmm4,%xmm4
+.byte 0x67,0x67
+ pxor %xmm5,%xmm5
+ movdqa -128(%rdi),%xmm0
+ movdqa -112(%rdi),%xmm1
+ movdqa -96(%rdi),%xmm2
+ pand 256(%r10),%xmm0
+ movdqa -80(%rdi),%xmm3
+ pand 272(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 288(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 304(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%rdi),%xmm0
+ movdqa -48(%rdi),%xmm1
+ movdqa -32(%rdi),%xmm2
+ pand 320(%r10),%xmm0
+ movdqa -16(%rdi),%xmm3
+ pand 336(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 352(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 368(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%rdi),%xmm0
+ movdqa 16(%rdi),%xmm1
+ movdqa 32(%rdi),%xmm2
+ pand 384(%r10),%xmm0
+ movdqa 48(%rdi),%xmm3
+ pand 400(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 416(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 432(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%rdi),%xmm0
+ movdqa 80(%rdi),%xmm1
+ movdqa 96(%rdi),%xmm2
+ pand 448(%r10),%xmm0
+ movdqa 112(%rdi),%xmm3
+ pand 464(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 480(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 496(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%rdi),%rdi
+.byte 102,72,15,126,194
+
+ movq %rbp,(%rbx)
+ leaq 32(%rbx,%rax,1),%rbx
+ mulxq 0(%rsi),%r8,%r11
+ xorq %rbp,%rbp
+ movq %rdx,%r9
+ mulxq 8(%rsi),%r14,%r12
+ adoxq -32(%rbx),%r8
+ adcxq %r14,%r11
+ mulxq 16(%rsi),%r15,%r13
+ adoxq -24(%rbx),%r11
+ adcxq %r15,%r12
+ mulxq 24(%rsi),%rdx,%r14
+ adoxq -16(%rbx),%r12
+ adcxq %rdx,%r13
+ leaq (%rcx,%rax,1),%rcx
+ leaq 32(%rsi),%rsi
+ adoxq -8(%rbx),%r13
+ adcxq %rbp,%r14
+ adoxq %rbp,%r14
+
+ movq %r8,%r15
+ imulq 32+8(%rsp),%r8
+
+ movq %r8,%rdx
+ xorq %rbp,%rbp
+ movq %rdi,8+8(%rsp)
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%r15
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+ mulxq 16(%rcx),%rax,%r12
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq 24+8(%rsp),%rdi
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r12
+ movq %r11,-24(%rbx)
+ adoxq %rbp,%r15
+ movq %r12,-16(%rbx)
+ leaq 32(%rcx),%rcx
+ jmp .Lmulx4x_inner
+
+.align 32
+.Lmulx4x_inner:
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %rbp,%r15
+ adoxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq 0(%rbx),%r10
+ adoxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq 8(%rbx),%r11
+ adoxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+ movq %r8,%rdx
+ adcxq 16(%rbx),%r12
+ adoxq %rax,%r13
+ adcxq 24(%rbx),%r13
+ adoxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+ adcxq %rbp,%r14
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ adoxq %r15,%r13
+ movq %r11,-32(%rbx)
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ leaq 32(%rcx),%rcx
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz .Lmulx4x_inner
+
+ movq 0+8(%rsp),%rax
+ adcq %rbp,%r15
+ subq 0(%rbx),%rdi
+ movq 8+8(%rsp),%rdi
+ movq 16+8(%rsp),%r10
+ adcq %r15,%r14
+ leaq (%rsi,%rax,1),%rsi
+ adcq %rbp,%rbp
+ movq %r14,-8(%rbx)
+
+ cmpq %r10,%rdi
+ jb .Lmulx4x_outer
+
+ movq -8(%rcx),%r10
+ movq %rbp,%r8
+ movq (%rcx,%rax,1),%r12
+ leaq (%rcx,%rax,1),%rbp
+ movq %rax,%rcx
+ leaq (%rbx,%rax,1),%rdi
+ xorl %eax,%eax
+ xorq %r15,%r15
+ subq %r14,%r10
+ adcq %r15,%r15
+ orq %r15,%r8
+ sarq $3+2,%rcx
+ subq %r8,%rax
+ movq 56+8(%rsp),%rdx
+ decq %r12
+ movq 8(%rbp),%r13
+ xorq %r8,%r8
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqrx4x_sub_entry
+.size mulx4x_internal,.-mulx4x_internal
+.type bn_powerx5,@function
+.align 32
+bn_powerx5:
+.cfi_startproc
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lpowerx5_enter:
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lpowerx5_prologue:
+
+ shll $3,%r9d
+ leaq (%r9,%r9,2),%r10
+ negq %r9
+ movq (%r8),%r8
+
+
+
+
+
+
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ subq %rdi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb .Lpwrx_sp_alt
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
+ jmp .Lpwrx_sp_done
+
+.align 32
+.Lpwrx_sp_alt:
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+.Lpwrx_sp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lpwrx_page_walk
+ jmp .Lpwrx_page_walk_done
+
+.Lpwrx_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lpwrx_page_walk
+.Lpwrx_page_walk_done:
+
+ movq %r9,%r10
+ negq %r9
+
+
+
+
+
+
+
+
+
+
+
+
+ pxor %xmm0,%xmm0
+.byte 102,72,15,110,207
+.byte 102,72,15,110,209
+.byte 102,73,15,110,218
+.byte 102,72,15,110,226
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lpowerx5_body:
+
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+
+ movq %r10,%r9
+ movq %rsi,%rdi
+.byte 102,72,15,126,209
+.byte 102,72,15,126,226
+ movq 40(%rsp),%rax
+
+ call mulx4x_internal
+
+ movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq $1,%rax
+
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpowerx5_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_powerx5,.-bn_powerx5
+
+.globl bn_sqrx8x_internal
+.hidden bn_sqrx8x_internal
+.hidden bn_sqrx8x_internal
+.type bn_sqrx8x_internal,@function
+.align 32
+bn_sqrx8x_internal:
+__bn_sqrx8x_internal:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ leaq 48+8(%rsp),%rdi
+ leaq (%rsi,%r9,1),%rbp
+ movq %r9,0+8(%rsp)
+ movq %rbp,8+8(%rsp)
+ jmp .Lsqr8x_zero_start
+
+.align 32
+.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+.Lsqrx8x_zero:
+.byte 0x3e
+ movdqa %xmm0,0(%rdi)
+ movdqa %xmm0,16(%rdi)
+ movdqa %xmm0,32(%rdi)
+ movdqa %xmm0,48(%rdi)
+.Lsqr8x_zero_start:
+ movdqa %xmm0,64(%rdi)
+ movdqa %xmm0,80(%rdi)
+ movdqa %xmm0,96(%rdi)
+ movdqa %xmm0,112(%rdi)
+ leaq 128(%rdi),%rdi
+ subq $64,%r9
+ jnz .Lsqrx8x_zero
+
+ movq 0(%rsi),%rdx
+
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r12,%r12
+ xorq %r13,%r13
+ xorq %r14,%r14
+ xorq %r15,%r15
+ leaq 48+8(%rsp),%rdi
+ xorq %rbp,%rbp
+ jmp .Lsqrx8x_outer_loop
+
+.align 32
+.Lsqrx8x_outer_loop:
+ mulxq 8(%rsi),%r8,%rax
+ adcxq %r9,%r8
+ adoxq %rax,%r10
+ mulxq 16(%rsi),%r9,%rax
+ adcxq %r10,%r9
+ adoxq %rax,%r11
+.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
+ adcxq %r11,%r10
+ adoxq %rax,%r12
+.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
+ adcxq %r12,%r11
+ adoxq %rax,%r13
+ mulxq 40(%rsi),%r12,%rax
+ adcxq %r13,%r12
+ adoxq %rax,%r14
+ mulxq 48(%rsi),%r13,%rax
+ adcxq %r14,%r13
+ adoxq %r15,%rax
+ mulxq 56(%rsi),%r14,%r15
+ movq 8(%rsi),%rdx
+ adcxq %rax,%r14
+ adoxq %rbp,%r15
+ adcq 64(%rdi),%r15
+ movq %r8,8(%rdi)
+ movq %r9,16(%rdi)
+ sbbq %rcx,%rcx
+ xorq %rbp,%rbp
+
+
+ mulxq 16(%rsi),%r8,%rbx
+ mulxq 24(%rsi),%r9,%rax
+ adcxq %r10,%r8
+ adoxq %rbx,%r9
+ mulxq 32(%rsi),%r10,%rbx
+ adcxq %r11,%r9
+ adoxq %rax,%r10
+.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
+ adcxq %r12,%r10
+ adoxq %rbx,%r11
+.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
+ adcxq %r13,%r11
+ adoxq %r14,%r12
+.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
+ movq 16(%rsi),%rdx
+ adcxq %rax,%r12
+ adoxq %rbx,%r13
+ adcxq %r15,%r13
+ adoxq %rbp,%r14
+ adcxq %rbp,%r14
+
+ movq %r8,24(%rdi)
+ movq %r9,32(%rdi)
+
+ mulxq 24(%rsi),%r8,%rbx
+ mulxq 32(%rsi),%r9,%rax
+ adcxq %r10,%r8
+ adoxq %rbx,%r9
+ mulxq 40(%rsi),%r10,%rbx
+ adcxq %r11,%r9
+ adoxq %rax,%r10
+.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
+ adcxq %r12,%r10
+ adoxq %r13,%r11
+.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
+.byte 0x3e
+ movq 24(%rsi),%rdx
+ adcxq %rbx,%r11
+ adoxq %rax,%r12
+ adcxq %r14,%r12
+ movq %r8,40(%rdi)
+ movq %r9,48(%rdi)
+ mulxq 32(%rsi),%r8,%rax
+ adoxq %rbp,%r13
+ adcxq %rbp,%r13
+
+ mulxq 40(%rsi),%r9,%rbx
+ adcxq %r10,%r8
+ adoxq %rax,%r9
+ mulxq 48(%rsi),%r10,%rax
+ adcxq %r11,%r9
+ adoxq %r12,%r10
+ mulxq 56(%rsi),%r11,%r12
+ movq 32(%rsi),%rdx
+ movq 40(%rsi),%r14
+ adcxq %rbx,%r10
+ adoxq %rax,%r11
+ movq 48(%rsi),%r15
+ adcxq %r13,%r11
+ adoxq %rbp,%r12
+ adcxq %rbp,%r12
+
+ movq %r8,56(%rdi)
+ movq %r9,64(%rdi)
+
+ mulxq %r14,%r9,%rax
+ movq 56(%rsi),%r8
+ adcxq %r10,%r9
+ mulxq %r15,%r10,%rbx
+ adoxq %rax,%r10
+ adcxq %r11,%r10
+ mulxq %r8,%r11,%rax
+ movq %r14,%rdx
+ adoxq %rbx,%r11
+ adcxq %r12,%r11
+
+ adcxq %rbp,%rax
+
+ mulxq %r15,%r14,%rbx
+ mulxq %r8,%r12,%r13
+ movq %r15,%rdx
+ leaq 64(%rsi),%rsi
+ adcxq %r14,%r11
+ adoxq %rbx,%r12
+ adcxq %rax,%r12
+ adoxq %rbp,%r13
+
+.byte 0x67,0x67
+ mulxq %r8,%r8,%r14
+ adcxq %r8,%r13
+ adcxq %rbp,%r14
+
+ cmpq 8+8(%rsp),%rsi
+ je .Lsqrx8x_outer_break
+
+ negq %rcx
+ movq $-8,%rcx
+ movq %rbp,%r15
+ movq 64(%rdi),%r8
+ adcxq 72(%rdi),%r9
+ adcxq 80(%rdi),%r10
+ adcxq 88(%rdi),%r11
+ adcq 96(%rdi),%r12
+ adcq 104(%rdi),%r13
+ adcq 112(%rdi),%r14
+ adcq 120(%rdi),%r15
+ leaq (%rsi),%rbp
+ leaq 128(%rdi),%rdi
+ sbbq %rax,%rax
+
+ movq -64(%rsi),%rdx
+ movq %rax,16+8(%rsp)
+ movq %rdi,24+8(%rsp)
+
+
+ xorl %eax,%eax
+ jmp .Lsqrx8x_loop
+
+.align 32
+.Lsqrx8x_loop:
+ movq %r8,%rbx
+ mulxq 0(%rbp),%rax,%r8
+ adcxq %rax,%rbx
+ adoxq %r9,%r8
+
+ mulxq 8(%rbp),%rax,%r9
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rbp),%rax,%r10
+ adcxq %rax,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rbp),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+
+.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+
+ mulxq 40(%rbp),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+ mulxq 48(%rbp),%rax,%r14
+ movq %rbx,(%rdi,%rcx,8)
+ movl $0,%ebx
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
+ movq 8(%rsi,%rcx,8),%rdx
+ adcxq %rax,%r14
+ adoxq %rbx,%r15
+ adcxq %rbx,%r15
+
+.byte 0x67
+ incq %rcx
+ jnz .Lsqrx8x_loop
+
+ leaq 64(%rbp),%rbp
+ movq $-8,%rcx
+ cmpq 8+8(%rsp),%rbp
+ je .Lsqrx8x_break
+
+ subq 16+8(%rsp),%rbx
+.byte 0x66
+ movq -64(%rsi),%rdx
+ adcxq 0(%rdi),%r8
+ adcxq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ leaq 64(%rdi),%rdi
+.byte 0x67
+ sbbq %rax,%rax
+ xorl %ebx,%ebx
+ movq %rax,16+8(%rsp)
+ jmp .Lsqrx8x_loop
+
+.align 32
+.Lsqrx8x_break:
+ xorq %rbp,%rbp
+ subq 16+8(%rsp),%rbx
+ adcxq %rbp,%r8
+ movq 24+8(%rsp),%rcx
+ adcxq %rbp,%r9
+ movq 0(%rsi),%rdx
+ adcq $0,%r10
+ movq %r8,0(%rdi)
+ adcq $0,%r11
+ adcq $0,%r12
+ adcq $0,%r13
+ adcq $0,%r14
+ adcq $0,%r15
+ cmpq %rcx,%rdi
+ je .Lsqrx8x_outer_loop
+
+ movq %r9,8(%rdi)
+ movq 8(%rcx),%r9
+ movq %r10,16(%rdi)
+ movq 16(%rcx),%r10
+ movq %r11,24(%rdi)
+ movq 24(%rcx),%r11
+ movq %r12,32(%rdi)
+ movq 32(%rcx),%r12
+ movq %r13,40(%rdi)
+ movq 40(%rcx),%r13
+ movq %r14,48(%rdi)
+ movq 48(%rcx),%r14
+ movq %r15,56(%rdi)
+ movq 56(%rcx),%r15
+ movq %rcx,%rdi
+ jmp .Lsqrx8x_outer_loop
+
+.align 32
+.Lsqrx8x_outer_break:
+ movq %r9,72(%rdi)
+.byte 102,72,15,126,217
+ movq %r10,80(%rdi)
+ movq %r11,88(%rdi)
+ movq %r12,96(%rdi)
+ movq %r13,104(%rdi)
+ movq %r14,112(%rdi)
+ leaq 48+8(%rsp),%rdi
+ movq (%rsi,%rcx,1),%rdx
+
+ movq 8(%rdi),%r11
+ xorq %r10,%r10
+ movq 0+8(%rsp),%r9
+ adoxq %r11,%r11
+ movq 16(%rdi),%r12
+ movq 24(%rdi),%r13
+
+
+.align 32
+.Lsqrx4x_shift_n_add:
+ mulxq %rdx,%rax,%rbx
+ adoxq %r12,%r12
+ adcxq %r10,%rax
+.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
+.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
+ adoxq %r13,%r13
+ adcxq %r11,%rbx
+ movq 40(%rdi),%r11
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+
+ mulxq %rdx,%rax,%rbx
+ adoxq %r10,%r10
+ adcxq %r12,%rax
+ movq 16(%rsi,%rcx,1),%rdx
+ movq 48(%rdi),%r12
+ adoxq %r11,%r11
+ adcxq %r13,%rbx
+ movq 56(%rdi),%r13
+ movq %rax,16(%rdi)
+ movq %rbx,24(%rdi)
+
+ mulxq %rdx,%rax,%rbx
+ adoxq %r12,%r12
+ adcxq %r10,%rax
+ movq 24(%rsi,%rcx,1),%rdx
+ leaq 32(%rcx),%rcx
+ movq 64(%rdi),%r10
+ adoxq %r13,%r13
+ adcxq %r11,%rbx
+ movq 72(%rdi),%r11
+ movq %rax,32(%rdi)
+ movq %rbx,40(%rdi)
+
+ mulxq %rdx,%rax,%rbx
+ adoxq %r10,%r10
+ adcxq %r12,%rax
+ jrcxz .Lsqrx4x_shift_n_add_break
+.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
+ adoxq %r11,%r11
+ adcxq %r13,%rbx
+ movq 80(%rdi),%r12
+ movq 88(%rdi),%r13
+ movq %rax,48(%rdi)
+ movq %rbx,56(%rdi)
+ leaq 64(%rdi),%rdi
+ nop
+ jmp .Lsqrx4x_shift_n_add
+
+.align 32
+.Lsqrx4x_shift_n_add_break:
+ adcxq %r13,%rbx
+ movq %rax,48(%rdi)
+ movq %rbx,56(%rdi)
+ leaq 64(%rdi),%rdi
+.byte 102,72,15,126,213
+__bn_sqrx8x_reduction:
+ xorl %eax,%eax
+ movq 32+8(%rsp),%rbx
+ movq 48+8(%rsp),%rdx
+ leaq -64(%rbp,%r9,1),%rcx
+
+ movq %rcx,0+8(%rsp)
+ movq %rdi,8+8(%rsp)
+
+ leaq 48+8(%rsp),%rdi
+ jmp .Lsqrx8x_reduction_loop
+
+.align 32
+.Lsqrx8x_reduction_loop:
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r12
+ movq %rdx,%r8
+ imulq %rbx,%rdx
+ movq 40(%rdi),%r13
+ movq 48(%rdi),%r14
+ movq 56(%rdi),%r15
+ movq %rax,24+8(%rsp)
+
+ leaq 64(%rdi),%rdi
+ xorq %rsi,%rsi
+ movq $-8,%rcx
+ jmp .Lsqrx8x_reduce
+
+.align 32
+.Lsqrx8x_reduce:
+ movq %r8,%rbx
+ mulxq 0(%rbp),%rax,%r8
+ adcxq %rbx,%rax
+ adoxq %r9,%r8
+
+ mulxq 8(%rbp),%rbx,%r9
+ adcxq %rbx,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rbp),%rbx,%r10
+ adcxq %rbx,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rbp),%rbx,%r11
+ adcxq %rbx,%r10
+ adoxq %r12,%r11
+
+.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
+ movq %rdx,%rax
+ movq %r8,%rdx
+ adcxq %rbx,%r11
+ adoxq %r13,%r12
+
+ mulxq 32+8(%rsp),%rbx,%rdx
+ movq %rax,%rdx
+ movq %rax,64+48+8(%rsp,%rcx,8)
+
+ mulxq 40(%rbp),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+ mulxq 48(%rbp),%rax,%r14
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+ mulxq 56(%rbp),%rax,%r15
+ movq %rbx,%rdx
+ adcxq %rax,%r14
+ adoxq %rsi,%r15
+ adcxq %rsi,%r15
+
+.byte 0x67,0x67,0x67
+ incq %rcx
+ jnz .Lsqrx8x_reduce
+
+ movq %rsi,%rax
+ cmpq 0+8(%rsp),%rbp
+ jae .Lsqrx8x_no_tail
+
+ movq 48+8(%rsp),%rdx
+ addq 0(%rdi),%r8
+ leaq 64(%rbp),%rbp
+ movq $-8,%rcx
+ adcxq 8(%rdi),%r9
+ adcxq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ leaq 64(%rdi),%rdi
+ sbbq %rax,%rax
+
+ xorq %rsi,%rsi
+ movq %rax,16+8(%rsp)
+ jmp .Lsqrx8x_tail
+
+.align 32
+.Lsqrx8x_tail:
+ movq %r8,%rbx
+ mulxq 0(%rbp),%rax,%r8
+ adcxq %rax,%rbx
+ adoxq %r9,%r8
+
+ mulxq 8(%rbp),%rax,%r9
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rbp),%rax,%r10
+ adcxq %rax,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rbp),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+
+.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+
+ mulxq 40(%rbp),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+ mulxq 48(%rbp),%rax,%r14
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+ mulxq 56(%rbp),%rax,%r15
+ movq 72+48+8(%rsp,%rcx,8),%rdx
+ adcxq %rax,%r14
+ adoxq %rsi,%r15
+ movq %rbx,(%rdi,%rcx,8)
+ movq %r8,%rbx
+ adcxq %rsi,%r15
+
+ incq %rcx
+ jnz .Lsqrx8x_tail
+
+ cmpq 0+8(%rsp),%rbp
+ jae .Lsqrx8x_tail_done
+
+ subq 16+8(%rsp),%rsi
+ movq 48+8(%rsp),%rdx
+ leaq 64(%rbp),%rbp
+ adcq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ leaq 64(%rdi),%rdi
+ sbbq %rax,%rax
+ subq $8,%rcx
+
+ xorq %rsi,%rsi
+ movq %rax,16+8(%rsp)
+ jmp .Lsqrx8x_tail
+
+.align 32
+.Lsqrx8x_tail_done:
+ xorq %rax,%rax
+ addq 24+8(%rsp),%r8
+ adcq $0,%r9
+ adcq $0,%r10
+ adcq $0,%r11
+ adcq $0,%r12
+ adcq $0,%r13
+ adcq $0,%r14
+ adcq $0,%r15
+ adcq $0,%rax
+
+ subq 16+8(%rsp),%rsi
+.Lsqrx8x_no_tail:
+ adcq 0(%rdi),%r8
+.byte 102,72,15,126,217
+ adcq 8(%rdi),%r9
+ movq 56(%rbp),%rsi
+.byte 102,72,15,126,213
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ adcq $0,%rax
+
+ movq 32+8(%rsp),%rbx
+ movq 64(%rdi,%rcx,1),%rdx
+
+ movq %r8,0(%rdi)
+ leaq 64(%rdi),%r8
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+
+ leaq 64(%rdi,%rcx,1),%rdi
+ cmpq 8+8(%rsp),%r8
+ jb .Lsqrx8x_reduction_loop
+ .byte 0xf3,0xc3
+.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
+.align 32
+__bn_postx4x_internal:
+ movq 0(%rbp),%r12
+ movq %rcx,%r10
+ movq %rcx,%r9
+ negq %rax
+ sarq $3+2,%rcx
+
+.byte 102,72,15,126,202
+.byte 102,72,15,126,206
+ decq %r12
+ movq 8(%rbp),%r13
+ xorq %r8,%r8
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqrx4x_sub_entry
+
+.align 16
+.Lsqrx4x_sub:
+ movq 0(%rbp),%r12
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+.Lsqrx4x_sub_entry:
+ andnq %rax,%r12,%r12
+ leaq 32(%rbp),%rbp
+ andnq %rax,%r13,%r13
+ andnq %rax,%r14,%r14
+ andnq %rax,%r15,%r15
+
+ negq %r8
+ adcq 0(%rdi),%r12
+ adcq 8(%rdi),%r13
+ adcq 16(%rdi),%r14
+ adcq 24(%rdi),%r15
+ movq %r12,0(%rdx)
+ leaq 32(%rdi),%rdi
+ movq %r13,8(%rdx)
+ sbbq %r8,%r8
+ movq %r14,16(%rdx)
+ movq %r15,24(%rdx)
+ leaq 32(%rdx),%rdx
+
+ incq %rcx
+ jnz .Lsqrx4x_sub
+
+ negq %r9
+
+ .byte 0xf3,0xc3
+.size __bn_postx4x_internal,.-__bn_postx4x_internal
.globl bn_scatter5
.hidden bn_scatter5
.type bn_scatter5,@function