author | Adam Vartanian <flooey@google.com> | 2018-08-10 14:55:24 +0100
committer | Adam Vartanian <flooey@google.com> | 2018-08-10 15:32:39 +0100
commit | bfcf3a72c0bcb62cfde80e932db0668a7f96c0f8 (patch)
tree | 3ab110046e45c25f8f69df137f8a9d4a52bf60b5 /linux-x86_64
parent | 6120b2b9dfbe5b19fe2d0f69a4b8418ad5f7fe08 (diff)
download | boringssl-bfcf3a72c0bcb62cfde80e932db0668a7f96c0f8.tar.gz
external/boringssl: Sync to 8625ec4b436ccb4098ed4aac10891eff8372be41.
This includes the following changes:
https://boringssl.googlesource.com/boringssl/+log/c596415ec62b501523d80f9afa26b135406da6bf..8625ec4b436ccb4098ed4aac10891eff8372be41
Test: cts -m CtsLibcoreTestCases
Change-Id: I47a45e6b6f46b19fcbcb6c917895867d56dcd2ca
Diffstat (limited to 'linux-x86_64')
-rw-r--r-- | linux-x86_64/crypto/fipsmodule/aes-x86_64.S | 56
-rw-r--r-- | linux-x86_64/crypto/fipsmodule/aesni-x86_64.S | 130
-rw-r--r-- | linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S | 34
-rw-r--r-- | linux-x86_64/crypto/fipsmodule/ghash-x86_64.S | 41
-rw-r--r-- | linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S | 1914
-rw-r--r-- | linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S | 4
-rw-r--r-- | linux-x86_64/crypto/fipsmodule/rsaz-avx2.S | 5
-rw-r--r-- | linux-x86_64/crypto/fipsmodule/sha1-x86_64.S | 44
-rw-r--r-- | linux-x86_64/crypto/fipsmodule/sha256-x86_64.S | 66
-rw-r--r-- | linux-x86_64/crypto/fipsmodule/sha512-x86_64.S | 66
-rw-r--r-- | linux-x86_64/crypto/fipsmodule/x86_64-mont.S | 454
-rw-r--r-- | linux-x86_64/crypto/fipsmodule/x86_64-mont5.S | 1380
12 files changed, 4028 insertions(+), 166 deletions(-)
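The bulk of this diff renames the AES assembly entry points: the table-based asm_AES_* routines become aes_nohw_*, and the AES-NI aesni_* routines become aes_hw_*, with bsaes-x86_64.S updating its .extern references and tail calls to match. It also adds .cfi_* unwind annotations to the GHASH, SHA-1, SHA-256, SHA-512, and RDRAND routines, plus new BMI2/ADX (mulx/adcx/adox) code paths in the P-256 assembly. As a rough illustration of how the renamed symbols pair up at the C level, the sketch below shows caller-side dispatch between the two implementations; the prototypes, the hwaes_capable() helper, and the exact feature-bit test are assumptions made for illustration, not the library's actual internal API.

```c
/*
 * Illustrative sketch only -- not BoringSSL's dispatch code.
 * Prototypes are assumed to mirror AES_encrypt(); the real declarations
 * live in the library's internal headers.
 */
#include <stdint.h>

typedef struct aes_key_st AES_KEY;  /* opaque here; defined by the library */

/* Renamed assembly entry points from this sync
 * (see aes-x86_64.S and aesni-x86_64.S above). */
void aes_hw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);

/* Capability vector set up at library init; word 1 holds CPUID.1:ECX,
 * whose bit 25 is the AES-NI flag. Declared extern only to keep the
 * sketch self-contained. */
extern uint32_t OPENSSL_ia32cap_P[4];

static int hwaes_capable(void) {
  return (OPENSSL_ia32cap_P[1] & (1u << 25)) != 0;
}

/* Hypothetical caller-side dispatch between the two entry points. */
static void aes_encrypt_block(const uint8_t in[16], uint8_t out[16],
                              const AES_KEY *key) {
  if (hwaes_capable()) {
    aes_hw_encrypt(in, out, key);    /* AES-NI path (was aesni_encrypt) */
  } else {
    aes_nohw_encrypt(in, out, key);  /* table-based path (was asm_AES_encrypt) */
  }
}
```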
diff --git a/linux-x86_64/crypto/fipsmodule/aes-x86_64.S b/linux-x86_64/crypto/fipsmodule/aes-x86_64.S index 0928c170..65bcbf48 100644 --- a/linux-x86_64/crypto/fipsmodule/aes-x86_64.S +++ b/linux-x86_64/crypto/fipsmodule/aes-x86_64.S @@ -327,11 +327,11 @@ _x86_64_AES_encrypt_compact: .byte 0xf3,0xc3 .size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact .align 16 -.globl asm_AES_encrypt -.hidden asm_AES_encrypt -.type asm_AES_encrypt,@function -.hidden asm_AES_encrypt -asm_AES_encrypt: +.globl aes_nohw_encrypt +.hidden aes_nohw_encrypt +.type aes_nohw_encrypt,@function +.hidden aes_nohw_encrypt +aes_nohw_encrypt: .cfi_startproc movq %rsp,%rax .cfi_def_cfa_register %rax @@ -409,7 +409,7 @@ asm_AES_encrypt: .Lenc_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size asm_AES_encrypt,.-asm_AES_encrypt +.size aes_nohw_encrypt,.-aes_nohw_encrypt .type _x86_64_AES_decrypt,@function .align 16 _x86_64_AES_decrypt: @@ -791,11 +791,11 @@ _x86_64_AES_decrypt_compact: .byte 0xf3,0xc3 .size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact .align 16 -.globl asm_AES_decrypt -.hidden asm_AES_decrypt -.type asm_AES_decrypt,@function -.hidden asm_AES_decrypt -asm_AES_decrypt: +.globl aes_nohw_decrypt +.hidden aes_nohw_decrypt +.type aes_nohw_decrypt,@function +.hidden aes_nohw_decrypt +aes_nohw_decrypt: .cfi_startproc movq %rsp,%rax .cfi_def_cfa_register %rax @@ -875,12 +875,12 @@ asm_AES_decrypt: .Ldec_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size asm_AES_decrypt,.-asm_AES_decrypt +.size aes_nohw_decrypt,.-aes_nohw_decrypt .align 16 -.globl asm_AES_set_encrypt_key -.hidden asm_AES_set_encrypt_key -.type asm_AES_set_encrypt_key,@function -asm_AES_set_encrypt_key: +.globl aes_nohw_set_encrypt_key +.hidden aes_nohw_set_encrypt_key +.type aes_nohw_set_encrypt_key,@function +aes_nohw_set_encrypt_key: .cfi_startproc pushq %rbx .cfi_adjust_cfa_offset 8 @@ -915,7 +915,7 @@ asm_AES_set_encrypt_key: .Lenc_key_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size asm_AES_set_encrypt_key,.-asm_AES_set_encrypt_key +.size aes_nohw_set_encrypt_key,.-aes_nohw_set_encrypt_key .type _x86_64_AES_set_encrypt_key,@function .align 16 @@ -1157,10 +1157,10 @@ _x86_64_AES_set_encrypt_key: .byte 0xf3,0xc3 .size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key .align 16 -.globl asm_AES_set_decrypt_key -.hidden asm_AES_set_decrypt_key -.type asm_AES_set_decrypt_key,@function -asm_AES_set_decrypt_key: +.globl aes_nohw_set_decrypt_key +.hidden aes_nohw_set_decrypt_key +.type aes_nohw_set_decrypt_key,@function +aes_nohw_set_decrypt_key: .cfi_startproc pushq %rbx .cfi_adjust_cfa_offset 8 @@ -1365,15 +1365,15 @@ asm_AES_set_decrypt_key: .Ldec_key_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size asm_AES_set_decrypt_key,.-asm_AES_set_decrypt_key +.size aes_nohw_set_decrypt_key,.-aes_nohw_set_decrypt_key .align 16 -.globl asm_AES_cbc_encrypt -.hidden asm_AES_cbc_encrypt -.type asm_AES_cbc_encrypt,@function +.globl aes_nohw_cbc_encrypt +.hidden aes_nohw_cbc_encrypt +.type aes_nohw_cbc_encrypt,@function .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P -.hidden asm_AES_cbc_encrypt -asm_AES_cbc_encrypt: +.hidden aes_nohw_cbc_encrypt +aes_nohw_cbc_encrypt: .cfi_startproc cmpq $0,%rdx je .Lcbc_epilogue @@ -1850,7 +1850,7 @@ asm_AES_cbc_encrypt: .Lcbc_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size asm_AES_cbc_encrypt,.-asm_AES_cbc_encrypt +.size aes_nohw_cbc_encrypt,.-aes_nohw_cbc_encrypt .align 64 .LAES_Te: .long 0xa56363c6,0xa56363c6 diff --git a/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S 
b/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S index 53d594ec..fabf92be 100644 --- a/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S +++ b/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S @@ -2,11 +2,11 @@ .text .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P -.globl aesni_encrypt -.hidden aesni_encrypt -.type aesni_encrypt,@function +.globl aes_hw_encrypt +.hidden aes_hw_encrypt +.type aes_hw_encrypt,@function .align 16 -aesni_encrypt: +aes_hw_encrypt: movups (%rdi),%xmm2 movl 240(%rdx),%eax movups (%rdx),%xmm0 @@ -25,13 +25,13 @@ aesni_encrypt: movups %xmm2,(%rsi) pxor %xmm2,%xmm2 .byte 0xf3,0xc3 -.size aesni_encrypt,.-aesni_encrypt +.size aes_hw_encrypt,.-aes_hw_encrypt -.globl aesni_decrypt -.hidden aesni_decrypt -.type aesni_decrypt,@function +.globl aes_hw_decrypt +.hidden aes_hw_decrypt +.type aes_hw_decrypt,@function .align 16 -aesni_decrypt: +aes_hw_decrypt: movups (%rdi),%xmm2 movl 240(%rdx),%eax movups (%rdx),%xmm0 @@ -50,7 +50,7 @@ aesni_decrypt: movups %xmm2,(%rsi) pxor %xmm2,%xmm2 .byte 0xf3,0xc3 -.size aesni_decrypt, .-aesni_decrypt +.size aes_hw_decrypt, .-aes_hw_decrypt .type _aesni_encrypt2,@function .align 16 _aesni_encrypt2: @@ -493,11 +493,11 @@ _aesni_decrypt8: .byte 102,68,15,56,223,200 .byte 0xf3,0xc3 .size _aesni_decrypt8,.-_aesni_decrypt8 -.globl aesni_ecb_encrypt -.hidden aesni_ecb_encrypt -.type aesni_ecb_encrypt,@function +.globl aes_hw_ecb_encrypt +.hidden aes_hw_ecb_encrypt +.type aes_hw_ecb_encrypt,@function .align 16 -aesni_ecb_encrypt: +aes_hw_ecb_encrypt: andq $-16,%rdx jz .Lecb_ret @@ -835,12 +835,12 @@ aesni_ecb_encrypt: xorps %xmm0,%xmm0 pxor %xmm1,%xmm1 .byte 0xf3,0xc3 -.size aesni_ecb_encrypt,.-aesni_ecb_encrypt -.globl aesni_ccm64_encrypt_blocks -.hidden aesni_ccm64_encrypt_blocks -.type aesni_ccm64_encrypt_blocks,@function +.size aes_hw_ecb_encrypt,.-aes_hw_ecb_encrypt +.globl aes_hw_ccm64_encrypt_blocks +.hidden aes_hw_ccm64_encrypt_blocks +.type aes_hw_ccm64_encrypt_blocks,@function .align 16 -aesni_ccm64_encrypt_blocks: +aes_hw_ccm64_encrypt_blocks: movl 240(%rcx),%eax movdqu (%r8),%xmm6 movdqa .Lincrement64(%rip),%xmm9 @@ -899,12 +899,12 @@ aesni_ccm64_encrypt_blocks: pxor %xmm8,%xmm8 pxor %xmm6,%xmm6 .byte 0xf3,0xc3 -.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks -.globl aesni_ccm64_decrypt_blocks -.hidden aesni_ccm64_decrypt_blocks -.type aesni_ccm64_decrypt_blocks,@function +.size aes_hw_ccm64_encrypt_blocks,.-aes_hw_ccm64_encrypt_blocks +.globl aes_hw_ccm64_decrypt_blocks +.hidden aes_hw_ccm64_decrypt_blocks +.type aes_hw_ccm64_decrypt_blocks,@function .align 16 -aesni_ccm64_decrypt_blocks: +aes_hw_ccm64_decrypt_blocks: movl 240(%rcx),%eax movups (%r8),%xmm6 movdqu (%r9),%xmm3 @@ -997,12 +997,12 @@ aesni_ccm64_decrypt_blocks: pxor %xmm8,%xmm8 pxor %xmm6,%xmm6 .byte 0xf3,0xc3 -.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks -.globl aesni_ctr32_encrypt_blocks -.hidden aesni_ctr32_encrypt_blocks -.type aesni_ctr32_encrypt_blocks,@function +.size aes_hw_ccm64_decrypt_blocks,.-aes_hw_ccm64_decrypt_blocks +.globl aes_hw_ctr32_encrypt_blocks +.hidden aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,@function .align 16 -aesni_ctr32_encrypt_blocks: +aes_hw_ctr32_encrypt_blocks: .cfi_startproc cmpq $1,%rdx jne .Lctr32_bulk @@ -1577,12 +1577,12 @@ aesni_ctr32_encrypt_blocks: .Lctr32_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks -.globl aesni_xts_encrypt -.hidden aesni_xts_encrypt -.type aesni_xts_encrypt,@function +.size 
aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks +.globl aes_hw_xts_encrypt +.hidden aes_hw_xts_encrypt +.type aes_hw_xts_encrypt,@function .align 16 -aesni_xts_encrypt: +aes_hw_xts_encrypt: .cfi_startproc leaq (%rsp),%r11 .cfi_def_cfa_register %r11 @@ -2048,12 +2048,12 @@ aesni_xts_encrypt: .Lxts_enc_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size aesni_xts_encrypt,.-aesni_xts_encrypt -.globl aesni_xts_decrypt -.hidden aesni_xts_decrypt -.type aesni_xts_decrypt,@function +.size aes_hw_xts_encrypt,.-aes_hw_xts_encrypt +.globl aes_hw_xts_decrypt +.hidden aes_hw_xts_decrypt +.type aes_hw_xts_decrypt,@function .align 16 -aesni_xts_decrypt: +aes_hw_xts_decrypt: .cfi_startproc leaq (%rsp),%r11 .cfi_def_cfa_register %r11 @@ -2556,12 +2556,12 @@ aesni_xts_decrypt: .Lxts_dec_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size aesni_xts_decrypt,.-aesni_xts_decrypt -.globl aesni_ocb_encrypt -.hidden aesni_ocb_encrypt -.type aesni_ocb_encrypt,@function +.size aes_hw_xts_decrypt,.-aes_hw_xts_decrypt +.globl aes_hw_ocb_encrypt +.hidden aes_hw_ocb_encrypt +.type aes_hw_ocb_encrypt,@function .align 32 -aesni_ocb_encrypt: +aes_hw_ocb_encrypt: .cfi_startproc leaq (%rsp),%rax pushq %rbx @@ -2771,7 +2771,7 @@ aesni_ocb_encrypt: .Locb_enc_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size aesni_ocb_encrypt,.-aesni_ocb_encrypt +.size aes_hw_ocb_encrypt,.-aes_hw_ocb_encrypt .type __ocb_encrypt6,@function .align 32 @@ -2979,11 +2979,11 @@ __ocb_encrypt1: .byte 0xf3,0xc3 .size __ocb_encrypt1,.-__ocb_encrypt1 -.globl aesni_ocb_decrypt -.hidden aesni_ocb_decrypt -.type aesni_ocb_decrypt,@function +.globl aes_hw_ocb_decrypt +.hidden aes_hw_ocb_decrypt +.type aes_hw_ocb_decrypt,@function .align 32 -aesni_ocb_decrypt: +aes_hw_ocb_decrypt: .cfi_startproc leaq (%rsp),%rax pushq %rbx @@ -3215,7 +3215,7 @@ aesni_ocb_decrypt: .Locb_dec_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size aesni_ocb_decrypt,.-aesni_ocb_decrypt +.size aes_hw_ocb_decrypt,.-aes_hw_ocb_decrypt .type __ocb_decrypt6,@function .align 32 @@ -3411,11 +3411,11 @@ __ocb_decrypt1: .byte 102,15,56,223,215 .byte 0xf3,0xc3 .size __ocb_decrypt1,.-__ocb_decrypt1 -.globl aesni_cbc_encrypt -.hidden aesni_cbc_encrypt -.type aesni_cbc_encrypt,@function +.globl aes_hw_cbc_encrypt +.hidden aes_hw_cbc_encrypt +.type aes_hw_cbc_encrypt,@function .align 16 -aesni_cbc_encrypt: +aes_hw_cbc_encrypt: .cfi_startproc testq %rdx,%rdx jz .Lcbc_ret @@ -4003,12 +4003,12 @@ aesni_cbc_encrypt: .Lcbc_ret: .byte 0xf3,0xc3 .cfi_endproc -.size aesni_cbc_encrypt,.-aesni_cbc_encrypt -.globl aesni_set_decrypt_key -.hidden aesni_set_decrypt_key -.type aesni_set_decrypt_key,@function +.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt +.globl aes_hw_set_decrypt_key +.hidden aes_hw_set_decrypt_key +.type aes_hw_set_decrypt_key,@function .align 16 -aesni_set_decrypt_key: +aes_hw_set_decrypt_key: .cfi_startproc .byte 0x48,0x83,0xEC,0x08 .cfi_adjust_cfa_offset 8 @@ -4048,12 +4048,12 @@ aesni_set_decrypt_key: .byte 0xf3,0xc3 .cfi_endproc .LSEH_end_set_decrypt_key: -.size aesni_set_decrypt_key,.-aesni_set_decrypt_key -.globl aesni_set_encrypt_key -.hidden aesni_set_encrypt_key -.type aesni_set_encrypt_key,@function +.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key +.globl aes_hw_set_encrypt_key +.hidden aes_hw_set_encrypt_key +.type aes_hw_set_encrypt_key,@function .align 16 -aesni_set_encrypt_key: +aes_hw_set_encrypt_key: __aesni_set_encrypt_key: .cfi_startproc .byte 0x48,0x83,0xEC,0x08 @@ -4424,7 +4424,7 @@ __aesni_set_encrypt_key: shufps $170,%xmm1,%xmm1 xorps %xmm1,%xmm2 .byte 0xf3,0xc3 -.size 
aesni_set_encrypt_key,.-aesni_set_encrypt_key +.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key .align 64 .Lbswap_mask: diff --git a/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S index 7c293efe..eb5c8cbf 100644 --- a/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S +++ b/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S @@ -1,10 +1,10 @@ #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text -.extern asm_AES_encrypt -.hidden asm_AES_encrypt -.extern asm_AES_decrypt -.hidden asm_AES_decrypt +.extern aes_nohw_encrypt +.hidden aes_nohw_encrypt +.extern aes_nohw_decrypt +.hidden aes_nohw_decrypt .type _bsaes_encrypt8,@function .align 64 @@ -1065,8 +1065,8 @@ _bsaes_key_convert: .byte 0xf3,0xc3 .size _bsaes_key_convert,.-_bsaes_key_convert -.extern asm_AES_cbc_encrypt -.hidden asm_AES_cbc_encrypt +.extern aes_nohw_cbc_encrypt +.hidden aes_nohw_cbc_encrypt .globl bsaes_cbc_encrypt .hidden bsaes_cbc_encrypt .type bsaes_cbc_encrypt,@function @@ -1074,9 +1074,9 @@ _bsaes_key_convert: bsaes_cbc_encrypt: .cfi_startproc cmpl $0,%r9d - jne asm_AES_cbc_encrypt + jne aes_nohw_cbc_encrypt cmpq $128,%rdx - jb asm_AES_cbc_encrypt + jb aes_nohw_cbc_encrypt movq %rsp,%rax .Lcbc_dec_prologue: @@ -1304,7 +1304,7 @@ bsaes_cbc_encrypt: leaq (%r12),%rdi leaq 32(%rbp),%rsi leaq (%r15),%rdx - call asm_AES_decrypt + call aes_nohw_decrypt pxor 32(%rbp),%xmm14 movdqu %xmm14,(%r13) movdqa %xmm15,%xmm14 @@ -1520,7 +1520,7 @@ bsaes_ctr32_encrypt_blocks: leaq 32(%rbp),%rdi leaq 48(%rbp),%rsi leaq (%r15),%rdx - call asm_AES_encrypt + call aes_nohw_encrypt movdqu (%r12),%xmm0 leaq 16(%r12),%r12 movl 44(%rbp),%eax @@ -1602,7 +1602,7 @@ bsaes_xts_encrypt: leaq (%r9),%rdi leaq 32(%rbp),%rsi leaq (%r8),%rdx - call asm_AES_encrypt + call aes_nohw_encrypt movl 240(%r15),%eax movq %r14,%rbx @@ -1972,7 +1972,7 @@ bsaes_xts_encrypt: leaq 32(%rbp),%rdi leaq 32(%rbp),%rsi leaq (%r15),%rdx - call asm_AES_encrypt + call aes_nohw_encrypt pxor 32(%rbp),%xmm15 @@ -2005,7 +2005,7 @@ bsaes_xts_encrypt: leaq 32(%rbp),%rsi movdqa %xmm15,32(%rbp) leaq (%r15),%rdx - call asm_AES_encrypt + call aes_nohw_encrypt pxor 32(%rbp),%xmm6 movdqu %xmm6,-16(%r13) @@ -2077,7 +2077,7 @@ bsaes_xts_decrypt: leaq (%r9),%rdi leaq 32(%rbp),%rsi leaq (%r8),%rdx - call asm_AES_encrypt + call aes_nohw_encrypt movl 240(%r15),%eax movq %r14,%rbx @@ -2454,7 +2454,7 @@ bsaes_xts_decrypt: leaq 32(%rbp),%rdi leaq 32(%rbp),%rsi leaq (%r15),%rdx - call asm_AES_decrypt + call aes_nohw_decrypt pxor 32(%rbp),%xmm15 @@ -2485,7 +2485,7 @@ bsaes_xts_decrypt: leaq 32(%rbp),%rsi movdqa %xmm15,32(%rbp) leaq (%r15),%rdx - call asm_AES_decrypt + call aes_nohw_decrypt pxor 32(%rbp),%xmm6 movq %r13,%rdx movdqu %xmm6,(%r13) @@ -2506,7 +2506,7 @@ bsaes_xts_decrypt: leaq 32(%rbp),%rsi movdqa %xmm15,32(%rbp) leaq (%r15),%rdx - call asm_AES_decrypt + call aes_nohw_decrypt pxor 32(%rbp),%xmm5 movdqu %xmm5,(%r13) diff --git a/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S b/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S index 64ef2c2d..574d55a5 100644 --- a/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S +++ b/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S @@ -8,13 +8,27 @@ .type gcm_gmult_4bit,@function .align 16 gcm_gmult_4bit: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $280,%rsp +.cfi_adjust_cfa_offset 280 .Lgmult_prologue: movzbq 15(%rdi),%r8 @@ -92,23 +106,41 @@ gcm_gmult_4bit: movq %r9,(%rdi) leaq 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lgmult_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size gcm_gmult_4bit,.-gcm_gmult_4bit .globl gcm_ghash_4bit .hidden gcm_ghash_4bit .type gcm_ghash_4bit,@function .align 16 gcm_ghash_4bit: +.cfi_startproc pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 subq $280,%rsp +.cfi_adjust_cfa_offset 280 .Lghash_prologue: movq %rdx,%r14 movq %rcx,%r15 @@ -654,15 +686,24 @@ gcm_ghash_4bit: movq %r9,(%rdi) leaq 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq 0(%rsi),%rsp +.cfi_def_cfa_register %rsp .Lghash_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size gcm_ghash_4bit,.-gcm_ghash_4bit .globl gcm_init_clmul .hidden gcm_init_clmul diff --git a/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S b/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S index 7422d2ac..46a94838 100644 --- a/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S +++ b/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S @@ -93,6 +93,11 @@ ecp_nistz256_neg: .align 32 ecp_nistz256_ord_mul_mont: .cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lecp_nistz256_ord_mul_montx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -422,6 +427,11 @@ ecp_nistz256_ord_mul_mont: .align 32 ecp_nistz256_ord_sqr_mont: .cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lecp_nistz256_ord_sqr_montx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -709,6 +719,462 @@ ecp_nistz256_ord_sqr_mont: .cfi_endproc .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_mul_montx,@function +.align 32 +ecp_nistz256_ord_mul_montx: +.cfi_startproc +.Lecp_nistz256_ord_mul_montx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_mulx_body: + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + leaq .Lord-128(%rip),%r14 + movq .LordK(%rip),%r15 + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + mulxq %r11,%rbp,%r11 + addq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + mulxq %r15,%rdx,%rax + adcq %rbp,%r10 + adcq %rcx,%r11 + adcq $0,%r12 + + + xorq %r13,%r13 + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 
16+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%r14),%rcx,%rbp + movq 8(%rbx),%rdx + adcxq %rcx,%r11 + adoxq %rbp,%r12 + adcxq %r8,%r12 + adoxq %r8,%r13 + adcq $0,%r13 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%r14),%rcx,%rbp + movq 16(%rbx),%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcxq %r9,%r13 + adoxq %r9,%r8 + adcq $0,%r8 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%r14),%rcx,%rbp + movq 24(%rbx),%rdx + adcxq %rcx,%r13 + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcq $0,%r9 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%r14),%rcx,%rbp + leaq 128(%r14),%r14 + movq %r12,%rbx + adcxq %rcx,%r8 + adoxq %rbp,%r9 + movq %r13,%rdx + adcxq %r11,%r9 + adoxq %r11,%r10 + adcq $0,%r10 + + + + movq %r8,%rcx + subq 0(%r14),%r12 + sbbq 8(%r14),%r13 + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mulx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx + +.type ecp_nistz256_ord_sqr_montx,@function +.align 32 +ecp_nistz256_ord_sqr_montx: +.cfi_startproc +.Lecp_nistz256_ord_sqr_montx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 
+.Lord_sqrx_body: + + movq %rdx,%rbx + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq .Lord(%rip),%rsi + jmp .Loop_ord_sqrx + +.align 32 +.Loop_ord_sqrx: + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + movq %rdx,%rax +.byte 102,73,15,110,206 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + addq %rcx,%r10 +.byte 102,73,15,110,215 + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + mulxq %r8,%rcx,%r14 + movq %rax,%rdx +.byte 102,73,15,110,216 + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + + mulxq %rdx,%r8,%rbp +.byte 102,72,15,126,202 + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax +.byte 102,72,15,126,210 + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 + mulxq %rdx,%rcx,%rbp +.byte 0x67 +.byte 102,72,15,126,218 + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + adoxq %rbp,%r13 + mulxq %rdx,%rcx,%rax + adoxq %rcx,%r14 + adoxq %rax,%r15 + + + movq %r8,%rdx + mulxq 32(%rsi),%rdx,%rcx + + xorq %rax,%rax + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + adcxq %rax,%r8 + + + movq %r9,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + adoxq %rax,%r9 + + + movq %r10,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + adcxq %rax,%r10 + + + movq %r11,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + adoxq %rax,%r11 + + + addq %r8,%r12 + adcq %r13,%r9 + movq %r12,%rdx + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%r14 + adcq $0,%rax + + + subq 0(%rsi),%r12 + movq %r10,%r15 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r8 + sbbq 24(%rsi),%r11 + sbbq $0,%rax + + cmovncq %r12,%rdx + cmovncq %r9,%r14 + cmovncq %r10,%r15 + cmovncq %r11,%r8 + + decq %rbx + jnz .Loop_ord_sqrx + + movq %rdx,0(%rdi) + movq %r14,8(%rdi) + pxor %xmm1,%xmm1 + movq %r15,16(%rdi) + pxor %xmm2,%xmm2 + movq %r8,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqrx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx + @@ -720,6 +1186,9 @@ ecp_nistz256_ord_sqr_mont: .align 32 ecp_nistz256_mul_mont: .cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx .Lmul_mont: pushq %rbp .cfi_adjust_cfa_offset 8 @@ -740,6 +1209,8 @@ 
ecp_nistz256_mul_mont: .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lmul_body: + cmpl $0x80100,%ecx + je .Lmul_montx movq %rdx,%rbx movq 0(%rdx),%rax movq 0(%rsi),%r9 @@ -748,6 +1219,19 @@ ecp_nistz256_mul_mont: movq 24(%rsi),%r12 call __ecp_nistz256_mul_montq + jmp .Lmul_mont_done + +.align 32 +.Lmul_montx: + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_mul_montx .Lmul_mont_done: movq 0(%rsp),%r15 .cfi_restore %r15 @@ -997,6 +1481,9 @@ __ecp_nistz256_mul_montq: .align 32 ecp_nistz256_sqr_mont: .cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1016,12 +1503,25 @@ ecp_nistz256_sqr_mont: .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lsqr_body: + cmpl $0x80100,%ecx + je .Lsqr_montx movq 0(%rsi),%rax movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 call __ecp_nistz256_sqr_montq + jmp .Lsqr_mont_done + +.align 32 +.Lsqr_montx: + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_sqr_montx .Lsqr_mont_done: movq 0(%rsp),%r15 .cfi_restore %r15 @@ -1203,6 +1703,300 @@ __ecp_nistz256_sqr_montq: .byte 0xf3,0xc3 .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq +.type __ecp_nistz256_mul_montx,@function +.align 32 +__ecp_nistz256_mul_montx: + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + movq $32,%r14 + xorq %r13,%r13 + mulxq %r11,%rbp,%r11 + movq .Lpoly+24(%rip),%r15 + adcq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + adcq %rbp,%r10 + shlxq %r14,%r8,%rbp + adcq %rcx,%r11 + shrxq %r14,%r8,%rcx + adcq $0,%r12 + + + + addq %rbp,%r9 + adcq %rcx,%r10 + + mulxq %r15,%rcx,%rbp + movq 8(%rbx),%rdx + adcq %rcx,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + adcxq %rcx,%r12 + shlxq %r14,%r9,%rcx + adoxq %rbp,%r13 + shrxq %r14,%r9,%rbp + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + + addq %rcx,%r10 + adcq %rbp,%r11 + + mulxq %r15,%rcx,%rbp + movq 16(%rbx),%rdx + adcq %rcx,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + adcxq %rcx,%r13 + shlxq %r14,%r10,%rcx + adoxq %rbp,%r8 + shrxq %r14,%r10,%rbp + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + + addq %rcx,%r11 + adcq %rbp,%r12 + + mulxq %r15,%rcx,%rbp + movq 24(%rbx),%rdx + adcq %rcx,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + adcxq %rcx,%r8 + shlxq %r14,%r11,%rcx + adoxq %rbp,%r9 + shrxq %r14,%r11,%rbp + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + + addq %rcx,%r12 + adcq %rbp,%r13 + + mulxq %r15,%rcx,%rbp + movq %r12,%rbx + movq .Lpoly+8(%rip),%r14 + adcq %rcx,%r8 + movq %r13,%rdx + adcq %rbp,%r9 + adcq $0,%r10 + + + + xorl %eax,%eax + movq 
%r8,%rcx + sbbq $-1,%r12 + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rbp + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %rbp,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx + +.type __ecp_nistz256_sqr_montx,@function +.align 32 +__ecp_nistz256_sqr_montx: + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + xorl %eax,%eax + adcq %rcx,%r10 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + + mulxq %r8,%rcx,%r14 + movq 0+128(%rsi),%rdx + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + mulxq %rdx,%r8,%rbp + movq 8+128(%rsi),%rdx + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax + movq 16+128(%rsi),%rdx + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 +.byte 0x67 + mulxq %rdx,%rcx,%rbp + movq 24+128(%rsi),%rdx + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + movq $32,%rsi + adoxq %rbp,%r13 +.byte 0x67,0x67 + mulxq %rdx,%rcx,%rax + movq .Lpoly+24(%rip),%rdx + adoxq %rcx,%r14 + shlxq %rsi,%r8,%rcx + adoxq %rax,%r15 + shrxq %rsi,%r8,%rax + movq %rdx,%rbp + + + addq %rcx,%r9 + adcq %rax,%r10 + + mulxq %r8,%rcx,%r8 + adcq %rcx,%r11 + shlxq %rsi,%r9,%rcx + adcq $0,%r8 + shrxq %rsi,%r9,%rax + + + addq %rcx,%r10 + adcq %rax,%r11 + + mulxq %r9,%rcx,%r9 + adcq %rcx,%r8 + shlxq %rsi,%r10,%rcx + adcq $0,%r9 + shrxq %rsi,%r10,%rax + + + addq %rcx,%r11 + adcq %rax,%r8 + + mulxq %r10,%rcx,%r10 + adcq %rcx,%r9 + shlxq %rsi,%r11,%rcx + adcq $0,%r10 + shrxq %rsi,%r11,%rax + + + addq %rcx,%r8 + adcq %rax,%r9 + + mulxq %r11,%rcx,%r11 + adcq %rcx,%r10 + adcq $0,%r11 + + xorq %rdx,%rdx + addq %r8,%r12 + movq .Lpoly+8(%rip),%rsi + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %r11,%r15 + movq %r13,%r9 + adcq $0,%rdx + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%r11 + sbbq %rbp,%r15 + sbbq $0,%rdx + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %r11,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx .globl ecp_nistz256_select_w5 @@ -1588,6 +2382,11 @@ __ecp_nistz256_mul_by_2q: .align 32 ecp_nistz256_point_double: .cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lpoint_doublex pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1817,6 +2616,11 @@ ecp_nistz256_point_double: .align 32 ecp_nistz256_point_add: .cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lpoint_addx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2240,6 +3044,11 @@ ecp_nistz256_point_add: .align 32 ecp_nistz256_point_add_affine: .cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%rcx + movq 8(%rcx),%rcx + andl $0x80100,%ecx + cmpl $0x80100,%ecx + je .Lpoint_add_affinex pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2563,4 +3372,1109 @@ ecp_nistz256_point_add_affine: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +.type __ecp_nistz256_add_tox,@function +.align 32 +__ecp_nistz256_add_tox: + xorq %r11,%r11 + adcq 0(%rbx),%r12 + adcq 
8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox + +.type __ecp_nistz256_sub_fromx,@function +.align 32 +__ecp_nistz256_sub_fromx: + xorq %r11,%r11 + sbbq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq $0,%r11 + + xorq %r10,%r10 + adcq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + + btq $0,%r11 + cmovncq %rax,%r12 + cmovncq %rbp,%r13 + movq %r12,0(%rdi) + cmovncq %rcx,%r8 + movq %r13,8(%rdi) + cmovncq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx + +.type __ecp_nistz256_subx,@function +.align 32 +__ecp_nistz256_subx: + xorq %r11,%r11 + sbbq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq $0,%r11 + + xorq %r9,%r9 + adcq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + + btq $0,%r11 + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + cmovcq %rcx,%r8 + cmovcq %r10,%r9 + + .byte 0xf3,0xc3 +.size __ecp_nistz256_subx,.-__ecp_nistz256_subx + +.type __ecp_nistz256_mul_by_2x,@function +.align 32 +__ecp_nistz256_mul_by_2x: + xorq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x +.type ecp_nistz256_point_doublex,@function +.align 32 +ecp_nistz256_point_doublex: +.cfi_startproc +.Lpoint_doublex: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $160+8,%rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_doublex_body: + +.Lpoint_double_shortcutx: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq .Lpoly+8(%rip),%r14 + movq .Lpoly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-128(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 32(%rbx),%rdx + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 
64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-128(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montx + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rdx + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 0+32(%rsp),%rdx + movq 8+32(%rsp),%r14 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montx + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subx + + movq 32(%rsp),%rdx + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-128(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromx + + leaq 160+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_doublex_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex +.type ecp_nistz256_point_addx,@function +.align 32 +ecp_nistz256_point_addx: +.cfi_startproc +.Lpoint_addx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + 
pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $576+8,%rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_addx_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + + leaq 64-128(%rsi),%rsi + movq %rdx,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rdx + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 + + leaq 64-128(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 416(%rsp),%rdx + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 512(%rsp),%rdx + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq -128+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 480(%rsp),%rdx + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 0x3e + jnz .Ladd_proceedx +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + testq %r8,%r8 + jnz .Ladd_proceedx + testq %r9,%r9 + jz .Ladd_doublex + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu 
%xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp .Ladd_donex + +.align 32 +.Ladd_doublex: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp + jmp .Lpoint_double_shortcutx + +.align 32 +.Ladd_proceedx: + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq -128+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%rdx + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 160(%rsp),%rdx + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 
+ pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +.Ladd_donex: + leaq 576+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_addx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx +.type ecp_nistz256_point_add_affinex,@function +.align 32 +ecp_nistz256_point_add_affinex: +.cfi_startproc +.Lpoint_add_affinex: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $480+8,%rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affinex_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-128(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rdx + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-128(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 
8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+96(%rsp),%rdx + movq 8+96(%rsp),%r14 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq -128+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rdx + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq -128+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand .LONE_mont(%rip),%xmm2 + pand .LONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 
+ pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affinex_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex #endif diff --git a/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S b/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S index 7c1eeb72..86fa94c0 100644 --- a/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S +++ b/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S @@ -9,6 +9,7 @@ .type CRYPTO_rdrand,@function .align 16 CRYPTO_rdrand: +.cfi_startproc xorq %rax,%rax @@ -17,6 +18,7 @@ CRYPTO_rdrand: adcq %rax,%rax movq %rcx,0(%rdi) .byte 0xf3,0xc3 +.cfi_endproc @@ -27,6 +29,7 @@ CRYPTO_rdrand: .type CRYPTO_rdrand_multiple8_buf,@function .align 16 CRYPTO_rdrand_multiple8_buf: +.cfi_startproc testq %rsi,%rsi jz .Lout movq $8,%rdx @@ -45,4 +48,5 @@ CRYPTO_rdrand_multiple8_buf: .Lerr: xorq %rax,%rax .byte 0xf3,0xc3 +.cfi_endproc #endif diff --git a/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S b/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S index 89b81ed8..e6db7f6e 100644 --- a/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S +++ b/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S @@ -1724,6 +1724,11 @@ rsaz_1024_gather5_avx2: rsaz_avx2_eligible: leaq OPENSSL_ia32cap_P(%rip),%rax movl 8(%rax),%eax + movl $524544,%ecx + movl $0,%edx + andl %eax,%ecx + cmpl $524544,%ecx + cmovel %edx,%eax andl $32,%eax shrl $5,%eax .byte 0xf3,0xc3 diff --git a/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S b/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S index 7f924dcc..186ac61a 100644 --- a/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S +++ b/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S @@ -8,6 +8,7 @@ .type sha1_block_data_order,@function .align 16 sha1_block_data_order: +.cfi_startproc leaq OPENSSL_ia32cap_P(%rip),%r10 movl 0(%r10),%r9d movl 4(%r10),%r8d @@ -24,17 +25,24 @@ sha1_block_data_order: .align 16 .Lialu: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 movq %rdi,%r8 subq $72,%rsp movq %rsi,%r9 andq $-64,%rsp movq %rdx,%r10 movq %rax,64(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xc0,0x00,0x06,0x23,0x08 .Lprologue: movl 0(%r8),%esi @@ -1229,25 +1237,40 @@ sha1_block_data_order: jnz .Lloop movq 64(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_block_data_order,.-sha1_block_data_order .type sha1_block_data_order_ssse3,@function .align 16 sha1_block_data_order_ssse3: _ssse3_shortcut: +.cfi_startproc movq %rsp,%r11 +.cfi_def_cfa_register %r11 pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 leaq -64(%rsp),%rsp andq $-64,%rsp movq %rdi,%r8 
@@ -2404,24 +2427,38 @@ _ssse3_shortcut: movl %edx,12(%r8) movl %ebp,16(%r8) movq -40(%r11),%r14 +.cfi_restore %r14 movq -32(%r11),%r13 +.cfi_restore %r13 movq -24(%r11),%r12 +.cfi_restore %r12 movq -16(%r11),%rbp +.cfi_restore %rbp movq -8(%r11),%rbx +.cfi_restore %rbx leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_ssse3: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 .type sha1_block_data_order_avx,@function .align 16 sha1_block_data_order_avx: _avx_shortcut: +.cfi_startproc movq %rsp,%r11 +.cfi_def_cfa_register %r11 pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 leaq -64(%rsp),%rsp vzeroupper andq $-64,%rsp @@ -3518,13 +3555,20 @@ _avx_shortcut: movl %edx,12(%r8) movl %ebp,16(%r8) movq -40(%r11),%r14 +.cfi_restore %r14 movq -32(%r11),%r13 +.cfi_restore %r13 movq -24(%r11),%r12 +.cfi_restore %r12 movq -16(%r11),%rbp +.cfi_restore %rbp movq -8(%r11),%rbx +.cfi_restore %rbx leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: .byte 0xf3,0xc3 +.cfi_endproc .size sha1_block_data_order_avx,.-sha1_block_data_order_avx .align 64 K_XX_XX: diff --git a/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S b/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S index 62534be4..175f218f 100644 --- a/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S +++ b/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S @@ -8,6 +8,7 @@ .type sha256_block_data_order,@function .align 16 sha256_block_data_order: +.cfi_startproc leaq OPENSSL_ia32cap_P(%rip),%r11 movl 0(%r11),%r9d movl 4(%r11),%r10d @@ -20,12 +21,19 @@ sha256_block_data_order: testl $512,%r10d jnz .Lssse3_shortcut movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 shlq $4,%rdx subq $64+32,%rsp leaq (%rsi,%rdx,4),%rdx @@ -33,7 +41,8 @@ sha256_block_data_order: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %rax,64+24(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 .Lprologue: movl 0(%rdi),%eax @@ -1697,16 +1706,25 @@ sha256_block_data_order: movl %r11d,28(%rdi) jb .Lloop - movq 64+24(%rsp),%rsi + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size sha256_block_data_order,.-sha256_block_data_order .align 64 .type K256,@object @@ -1754,14 +1772,22 @@ K256: .type sha256_block_data_order_ssse3,@function .align 64 sha256_block_data_order_ssse3: +.cfi_startproc .Lssse3_shortcut: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -1769,7 +1795,8 @@ sha256_block_data_order_ssse3: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %rax,64+24(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 .Lprologue_ssse3: movl 0(%rdi),%eax @@ -2835,28 +2862,45 @@ 
sha256_block_data_order_ssse3: movl %r11d,28(%rdi) jb .Lloop_ssse3 - movq 64+24(%rsp),%rsi + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_ssse3: .byte 0xf3,0xc3 +.cfi_endproc .size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 .type sha256_block_data_order_avx,@function .align 64 sha256_block_data_order_avx: +.cfi_startproc .Lavx_shortcut: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -2864,7 +2908,8 @@ sha256_block_data_order_avx: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %rax,64+24(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 .Lprologue_avx: vzeroupper @@ -3891,16 +3936,25 @@ sha256_block_data_order_avx: movl %r11d,28(%rdi) jb .Lloop_avx - movq 64+24(%rsp),%rsi + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 vzeroupper movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: .byte 0xf3,0xc3 +.cfi_endproc .size sha256_block_data_order_avx,.-sha256_block_data_order_avx #endif diff --git a/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S b/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S index 1f1793bb..be198bca 100644 --- a/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S +++ b/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S @@ -8,6 +8,7 @@ .type sha512_block_data_order,@function .align 16 sha512_block_data_order: +.cfi_startproc leaq OPENSSL_ia32cap_P(%rip),%r11 movl 0(%r11),%r9d movl 4(%r11),%r10d @@ -20,12 +21,19 @@ sha512_block_data_order: cmpl $1342177792,%r10d je .Lavx_shortcut movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 shlq $4,%rdx subq $128+32,%rsp leaq (%rsi,%rdx,8),%rdx @@ -33,7 +41,8 @@ sha512_block_data_order: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %rax,128+24(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 .Lprologue: movq 0(%rdi),%rax @@ -1697,16 +1706,25 @@ sha512_block_data_order: movq %r11,56(%rdi) jb .Lloop - movq 128+24(%rsp),%rsi + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 +.cfi_endproc .size sha512_block_data_order,.-sha512_block_data_order .align 64 .type K512,@object @@ -1798,14 +1816,22 @@ K512: .type sha512_block_data_order_xop,@function .align 64 sha512_block_data_order_xop: +.cfi_startproc .Lxop_shortcut: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 
pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx @@ -1813,7 +1839,8 @@ sha512_block_data_order_xop: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %rax,128+24(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 .Lprologue_xop: vzeroupper @@ -2866,29 +2893,46 @@ sha512_block_data_order_xop: movq %r11,56(%rdi) jb .Lloop_xop - movq 128+24(%rsp),%rsi + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 vzeroupper movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_xop: .byte 0xf3,0xc3 +.cfi_endproc .size sha512_block_data_order_xop,.-sha512_block_data_order_xop .type sha512_block_data_order_avx,@function .align 64 sha512_block_data_order_avx: +.cfi_startproc .Lavx_shortcut: movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx @@ -2896,7 +2940,8 @@ sha512_block_data_order_avx: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %rax,128+24(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 .Lprologue_avx: vzeroupper @@ -4013,16 +4058,25 @@ sha512_block_data_order_avx: movq %r11,56(%rdi) jb .Lloop_avx - movq 128+24(%rsp),%rsi + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 vzeroupper movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: .byte 0xf3,0xc3 +.cfi_endproc .size sha512_block_data_order_avx,.-sha512_block_data_order_avx #endif diff --git a/linux-x86_64/crypto/fipsmodule/x86_64-mont.S b/linux-x86_64/crypto/fipsmodule/x86_64-mont.S index 1f673ef8..3d867cd1 100644 --- a/linux-x86_64/crypto/fipsmodule/x86_64-mont.S +++ b/linux-x86_64/crypto/fipsmodule/x86_64-mont.S @@ -17,6 +17,8 @@ bn_mul_mont: jnz .Lmul_enter cmpl $8,%r9d jb .Lmul_enter + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d cmpq %rsi,%rdx jne .Lmul4x_enter testl $7,%r9d @@ -208,30 +210,30 @@ bn_mul_mont: xorq %r14,%r14 movq (%rsp),%rax - leaq (%rsp),%rsi movq %r9,%r15 - jmp .Lsub + .align 16 .Lsub: sbbq (%rcx,%r14,8),%rax movq %rax,(%rdi,%r14,8) - movq 8(%rsi,%r14,8),%rax + movq 8(%rsp,%r14,8),%rax leaq 1(%r14),%r14 decq %r15 jnz .Lsub sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx xorq %r14,%r14 - andq %rax,%rsi - notq %rax - movq %rdi,%rcx - andq %rax,%rcx movq %r9,%r15 - orq %rcx,%rsi -.align 16 + .Lcopy: - movq (%rsi,%r14,8),%rax - movq %r14,(%rsp,%r14,8) - movq %rax,(%rdi,%r14,8) + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx + movq %r9,(%rsp,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz .Lcopy @@ -265,6 +267,9 @@ bn_mul4x_mont: movq %rsp,%rax .cfi_def_cfa_register %rax .Lmul4x_enter: + andl $0x80100,%r11d + cmpl $0x80100,%r11d + je .Lmulx4x_enter pushq %rbx .cfi_offset %rbx,-16 
pushq %rbp @@ -602,7 +607,6 @@ bn_mul4x_mont: movq 16(%rsp,%r9,8),%rdi leaq -4(%r9),%r15 movq 0(%rsp),%rax - pxor %xmm0,%xmm0 movq 8(%rsp),%rdx shrq $2,%r15 leaq (%rsp),%rsi @@ -612,8 +616,7 @@ bn_mul4x_mont: movq 16(%rsi),%rbx movq 24(%rsi),%rbp sbbq 8(%rcx),%rdx - jmp .Lsub4x -.align 16 + .Lsub4x: movq %rax,0(%rdi,%r14,8) movq %rdx,8(%rdi,%r14,8) @@ -640,34 +643,35 @@ bn_mul4x_mont: sbbq $0,%rax movq %rbp,24(%rdi,%r14,8) - xorq %r14,%r14 - andq %rax,%rsi - notq %rax - movq %rdi,%rcx - andq %rax,%rcx - leaq -4(%r9),%r15 - orq %rcx,%rsi + pxor %xmm0,%xmm0 +.byte 102,72,15,110,224 + pcmpeqd %xmm5,%xmm5 + pshufd $0,%xmm4,%xmm4 + movq %r9,%r15 + pxor %xmm4,%xmm5 shrq $2,%r15 + xorl %eax,%eax - movdqu (%rsi),%xmm1 - movdqa %xmm0,(%rsp) - movdqu %xmm1,(%rdi) jmp .Lcopy4x .align 16 .Lcopy4x: - movdqu 16(%rsi,%r14,1),%xmm2 - movdqu 32(%rsi,%r14,1),%xmm1 - movdqa %xmm0,16(%rsp,%r14,1) - movdqu %xmm2,16(%rdi,%r14,1) - movdqa %xmm0,32(%rsp,%r14,1) - movdqu %xmm1,32(%rdi,%r14,1) - leaq 32(%r14),%r14 + movdqa (%rsp,%rax,1),%xmm1 + movdqu (%rdi,%rax,1),%xmm2 + pand %xmm4,%xmm1 + pand %xmm5,%xmm2 + movdqa 16(%rsp,%rax,1),%xmm3 + movdqa %xmm0,(%rsp,%rax,1) + por %xmm2,%xmm1 + movdqu 16(%rdi,%rax,1),%xmm2 + movdqu %xmm1,(%rdi,%rax,1) + pand %xmm4,%xmm3 + pand %xmm5,%xmm2 + movdqa %xmm0,16(%rsp,%rax,1) + por %xmm2,%xmm3 + movdqu %xmm3,16(%rdi,%rax,1) + leaq 32(%rax),%rax decq %r15 jnz .Lcopy4x - - movdqu 16(%rsi,%r14,1),%xmm2 - movdqa %xmm0,16(%rsp,%r14,1) - movdqu %xmm2,16(%rdi,%r14,1) movq 8(%rsp,%r9,8),%rsi .cfi_def_cfa %rsi, 8 movq $1,%rax @@ -689,6 +693,8 @@ bn_mul4x_mont: .byte 0xf3,0xc3 .cfi_endproc .size bn_mul4x_mont,.-bn_mul4x_mont +.extern bn_sqrx8x_internal +.hidden bn_sqrx8x_internal .extern bn_sqr8x_internal .hidden bn_sqr8x_internal @@ -773,6 +779,26 @@ bn_sqr8x_mont: pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,73,15,110,218 + leaq OPENSSL_ia32cap_P(%rip),%rax + movl 8(%rax),%eax + andl $0x80100,%eax + cmpl $0x80100,%eax + jne .Lsqr8x_nox + + call bn_sqrx8x_internal + + + + + leaq (%r8,%rcx,1),%rbx + movq %rcx,%r9 + movq %rcx,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp .Lsqr8x_sub + +.align 32 +.Lsqr8x_nox: call bn_sqr8x_internal @@ -860,6 +886,362 @@ bn_sqr8x_mont: .byte 0xf3,0xc3 .cfi_endproc .size bn_sqr8x_mont,.-bn_sqr8x_mont +.type bn_mulx4x_mont,@function +.align 32 +bn_mulx4x_mont: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lmulx4x_enter: + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lmulx4x_prologue: + + shll $3,%r9d + xorq %r10,%r10 + subq %r9,%r10 + movq (%r8),%r8 + leaq -72(%rsp,%r10,1),%rbp + andq $-128,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.align 16 +.Lmulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + leaq (%rdx,%r9,1),%r10 + + + + + + + + + + + + + movq %r9,0(%rsp) + shrq $5,%r9 + movq %r10,16(%rsp) + subq $1,%r9 + movq %r8,24(%rsp) + movq %rdi,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 + movq %r9,48(%rsp) + jmp .Lmulx4x_body + +.align 32 +.Lmulx4x_body: + leaq 8(%rdx),%rdi + movq (%rdx),%rdx + leaq 64+32(%rsp),%rbx + movq %rdx,%r9 + + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r14 + addq %rax,%r11 + movq %rdi,8(%rsp) + mulxq 
16(%rsi),%r12,%r13 + adcq %r14,%r12 + adcq $0,%r13 + + movq %r8,%rdi + imulq 24(%rsp),%r8 + xorq %rbp,%rbp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%rdi + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 +.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + movq 48(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_1st + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + addq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + movq (%rdi),%rdx + leaq 8(%rdi),%rdi + subq %rax,%rsi + movq %r15,(%rbx) + leaq 64+32(%rsp),%rbx + subq %rax,%rcx + + mulxq 0(%rsi),%r8,%r11 + xorl %ebp,%ebp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + adoxq -16(%rbx),%r12 + adcxq %rbp,%r13 + adoxq %rbp,%r13 + + movq %rdi,8(%rsp) + movq %r8,%r15 + imulq 24(%rsp),%r8 + xorl %ebp,%ebp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + adcxq %rax,%r13 + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + adoxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + leaq 32(%rcx),%rcx + adcxq %rax,%r12 + adoxq %rbp,%r15 + movq 48(%rsp),%rdi + movq %r12,-16(%rbx) + + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-32(%rbx) + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_inner + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + subq 0(%rbx),%rbp + adcq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + + cmpq 16(%rsp),%rdi + jne .Lmulx4x_outer + + leaq 
64(%rsp),%rbx + subq %rax,%rcx + negq %r15 + movq %rax,%rdx + shrq $3+2,%rax + movq 32(%rsp),%rdi + jmp .Lmulx4x_sub + +.align 32 +.Lmulx4x_sub: + movq 0(%rbx),%r11 + movq 8(%rbx),%r12 + movq 16(%rbx),%r13 + movq 24(%rbx),%r14 + leaq 32(%rbx),%rbx + sbbq 0(%rcx),%r11 + sbbq 8(%rcx),%r12 + sbbq 16(%rcx),%r13 + sbbq 24(%rcx),%r14 + leaq 32(%rcx),%rcx + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + movq %r14,24(%rdi) + leaq 32(%rdi),%rdi + decq %rax + jnz .Lmulx4x_sub + + sbbq $0,%r15 + leaq 64(%rsp),%rbx + subq %rdx,%rdi + +.byte 102,73,15,110,207 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + jmp .Lmulx4x_cond_copy + +.align 32 +.Lmulx4x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + subq $32,%rdx + jnz .Lmulx4x_cond_copy + + movq %rdx,(%rbx) + + movq $1,%rax + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_mulx4x_mont,.-bn_mulx4x_mont .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 #endif diff --git a/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S b/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S index 1ec58ca0..bc4e2747 100644 --- a/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S +++ b/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S @@ -15,6 +15,8 @@ bn_mul_mont_gather5: .cfi_def_cfa_register %rax testl $7,%r9d jnz .Lmul_enter + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d jmp .Lmul4x_enter .align 16 @@ -404,18 +406,19 @@ bn_mul_mont_gather5: jnz .Lsub sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx xorq %r14,%r14 - andq %rax,%rsi - notq %rax - movq %rdi,%rcx - andq %rax,%rcx movq %r9,%r15 - orq %rcx,%rsi -.align 16 + .Lcopy: - movq (%rsi,%r14,8),%rax + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx movq %r14,(%rsp,%r14,8) - movq %rax,(%rdi,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz .Lcopy @@ -450,6 +453,9 @@ bn_mul4x_mont_gather5: movq %rsp,%rax .cfi_def_cfa_register %rax .Lmul4x_enter: + andl $0x80108,%r11d + cmpl $0x80108,%r11d + je .Lmulx4x_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -1078,6 +1084,11 @@ bn_power5: .cfi_startproc movq %rsp,%rax .cfi_def_cfa_register %rax + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d + andl $0x80108,%r11d + cmpl $0x80108,%r11d + je .Lpowerx5_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp @@ -2163,6 +2174,22 @@ bn_from_mont8x: .byte 0x67 movq %rcx,%rbp .byte 102,73,15,110,218 + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 8(%r11),%r11d + andl $0x80108,%r11d + cmpl $0x80108,%r11d + jne .Lfrom_mont_nox + + leaq (%rax,%r9,1),%rdi + call __bn_sqrx8x_reduction + call __bn_postx4x_internal + + pxor %xmm0,%xmm0 + leaq 48(%rsp),%rax + jmp 
.Lfrom_mont_zero + +.align 32 +.Lfrom_mont_nox: call __bn_sqr8x_reduction call __bn_post4x_internal @@ -2201,6 +2228,1343 @@ bn_from_mont8x: .byte 0xf3,0xc3 .cfi_endproc .size bn_from_mont8x,.-bn_from_mont8x +.type bn_mulx4x_mont_gather5,@function +.align 32 +bn_mulx4x_mont_gather5: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lmulx4x_enter: + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lmulx4x_prologue: + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lmulx4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lmulx4xsp_done + +.Lmulx4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lmulx4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.Lmulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + + + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lmulx4x_body: + call mulx4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 + +.type mulx4x_internal,@function +.align 32 +mulx4x_internal: + movq %r9,8(%rsp) + movq %r9,%r10 + negq %r9 + shlq $5,%r9 + negq %r10 + leaq 128(%rdx,%r9,1),%r13 + shrq $5+5,%r9 + movd 8(%rax),%xmm5 + subq $1,%r9 + leaq .Linc(%rip),%rax + movq %r13,16+8(%rsp) + movq %r9,24+8(%rsp) + movq %rdi,56+8(%rsp) + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r10,1),%r10 + leaq 128(%rdx),%rdi + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67 + movdqa %xmm1,%xmm2 +.byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 
+ movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 +.byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + + pand 64(%rdi),%xmm0 + pand 80(%rdi),%xmm1 + pand 96(%rdi),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%rdi),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%rdi),%xmm4 + movdqa -112(%rdi),%xmm5 + movdqa -96(%rdi),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%rdi),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%rdi),%xmm4 + movdqa -48(%rdi),%xmm5 + movdqa -32(%rdi),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%rdi),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%rdi),%xmm4 + movdqa 16(%rdi),%xmm5 + movdqa 32(%rdi),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%rdi),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + pxor %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + leaq 64+32+8(%rsp),%rbx + + movq %rdx,%r9 + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r12 + addq %rax,%r11 + mulxq 16(%rsi),%rax,%r13 + adcq %rax,%r12 + adcq $0,%r13 + mulxq 24(%rsi),%rax,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + xorq %rbp,%rbp + movq %r8,%rdx + + movq %rdi,8+8(%rsp) + + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_1st + + movq 8(%rsp),%rax + adcq %rbp,%r15 + leaq (%rsi,%rax,1),%rsi + addq %r15,%r14 + movq 8+8(%rsp),%rdi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + leaq 16-256(%rbx),%r10 + pxor %xmm4,%xmm4 +.byte 0x67,0x67 + pxor %xmm5,%xmm5 + movdqa -128(%rdi),%xmm0 + movdqa -112(%rdi),%xmm1 + movdqa -96(%rdi),%xmm2 + pand 256(%r10),%xmm0 + movdqa -80(%rdi),%xmm3 + pand 272(%r10),%xmm1 + por %xmm0,%xmm4 + pand 288(%r10),%xmm2 + por %xmm1,%xmm5 + pand 304(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%rdi),%xmm0 + movdqa -48(%rdi),%xmm1 + movdqa -32(%rdi),%xmm2 + pand 320(%r10),%xmm0 + movdqa -16(%rdi),%xmm3 + pand 336(%r10),%xmm1 + por %xmm0,%xmm4 + pand 352(%r10),%xmm2 + por %xmm1,%xmm5 + pand 
368(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%rdi),%xmm0 + movdqa 16(%rdi),%xmm1 + movdqa 32(%rdi),%xmm2 + pand 384(%r10),%xmm0 + movdqa 48(%rdi),%xmm3 + pand 400(%r10),%xmm1 + por %xmm0,%xmm4 + pand 416(%r10),%xmm2 + por %xmm1,%xmm5 + pand 432(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%rdi),%xmm0 + movdqa 80(%rdi),%xmm1 + movdqa 96(%rdi),%xmm2 + pand 448(%r10),%xmm0 + movdqa 112(%rdi),%xmm3 + pand 464(%r10),%xmm1 + por %xmm0,%xmm4 + pand 480(%r10),%xmm2 + por %xmm1,%xmm5 + pand 496(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + + movq %rbp,(%rbx) + leaq 32(%rbx,%rax,1),%rbx + mulxq 0(%rsi),%r8,%r11 + xorq %rbp,%rbp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + mulxq 24(%rsi),%rdx,%r14 + adoxq -16(%rbx),%r12 + adcxq %rdx,%r13 + leaq (%rcx,%rax,1),%rcx + leaq 32(%rsi),%rsi + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + adoxq %rbp,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + + movq %r8,%rdx + xorq %rbp,%rbp + movq %rdi,8+8(%rsp) + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r12 + movq %r11,-24(%rbx) + adoxq %rbp,%r15 + movq %r12,-16(%rbx) + leaq 32(%rcx),%rcx + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + movq %r11,-32(%rbx) + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + leaq 32(%rcx),%rcx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_inner + + movq 0+8(%rsp),%rax + adcq %rbp,%r15 + subq 0(%rbx),%rdi + movq 8+8(%rsp),%rdi + movq 16+8(%rsp),%r10 + adcq %r15,%r14 + leaq (%rsi,%rax,1),%rsi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + + cmpq %r10,%rdi + jb .Lmulx4x_outer + + movq -8(%rcx),%r10 + movq %rbp,%r8 + movq (%rcx,%rax,1),%r12 + leaq (%rcx,%rax,1),%rbp + movq %rax,%rcx + leaq (%rbx,%rax,1),%rdi + xorl %eax,%eax + xorq %r15,%r15 + subq %r14,%r10 + adcq %r15,%r15 + orq %r15,%r8 + sarq $3+2,%rcx + subq %r8,%rax + movq 56+8(%rsp),%rdx + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqrx4x_sub_entry +.size mulx4x_internal,.-mulx4x_internal +.type bn_powerx5,@function +.align 32 +bn_powerx5: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lpowerx5_enter: + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lpowerx5_prologue: + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq 
-320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lpwrx_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lpwrx_sp_done + +.align 32 +.Lpwrx_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lpwrx_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwrx_page_walk + jmp .Lpwrx_page_walk_done + +.Lpwrx_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwrx_page_walk +.Lpwrx_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + + + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lpowerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + movq %r10,%r9 + movq %rsi,%rdi +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq 40(%rsp),%rax + + call mulx4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpowerx5_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_powerx5,.-bn_powerx5 + +.globl bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.type bn_sqrx8x_internal,@function +.align 32 +bn_sqrx8x_internal: +__bn_sqrx8x_internal: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 48+8(%rsp),%rdi + leaq (%rsi,%r9,1),%rbp + movq %r9,0+8(%rsp) + movq %rbp,8+8(%rsp) + jmp .Lsqr8x_zero_start + +.align 32 +.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +.Lsqrx8x_zero: +.byte 0x3e + movdqa %xmm0,0(%rdi) + movdqa %xmm0,16(%rdi) + movdqa %xmm0,32(%rdi) + movdqa %xmm0,48(%rdi) +.Lsqr8x_zero_start: + movdqa %xmm0,64(%rdi) + movdqa %xmm0,80(%rdi) + movdqa %xmm0,96(%rdi) + movdqa %xmm0,112(%rdi) + leaq 128(%rdi),%rdi + subq $64,%r9 + jnz .Lsqrx8x_zero + + movq 0(%rsi),%rdx + + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + leaq 48+8(%rsp),%rdi + xorq %rbp,%rbp + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_loop: + mulxq 8(%rsi),%r8,%rax + adcxq %r9,%r8 + adoxq %rax,%r10 + mulxq 16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 +.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcxq %r11,%r10 + adoxq %rax,%r12 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcxq %r12,%r11 + adoxq %rax,%r13 + mulxq 40(%rsi),%r12,%rax + adcxq %r13,%r12 + adoxq %rax,%r14 + mulxq 48(%rsi),%r13,%rax + adcxq %r14,%r13 + adoxq %r15,%rax + mulxq 56(%rsi),%r14,%r15 + movq 8(%rsi),%rdx + adcxq %rax,%r14 + adoxq %rbp,%r15 + adcq 64(%rdi),%r15 + movq %r8,8(%rdi) + movq %r9,16(%rdi) + sbbq %rcx,%rcx + xorq %rbp,%rbp + + + mulxq 16(%rsi),%r8,%rbx + mulxq 24(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 32(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 
0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %rbx,%r11 +.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcxq %r13,%r11 + adoxq %r14,%r12 +.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + movq 16(%rsi),%rdx + adcxq %rax,%r12 + adoxq %rbx,%r13 + adcxq %r15,%r13 + adoxq %rbp,%r14 + adcxq %rbp,%r14 + + movq %r8,24(%rdi) + movq %r9,32(%rdi) + + mulxq 24(%rsi),%r8,%rbx + mulxq 32(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 40(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %r13,%r11 +.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 +.byte 0x3e + movq 24(%rsi),%rdx + adcxq %rbx,%r11 + adoxq %rax,%r12 + adcxq %r14,%r12 + movq %r8,40(%rdi) + movq %r9,48(%rdi) + mulxq 32(%rsi),%r8,%rax + adoxq %rbp,%r13 + adcxq %rbp,%r13 + + mulxq 40(%rsi),%r9,%rbx + adcxq %r10,%r8 + adoxq %rax,%r9 + mulxq 48(%rsi),%r10,%rax + adcxq %r11,%r9 + adoxq %r12,%r10 + mulxq 56(%rsi),%r11,%r12 + movq 32(%rsi),%rdx + movq 40(%rsi),%r14 + adcxq %rbx,%r10 + adoxq %rax,%r11 + movq 48(%rsi),%r15 + adcxq %r13,%r11 + adoxq %rbp,%r12 + adcxq %rbp,%r12 + + movq %r8,56(%rdi) + movq %r9,64(%rdi) + + mulxq %r14,%r9,%rax + movq 56(%rsi),%r8 + adcxq %r10,%r9 + mulxq %r15,%r10,%rbx + adoxq %rax,%r10 + adcxq %r11,%r10 + mulxq %r8,%r11,%rax + movq %r14,%rdx + adoxq %rbx,%r11 + adcxq %r12,%r11 + + adcxq %rbp,%rax + + mulxq %r15,%r14,%rbx + mulxq %r8,%r12,%r13 + movq %r15,%rdx + leaq 64(%rsi),%rsi + adcxq %r14,%r11 + adoxq %rbx,%r12 + adcxq %rax,%r12 + adoxq %rbp,%r13 + +.byte 0x67,0x67 + mulxq %r8,%r8,%r14 + adcxq %r8,%r13 + adcxq %rbp,%r14 + + cmpq 8+8(%rsp),%rsi + je .Lsqrx8x_outer_break + + negq %rcx + movq $-8,%rcx + movq %rbp,%r15 + movq 64(%rdi),%r8 + adcxq 72(%rdi),%r9 + adcxq 80(%rdi),%r10 + adcxq 88(%rdi),%r11 + adcq 96(%rdi),%r12 + adcq 104(%rdi),%r13 + adcq 112(%rdi),%r14 + adcq 120(%rdi),%r15 + leaq (%rsi),%rbp + leaq 128(%rdi),%rdi + sbbq %rax,%rax + + movq -64(%rsi),%rdx + movq %rax,16+8(%rsp) + movq %rdi,24+8(%rsp) + + + xorl %eax,%eax + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_loop: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + movq %rbx,(%rdi,%rcx,8) + movl $0,%ebx + adcxq %rax,%r13 + adoxq %r15,%r14 + +.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + movq 8(%rsi,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rbx,%r15 + adcxq %rbx,%r15 + +.byte 0x67 + incq %rcx + jnz .Lsqrx8x_loop + + leaq 64(%rbp),%rbp + movq $-8,%rcx + cmpq 8+8(%rsp),%rbp + je .Lsqrx8x_break + + subq 16+8(%rsp),%rbx +.byte 0x66 + movq -64(%rsi),%rdx + adcxq 0(%rdi),%r8 + adcxq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi +.byte 0x67 + sbbq %rax,%rax + xorl %ebx,%ebx + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_break: + xorq %rbp,%rbp + subq 16+8(%rsp),%rbx + adcxq %rbp,%r8 + movq 24+8(%rsp),%rcx + adcxq %rbp,%r9 + movq 0(%rsi),%rdx + adcq $0,%r10 + movq %r8,0(%rdi) + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + cmpq %rcx,%rdi + je .Lsqrx8x_outer_loop + + movq %r9,8(%rdi) 
+ movq 8(%rcx),%r9 + movq %r10,16(%rdi) + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + movq 40(%rcx),%r13 + movq %r14,48(%rdi) + movq 48(%rcx),%r14 + movq %r15,56(%rdi) + movq 56(%rcx),%r15 + movq %rcx,%rdi + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_break: + movq %r9,72(%rdi) +.byte 102,72,15,126,217 + movq %r10,80(%rdi) + movq %r11,88(%rdi) + movq %r12,96(%rdi) + movq %r13,104(%rdi) + movq %r14,112(%rdi) + leaq 48+8(%rsp),%rdi + movq (%rsi,%rcx,1),%rdx + + movq 8(%rdi),%r11 + xorq %r10,%r10 + movq 0+8(%rsp),%r9 + adoxq %r11,%r11 + movq 16(%rdi),%r12 + movq 24(%rdi),%r13 + + +.align 32 +.Lsqrx4x_shift_n_add: + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax +.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 +.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 40(%rdi),%r11 + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + movq 16(%rsi,%rcx,1),%rdx + movq 48(%rdi),%r12 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 56(%rdi),%r13 + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax + movq 24(%rsi,%rcx,1),%rdx + leaq 32(%rcx),%rcx + movq 64(%rdi),%r10 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 72(%rdi),%r11 + movq %rax,32(%rdi) + movq %rbx,40(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + jrcxz .Lsqrx4x_shift_n_add_break +.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 80(%rdi),%r12 + movq 88(%rdi),%r13 + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi + nop + jmp .Lsqrx4x_shift_n_add + +.align 32 +.Lsqrx4x_shift_n_add_break: + adcxq %r13,%rbx + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi +.byte 102,72,15,126,213 +__bn_sqrx8x_reduction: + xorl %eax,%eax + movq 32+8(%rsp),%rbx + movq 48+8(%rsp),%rdx + leaq -64(%rbp,%r9,1),%rcx + + movq %rcx,0+8(%rsp) + movq %rdi,8+8(%rsp) + + leaq 48+8(%rsp),%rdi + jmp .Lsqrx8x_reduction_loop + +.align 32 +.Lsqrx8x_reduction_loop: + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq %rdx,%r8 + imulq %rbx,%rdx + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,24+8(%rsp) + + leaq 64(%rdi),%rdi + xorq %rsi,%rsi + movq $-8,%rcx + jmp .Lsqrx8x_reduce + +.align 32 +.Lsqrx8x_reduce: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rbx,%rax + adoxq %r9,%r8 + + mulxq 8(%rbp),%rbx,%r9 + adcxq %rbx,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rbx,%r10 + adcxq %rbx,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rbx,%r11 + adcxq %rbx,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + movq %rdx,%rax + movq %r8,%rdx + adcxq %rbx,%r11 + adoxq %r13,%r12 + + mulxq 32+8(%rsp),%rbx,%rdx + movq %rax,%rdx + movq %rax,64+48+8(%rsp,%rcx,8) + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq %rbx,%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + adcxq %rsi,%r15 + +.byte 0x67,0x67,0x67 + incq %rcx + jnz .Lsqrx8x_reduce + + movq %rsi,%rax + cmpq 0+8(%rsp),%rbp + jae .Lsqrx8x_no_tail + + movq 48+8(%rsp),%rdx + addq 0(%rdi),%r8 + leaq 64(%rbp),%rbp + movq $-8,%rcx + adcxq 8(%rdi),%r9 + adcxq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + 
jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq 72+48+8(%rsp,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + movq %rbx,(%rdi,%rcx,8) + movq %r8,%rbx + adcxq %rsi,%r15 + + incq %rcx + jnz .Lsqrx8x_tail + + cmpq 0+8(%rsp),%rbp + jae .Lsqrx8x_tail_done + + subq 16+8(%rsp),%rsi + movq 48+8(%rsp),%rdx + leaq 64(%rbp),%rbp + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + subq $8,%rcx + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail_done: + xorq %rax,%rax + addq 24+8(%rsp),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + subq 16+8(%rsp),%rsi +.Lsqrx8x_no_tail: + adcq 0(%rdi),%r8 +.byte 102,72,15,126,217 + adcq 8(%rdi),%r9 + movq 56(%rbp),%rsi +.byte 102,72,15,126,213 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + + movq 32+8(%rsp),%rbx + movq 64(%rdi,%rcx,1),%rdx + + movq %r8,0(%rdi) + leaq 64(%rdi),%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 64(%rdi,%rcx,1),%rdi + cmpq 8+8(%rsp),%r8 + jb .Lsqrx8x_reduction_loop + .byte 0xf3,0xc3 +.size bn_sqrx8x_internal,.-bn_sqrx8x_internal +.align 32 +__bn_postx4x_internal: + movq 0(%rbp),%r12 + movq %rcx,%r10 + movq %rcx,%r9 + negq %rax + sarq $3+2,%rcx + +.byte 102,72,15,126,202 +.byte 102,72,15,126,206 + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqrx4x_sub_entry + +.align 16 +.Lsqrx4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +.Lsqrx4x_sub_entry: + andnq %rax,%r12,%r12 + leaq 32(%rbp),%rbp + andnq %rax,%r13,%r13 + andnq %rax,%r14,%r14 + andnq %rax,%r15,%r15 + + negq %r8 + adcq 0(%rdi),%r12 + adcq 8(%rdi),%r13 + adcq 16(%rdi),%r14 + adcq 24(%rdi),%r15 + movq %r12,0(%rdx) + leaq 32(%rdi),%rdi + movq %r13,8(%rdx) + sbbq %r8,%r8 + movq %r14,16(%rdx) + movq %r15,24(%rdx) + leaq 32(%rdx),%rdx + + incq %rcx + jnz .Lsqrx4x_sub + + negq %r9 + + .byte 0xf3,0xc3 +.size __bn_postx4x_internal,.-__bn_postx4x_internal .globl bn_scatter5 .hidden bn_scatter5 .type bn_scatter5,@function |