Diffstat (limited to 'mac-x86_64/crypto/bn/x86_64-mont.S')
-rw-r--r-- | mac-x86_64/crypto/bn/x86_64-mont.S | 237
1 file changed, 176 insertions, 61 deletions
diff --git a/mac-x86_64/crypto/bn/x86_64-mont.S b/mac-x86_64/crypto/bn/x86_64-mont.S
index 51e5d199..41a09267 100644
--- a/mac-x86_64/crypto/bn/x86_64-mont.S
+++ b/mac-x86_64/crypto/bn/x86_64-mont.S
@@ -8,6 +8,10 @@
 .p2align	4
 _bn_mul_mont:
+
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+
 	testl	$3,%r9d
 	jnz	L$mul_enter
 	cmpl	$8,%r9d
@@ -21,20 +25,50 @@ _bn_mul_mont:
 .p2align	4
 L$mul_enter:
 	pushq	%rbx
+
 	pushq	%rbp
+
 	pushq	%r12
+
 	pushq	%r13
+
 	pushq	%r14
+
 	pushq	%r15
-	movl	%r9d,%r9d
-	leaq	2(%r9),%r10
+
+	negq	%r9
 	movq	%rsp,%r11
-	negq	%r10
-	leaq	(%rsp,%r10,8),%rsp
-	andq	$-1024,%rsp
+	leaq	-16(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
+
+
+
+
+
+
+
+
+
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	L$mul_page_walk
+	jmp	L$mul_page_walk_done
+
+.p2align	4
+L$mul_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	L$mul_page_walk
+L$mul_page_walk_done:
+
+	movq	%rax,8(%rsp,%r9,8)
 
-	movq	%r11,8(%rsp,%r9,8)
 L$mul_body:
 	movq	%rdx,%r12
 	movq	(%r8),%r8
@@ -186,51 +220,86 @@ L$sub:
 	sbbq	(%rcx,%r14,8),%rax
 	sbbq	$0,%rax
 	xorq	%r14,%r14
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
 	movq	%r9,%r15
+	orq	%rcx,%rsi
 .p2align	4
 L$copy:
-	movq	(%rsp,%r14,8),%rsi
-	movq	(%rdi,%r14,8),%rcx
-	xorq	%rcx,%rsi
-	andq	%rax,%rsi
-	xorq	%rcx,%rsi
+	movq	(%rsi,%r14,8),%rax
 	movq	%r14,(%rsp,%r14,8)
-	movq	%rsi,(%rdi,%r14,8)
+	movq	%rax,(%rdi,%r14,8)
 	leaq	1(%r14),%r14
 	subq	$1,%r15
 	jnz	L$copy
 	movq	8(%rsp,%r9,8),%rsi
+
 	movq	$1,%rax
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
 L$mul_epilogue:
 	.byte	0xf3,0xc3
+
 .p2align	4
 bn_mul4x_mont:
+
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+
 L$mul4x_enter:
 	pushq	%rbx
+
 	pushq	%rbp
+
 	pushq	%r12
+
 	pushq	%r13
+
 	pushq	%r14
+
 	pushq	%r15
-	movl	%r9d,%r9d
-	leaq	4(%r9),%r10
+
+	negq	%r9
 	movq	%rsp,%r11
-	negq	%r10
-	leaq	(%rsp,%r10,8),%rsp
-	andq	$-1024,%rsp
+	leaq	-32(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
+
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	L$mul4x_page_walk
+	jmp	L$mul4x_page_walk_done
+
+L$mul4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	L$mul4x_page_walk
+L$mul4x_page_walk_done:
+
+	movq	%rax,8(%rsp,%r9,8)
 
-	movq	%r11,8(%rsp,%r9,8)
 L$mul4x_body:
 	movq	%rdi,16(%rsp,%r9,8)
 	movq	%rdx,%r12
@@ -530,9 +599,11 @@ L$inner4x:
 	cmpq	%r9,%r14
 	jb	L$outer4x
 	movq	16(%rsp,%r9,8),%rdi
+	leaq	-4(%r9),%r15
 	movq	0(%rsp),%rax
+	pxor	%xmm0,%xmm0
 	movq	8(%rsp),%rdx
-	shrq	$2,%r9
+	shrq	$2,%r15
 	leaq	(%rsp),%rsi
 	xorq	%r14,%r14
@@ -540,7 +611,6 @@ L$inner4x:
 	movq	16(%rsi),%rbx
 	movq	24(%rsi),%rbp
 	sbbq	8(%rcx),%rdx
-	leaq	-1(%r9),%r15
 	jmp	L$sub4x
 .p2align	4
 L$sub4x:
@@ -568,62 +638,79 @@ L$sub4x:
 	movq	%rbx,16(%rdi,%r14,8)
 	sbbq	$0,%rax
-	movq	%rax,%xmm0
-	punpcklqdq	%xmm0,%xmm0
 	movq	%rbp,24(%rdi,%r14,8)
 	xorq	%r14,%r14
-
-	movq	%r9,%r15
-	pxor	%xmm5,%xmm5
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
+	leaq	-4(%r9),%r15
+	orq	%rcx,%rsi
+	shrq	$2,%r15
+
+	movdqu	(%rsi),%xmm1
+	movdqa	%xmm0,(%rsp)
+	movdqu	%xmm1,(%rdi)
 	jmp	L$copy4x
 .p2align	4
 L$copy4x:
-	movdqu	(%rsp,%r14,1),%xmm2
-	movdqu	16(%rsp,%r14,1),%xmm4
-	movdqu	(%rdi,%r14,1),%xmm1
-	movdqu	16(%rdi,%r14,1),%xmm3
-	pxor	%xmm1,%xmm2
-	pxor	%xmm3,%xmm4
-	pand	%xmm0,%xmm2
-	pand	%xmm0,%xmm4
-	pxor	%xmm1,%xmm2
-	pxor	%xmm3,%xmm4
-	movdqu	%xmm2,(%rdi,%r14,1)
-	movdqu	%xmm4,16(%rdi,%r14,1)
-	movdqa	%xmm5,(%rsp,%r14,1)
-	movdqa	%xmm5,16(%rsp,%r14,1)
-
+	movdqu	16(%rsi,%r14,1),%xmm2
+	movdqu	32(%rsi,%r14,1),%xmm1
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
+	movdqa	%xmm0,32(%rsp,%r14,1)
+	movdqu	%xmm1,32(%rdi,%r14,1)
 	leaq	32(%r14),%r14
 	decq	%r15
 	jnz	L$copy4x
-	shlq	$2,%r9
+	movdqu	16(%rsi,%r14,1),%xmm2
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
 	movq	8(%rsp,%r9,8),%rsi
+
 	movq	$1,%rax
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
 L$mul4x_epilogue:
 	.byte	0xf3,0xc3
+
 .p2align	5
 bn_sqr8x_mont:
-L$sqr8x_enter:
+
 	movq	%rsp,%rax
+
+L$sqr8x_enter:
 	pushq	%rbx
+
 	pushq	%rbp
+
 	pushq	%r12
+
 	pushq	%r13
+
 	pushq	%r14
+
 	pushq	%r15
+L$sqr8x_prologue:
+
 	movl	%r9d,%r10d
 	shll	$3,%r9d
 	shlq	$3+2,%r10
@@ -635,30 +722,49 @@ L$sqr8x_enter:
 	leaq	-64(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
 	movq	(%r8),%r8
 	subq	%rsi,%r11
 	andq	$4095,%r11
 	cmpq	%r11,%r10
 	jb	L$sqr8x_sp_alt
-	subq	%r11,%rsp
-	leaq	-64(%rsp,%r9,2),%rsp
+	subq	%r11,%rbp
+	leaq	-64(%rbp,%r9,2),%rbp
 	jmp	L$sqr8x_sp_done
 .p2align	5
 L$sqr8x_sp_alt:
 	leaq	4096-64(,%r9,2),%r10
-	leaq	-64(%rsp,%r9,2),%rsp
+	leaq	-64(%rbp,%r9,2),%rbp
 	subq	%r10,%r11
 	movq	$0,%r10
 	cmovcq	%r10,%r11
-	subq	%r11,%rsp
+	subq	%r11,%rbp
 L$sqr8x_sp_done:
-	andq	$-64,%rsp
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$sqr8x_page_walk
+	jmp	L$sqr8x_page_walk_done
+
+.p2align	4
+L$sqr8x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$sqr8x_page_walk
+L$sqr8x_page_walk_done:
+
 	movq	%r9,%r10
 	negq	%r9
 	movq	%r8,32(%rsp)
 	movq	%rax,40(%rsp)
+
 L$sqr8x_body:
 .byte	102,72,15,110,209
@@ -705,6 +811,7 @@ L$sqr8x_sub:
 	pxor	%xmm0,%xmm0
 	pshufd	$0,%xmm1,%xmm1
 	movq	40(%rsp),%rsi
+
 	jmp	L$sqr8x_cond_copy
 .p2align	5
@@ -734,15 +841,23 @@ L$sqr8x_cond_copy:
 	movq	$1,%rax
 	movq	-48(%rsi),%r15
+
 	movq	-40(%rsi),%r14
+
 	movq	-32(%rsi),%r13
+
 	movq	-24(%rsi),%r12
+
 	movq	-16(%rsi),%rbp
+
 	movq	-8(%rsi),%rbx
+
 	leaq	(%rsi),%rsp
+
 L$sqr8x_epilogue:
 	.byte	0xf3,0xc3
+
 .byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .p2align	4
 #endif
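The added L$mul_page_walk, L$mul4x_page_walk and L$sqr8x_page_walk loops above implement stack probing: instead of dropping %rsp straight to the final, possibly far-away frame, the code steps down one 4096-byte page at a time and loads a word from each page, so a large stack allocation can never jump over the OS guard page. Below is a minimal C sketch of that idea; the helper name is hypothetical and this is only an illustration of the technique, not BoringSSL code.

#include <stdint.h>

/* Illustrative sketch of the page-walk idea behind the new
 * L$*_page_walk loops: touch one word in every 4096-byte page
 * between the old stack pointer and the new, lower one, so the
 * guard page is always hit before anything beyond it is used. */
static void stack_page_walk(uintptr_t old_sp, uintptr_t new_sp) {
    uintptr_t p = old_sp;
    while (p > new_sp + 4096) {
        p -= 4096;                          /* leaq -4096(%rsp),%rsp */
        (void)*(volatile uint64_t *)p;      /* movq (%rsp),%r11      */
    }
    (void)*(volatile uint64_t *)new_sp;     /* probe the final page  */
}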
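The rewritten L$copy and L$copy4x tails also change how the final conditional reduction is written back: rather than merging the two candidate results word by word with xor/and/xor (or pand/pxor), the new code selects the source pointer once with the andq/notq/andq/orq mask sequence and then copies unconditionally while scrubbing the stack temporary. A rough C equivalent of that branch-free select follows; the function and parameter names are illustrative, not the library's API.

#include <stddef.h>
#include <stdint.h>

/* Sketch of the branch-free select used by the new L$copy/L$copy4x code.
 * "mask" mirrors the value the sbb chain leaves in %rax: all-ones if the
 * final subtraction of the modulus borrowed (keep the unreduced value in
 * the stack temporary tp), all-zero otherwise (keep the subtracted value
 * already stored in rp). */
static void mont_cond_copy(uint64_t *rp, uint64_t *tp,
                           uintptr_t mask, size_t num) {
    /* src = (mask & tp) | (~mask & rp): the andq/notq/andq/orq sequence,
     * applied once to the pointer instead of per word to the data. */
    const uint64_t *src =
        (const uint64_t *)((mask & (uintptr_t)tp) | (~mask & (uintptr_t)rp));

    for (size_t i = 0; i < num; i++) {
        rp[i] = src[i];   /* movq (%rsi,%r14,8),%rax ; movq %rax,(%rdi,%r14,8) */
        tp[i] = i;        /* scrub the stack temporary, as the new L$copy does */
    }
}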