summary | refs | log | tree | commit | diff
path: root/linux-x86_64/crypto/chacha/chacha-x86_64.S
diff options
context:
space:
mode:
Diffstat (limited to 'linux-x86_64/crypto/chacha/chacha-x86_64.S')
-rw-r--r-- linux-x86_64/crypto/chacha/chacha-x86_64.S | 73
1 files changed, 37 insertions, 36 deletions
diff --git a/linux-x86_64/crypto/chacha/chacha-x86_64.S b/linux-x86_64/crypto/chacha/chacha-x86_64.S
index e994940a..25ec715f 100644
--- a/linux-x86_64/crypto/chacha/chacha-x86_64.S
+++ b/linux-x86_64/crypto/chacha/chacha-x86_64.S
@@ -23,6 +23,15 @@
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Lsigma:
.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.align 64
+.Lzeroz:
+.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+.Lfourz:
+.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+.Lincz:
+.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.Lsixteen:
+.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.globl ChaCha20_ctr32
.hidden ChaCha20_ctr32
@@ -42,6 +51,7 @@ ChaCha20_ctr32:
pushq %r14
pushq %r15
subq $64+24,%rsp
+.Lctr32_body:
movdqu (%rcx),%xmm1
@@ -279,13 +289,14 @@ ChaCha20_ctr32:
jnz .Loop_tail
.Ldone:
- addq $64+24,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbp
- popq %rbx
+ leaq 64+24+48(%rsp),%rsi
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
.Lno_data:
.byte 0xf3,0xc3
.size ChaCha20_ctr32,.-ChaCha20_ctr32
@@ -293,18 +304,12 @@ ChaCha20_ctr32:
.align 32
ChaCha20_ssse3:
.LChaCha20_ssse3:
+ movq %rsp,%r9
cmpq $128,%rdx
ja .LChaCha20_4x
.Ldo_sse3_after_all:
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-
- subq $64+24,%rsp
+ subq $64+8,%rsp
movdqa .Lsigma(%rip),%xmm0
movdqu (%rcx),%xmm1
movdqu 16(%rcx),%xmm2
@@ -316,7 +321,7 @@ ChaCha20_ssse3:
movdqa %xmm1,16(%rsp)
movdqa %xmm2,32(%rsp)
movdqa %xmm3,48(%rsp)
- movl $10,%ebp
+ movq $10,%r8
jmp .Loop_ssse3
.align 32
@@ -326,7 +331,7 @@ ChaCha20_ssse3:
movdqa 16(%rsp),%xmm1
movdqa 32(%rsp),%xmm2
paddd 48(%rsp),%xmm3
- movl $10,%ebp
+ movq $10,%r8
movdqa %xmm3,48(%rsp)
jmp .Loop_ssse3
@@ -375,7 +380,7 @@ ChaCha20_ssse3:
pshufd $78,%xmm2,%xmm2
pshufd $147,%xmm1,%xmm1
pshufd $57,%xmm3,%xmm3
- decl %ebp
+ decq %r8
jnz .Loop_ssse3
paddd 0(%rsp),%xmm0
paddd 16(%rsp),%xmm1
@@ -412,31 +417,27 @@ ChaCha20_ssse3:
movdqa %xmm1,16(%rsp)
movdqa %xmm2,32(%rsp)
movdqa %xmm3,48(%rsp)
- xorq %rbx,%rbx
+ xorq %r8,%r8
.Loop_tail_ssse3:
- movzbl (%rsi,%rbx,1),%eax
- movzbl (%rsp,%rbx,1),%ecx
- leaq 1(%rbx),%rbx
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
xorl %ecx,%eax
- movb %al,-1(%rdi,%rbx,1)
+ movb %al,-1(%rdi,%r8,1)
decq %rdx
jnz .Loop_tail_ssse3
.Ldone_ssse3:
- addq $64+24,%rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbp
- popq %rbx
+ leaq (%r9),%rsp
+.Lssse3_epilogue:
.byte 0xf3,0xc3
.size ChaCha20_ssse3,.-ChaCha20_ssse3
.type ChaCha20_4x,@function
.align 32
ChaCha20_4x:
.LChaCha20_4x:
+ movq %rsp,%r9
movq %r10,%r11
shrq $32,%r10
testq $32,%r10
@@ -449,8 +450,7 @@ ChaCha20_4x:
je .Ldo_sse3_after_all
.Lproceed4x:
- leaq -120(%rsp),%r11
- subq $0x148+0,%rsp
+ subq $0x140+8,%rsp
movdqa .Lsigma(%rip),%xmm11
movdqu (%rcx),%xmm15
movdqu 16(%rcx),%xmm7
@@ -977,18 +977,18 @@ ChaCha20_4x:
jnz .Loop_tail4x
.Ldone4x:
- addq $0x148+0,%rsp
+ leaq (%r9),%rsp
+.L4x_epilogue:
.byte 0xf3,0xc3
.size ChaCha20_4x,.-ChaCha20_4x
.type ChaCha20_8x,@function
.align 32
ChaCha20_8x:
.LChaCha20_8x:
- movq %rsp,%r10
+ movq %rsp,%r9
subq $0x280+8,%rsp
andq $-32,%rsp
vzeroupper
- movq %r10,640(%rsp)
@@ -1579,7 +1579,8 @@ ChaCha20_8x:
.Ldone8x:
vzeroall
- movq 640(%rsp),%rsp
+ leaq (%r9),%rsp
+.L8x_epilogue:
.byte 0xf3,0xc3
.size ChaCha20_8x,.-ChaCha20_8x
#endif