author     David Benjamin <davidben@google.com>   2016-04-22 15:02:23 -0400
committer  David Benjamin <davidben@google.com>   2016-04-29 16:36:16 -0400
commit     4969cc9b0ab2905ec478277f50ed3849b37a6c6b (patch)
tree       552fde383dce1efd213ae145fff808806d76a225 /linux-x86_64
parent     09f2501f7faf115dc26e0c2310b3ea8c97f66007 (diff)
download   boringssl-4969cc9b0ab2905ec478277f50ed3849b37a6c6b.tar.gz
external/boringssl: Sync to d18cb77.
This includes the following changes which are far too many to list here:

https://boringssl.googlesource.com/boringssl/+log/7b8b9c17db93ea5287575b437c77fb36eeb81b31..d18cb77864dcc4b5c7cb08c2331008c01165f34f

This also retires one function from android_compat_hacks.c which is no longer
necessary.

Change-Id: Ie00536d7ad815464b2b031f7bcd1b683e12c1623
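Most of the churn in the AES hunks below is cosmetic (decimal immediates regenerated as hex constants), while the crypto/bn hunks (rsaz-x86_64.S, x86_64-mont.S, x86_64-mont5.S) make substantive changes, most visibly to the gather routines (rsaz_512_gather4, bn_gather5 and the mont5 gather path): instead of loading a table entry at an index-dependent offset, the new code reads every entry and selects the wanted one with pcmpeqd-generated masks combined via pand/por, so the memory access pattern no longer depends on the index. A rough scalar C sketch of that selection idea (illustration only, not code from this commit; the helper name is invented):

#include <stdint.h>

/* Constant-time table selection: touch every entry and keep only the one
 * whose position equals idx, using an arithmetic mask instead of an
 * index-dependent load.  Scalar stand-in for the SSE2 pcmpeqd/pand/por
 * pattern in the diff; not taken from the commit itself. */
static uint64_t constant_time_select_u64(const uint64_t *table,
                                         uint64_t num_entries, uint64_t idx) {
  uint64_t result = 0;
  for (uint64_t i = 0; i < num_entries; i++) {
    uint64_t diff = i ^ idx;
    /* mask is all-ones when i == idx, all-zeros otherwise, with no branch */
    uint64_t mask = ((diff | (0 - diff)) >> 63) - 1;
    result |= table[i] & mask;
  }
  return result;
}

The SSE2 code in the diff applies the same idea 128 bits at a time: it broadcasts the index with pshufd, compares it against incrementing lane counters derived from the new .Linc constant with pcmpeqd, and folds the masked table words together with pand/por before writing the selected value out.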
Diffstat (limited to 'linux-x86_64')
-rw-r--r--  linux-x86_64/crypto/aes/aes-x86_64.S       |   70
-rw-r--r--  linux-x86_64/crypto/aes/aesni-x86_64.S     |   82
-rw-r--r--  linux-x86_64/crypto/aes/bsaes-x86_64.S     |  158
-rw-r--r--  linux-x86_64/crypto/aes/vpaes-x86_64.S     |   20
-rw-r--r--  linux-x86_64/crypto/bn/rsaz-x86_64.S       |  218
-rw-r--r--  linux-x86_64/crypto/bn/x86_64-mont.S       |  108
-rw-r--r--  linux-x86_64/crypto/bn/x86_64-mont5.S      |  901
-rw-r--r--  linux-x86_64/crypto/chacha/chacha-x86_64.S | 1585
-rw-r--r--  linux-x86_64/crypto/ec/p256-x86_64-asm.S   |   27
-rw-r--r--  linux-x86_64/crypto/md5/md5-x86_64.S       |   34
-rw-r--r--  linux-x86_64/crypto/modes/ghash-x86_64.S   |   34
11 files changed, 2682 insertions, 555 deletions
diff --git a/linux-x86_64/crypto/aes/aes-x86_64.S b/linux-x86_64/crypto/aes/aes-x86_64.S
index 5f4b057f..361e84c7 100644
--- a/linux-x86_64/crypto/aes/aes-x86_64.S
+++ b/linux-x86_64/crypto/aes/aes-x86_64.S
@@ -82,8 +82,8 @@ _x86_64_AES_encrypt:
movl 0(%r14,%rdi,8),%edi
movl 0(%r14,%rbp,8),%ebp
- andl $65280,%edi
- andl $65280,%ebp
+ andl $0x0000ff00,%edi
+ andl $0x0000ff00,%ebp
xorl %edi,%r10d
xorl %ebp,%r11d
@@ -95,8 +95,8 @@ _x86_64_AES_encrypt:
movl 0(%r14,%rsi,8),%esi
movl 0(%r14,%rdi,8),%edi
- andl $65280,%esi
- andl $65280,%edi
+ andl $0x0000ff00,%esi
+ andl $0x0000ff00,%edi
shrl $16,%ebx
xorl %esi,%r12d
xorl %edi,%r8d
@@ -109,9 +109,9 @@ _x86_64_AES_encrypt:
movl 0(%r14,%rdi,8),%edi
movl 0(%r14,%rbp,8),%ebp
- andl $16711680,%esi
- andl $16711680,%edi
- andl $16711680,%ebp
+ andl $0x00ff0000,%esi
+ andl $0x00ff0000,%edi
+ andl $0x00ff0000,%ebp
xorl %esi,%r10d
xorl %edi,%r11d
@@ -124,9 +124,9 @@ _x86_64_AES_encrypt:
movl 2(%r14,%rdi,8),%edi
movl 2(%r14,%rbp,8),%ebp
- andl $16711680,%esi
- andl $4278190080,%edi
- andl $4278190080,%ebp
+ andl $0x00ff0000,%esi
+ andl $0xff000000,%edi
+ andl $0xff000000,%ebp
xorl %esi,%r8d
xorl %edi,%r10d
@@ -139,8 +139,8 @@ _x86_64_AES_encrypt:
movl 2(%r14,%rdi,8),%edi
movl 16+0(%r15),%eax
- andl $4278190080,%esi
- andl $4278190080,%edi
+ andl $0xff000000,%esi
+ andl $0xff000000,%edi
xorl %esi,%r12d
xorl %edi,%r8d
@@ -242,8 +242,8 @@ _x86_64_AES_encrypt_compact:
xorl %r8d,%edx
cmpq 16(%rsp),%r15
je .Lenc_compact_done
- movl $2155905152,%r10d
- movl $2155905152,%r11d
+ movl $0x80808080,%r10d
+ movl $0x80808080,%r11d
andl %eax,%r10d
andl %ebx,%r11d
movl %r10d,%esi
@@ -254,10 +254,10 @@ _x86_64_AES_encrypt_compact:
leal (%rbx,%rbx,1),%r9d
subl %r10d,%esi
subl %r11d,%edi
- andl $4278124286,%r8d
- andl $4278124286,%r9d
- andl $454761243,%esi
- andl $454761243,%edi
+ andl $0xfefefefe,%r8d
+ andl $0xfefefefe,%r9d
+ andl $0x1b1b1b1b,%esi
+ andl $0x1b1b1b1b,%edi
movl %eax,%r10d
movl %ebx,%r11d
xorl %esi,%r8d
@@ -265,9 +265,9 @@ _x86_64_AES_encrypt_compact:
xorl %r8d,%eax
xorl %r9d,%ebx
- movl $2155905152,%r12d
+ movl $0x80808080,%r12d
roll $24,%eax
- movl $2155905152,%ebp
+ movl $0x80808080,%ebp
roll $24,%ebx
andl %ecx,%r12d
andl %edx,%ebp
@@ -290,10 +290,10 @@ _x86_64_AES_encrypt_compact:
xorl %r10d,%eax
xorl %r11d,%ebx
- andl $4278124286,%r8d
- andl $4278124286,%r9d
- andl $454761243,%esi
- andl $454761243,%edi
+ andl $0xfefefefe,%r8d
+ andl $0xfefefefe,%r9d
+ andl $0x1b1b1b1b,%esi
+ andl $0x1b1b1b1b,%edi
movl %ecx,%r12d
movl %edx,%ebp
xorl %esi,%r8d
@@ -345,7 +345,7 @@ asm_AES_encrypt:
andq $-64,%rsp
subq %rsp,%rcx
negq %rcx
- andq $960,%rcx
+ andq $0x3c0,%rcx
subq %rcx,%rsp
subq $32,%rsp
@@ -370,7 +370,7 @@ asm_AES_encrypt:
leaq .LAES_Te+2048(%rip),%r14
leaq 768(%rsp),%rbp
subq %r14,%rbp
- andq $768,%rbp
+ andq $0x300,%rbp
leaq (%r14,%rbp,1),%r14
call _x86_64_AES_encrypt_compact
@@ -791,7 +791,7 @@ asm_AES_decrypt:
andq $-64,%rsp
subq %rsp,%rcx
negq %rcx
- andq $960,%rcx
+ andq $0x3c0,%rcx
subq %rcx,%rsp
subq $32,%rsp
@@ -816,7 +816,7 @@ asm_AES_decrypt:
leaq .LAES_Td+2048(%rip),%r14
leaq 768(%rsp),%rbp
subq %r14,%rbp
- andq $768,%rbp
+ andq $0x300,%rbp
leaq (%r14,%rbp,1),%r14
shrq $3,%rbp
addq %rbp,%r14
@@ -1334,9 +1334,9 @@ asm_AES_cbc_encrypt:
movq %r14,%r10
leaq 2304(%r14),%r11
movq %r15,%r12
- andq $4095,%r10
- andq $4095,%r11
- andq $4095,%r12
+ andq $0xFFF,%r10
+ andq $0xFFF,%r11
+ andq $0xFFF,%r12
cmpq %r11,%r12
jb .Lcbc_te_break_out
@@ -1345,7 +1345,7 @@ asm_AES_cbc_encrypt:
jmp .Lcbc_te_ok
.Lcbc_te_break_out:
subq %r10,%r12
- andq $4095,%r12
+ andq $0xFFF,%r12
addq $320,%r12
subq %r12,%r15
.align 4
@@ -1371,7 +1371,7 @@ asm_AES_cbc_encrypt:
movq %r15,%r10
subq %r14,%r10
- andq $4095,%r10
+ andq $0xfff,%r10
cmpq $2304,%r10
jb .Lcbc_do_ecopy
cmpq $4096-248,%r10
@@ -1558,7 +1558,7 @@ asm_AES_cbc_encrypt:
leaq -88-63(%rcx),%r10
subq %rbp,%r10
negq %r10
- andq $960,%r10
+ andq $0x3c0,%r10
subq %r10,%rbp
xchgq %rsp,%rbp
@@ -1587,7 +1587,7 @@ asm_AES_cbc_encrypt:
leaq 2048(%r14),%r14
leaq 768-8(%rsp),%rax
subq %r14,%rax
- andq $768,%rax
+ andq $0x300,%rax
leaq (%r14,%rax,1),%r14
cmpq $0,%rbx
diff --git a/linux-x86_64/crypto/aes/aesni-x86_64.S b/linux-x86_64/crypto/aes/aesni-x86_64.S
index 1d51d5b5..5709a2d0 100644
--- a/linux-x86_64/crypto/aes/aesni-x86_64.S
+++ b/linux-x86_64/crypto/aes/aesni-x86_64.S
@@ -508,7 +508,7 @@ aesni_ecb_encrypt:
testl %r8d,%r8d
jz .Lecb_decrypt
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb .Lecb_enc_tail
movdqu (%rdi),%xmm2
@@ -520,7 +520,7 @@ aesni_ecb_encrypt:
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp .Lecb_enc_loop8_enter
.align 16
.Lecb_enc_loop8:
@@ -548,7 +548,7 @@ aesni_ecb_encrypt:
call _aesni_encrypt8
- subq $128,%rdx
+ subq $0x80,%rdx
jnc .Lecb_enc_loop8
movups %xmm2,(%rsi)
@@ -562,22 +562,22 @@ aesni_ecb_encrypt:
movups %xmm8,96(%rsi)
movups %xmm9,112(%rsi)
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz .Lecb_ret
.Lecb_enc_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lecb_enc_one
movups 16(%rdi),%xmm3
je .Lecb_enc_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lecb_enc_three
movups 48(%rdi),%xmm5
je .Lecb_enc_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb .Lecb_enc_five
movups 80(%rdi),%xmm7
je .Lecb_enc_six
@@ -651,7 +651,7 @@ aesni_ecb_encrypt:
.align 16
.Lecb_decrypt:
- cmpq $128,%rdx
+ cmpq $0x80,%rdx
jb .Lecb_dec_tail
movdqu (%rdi),%xmm2
@@ -663,7 +663,7 @@ aesni_ecb_encrypt:
movdqu 96(%rdi),%xmm8
movdqu 112(%rdi),%xmm9
leaq 128(%rdi),%rdi
- subq $128,%rdx
+ subq $0x80,%rdx
jmp .Lecb_dec_loop8_enter
.align 16
.Lecb_dec_loop8:
@@ -692,7 +692,7 @@ aesni_ecb_encrypt:
call _aesni_decrypt8
movups (%r11),%xmm0
- subq $128,%rdx
+ subq $0x80,%rdx
jnc .Lecb_dec_loop8
movups %xmm2,(%rsi)
@@ -714,22 +714,22 @@ aesni_ecb_encrypt:
movups %xmm9,112(%rsi)
pxor %xmm9,%xmm9
leaq 128(%rsi),%rsi
- addq $128,%rdx
+ addq $0x80,%rdx
jz .Lecb_ret
.Lecb_dec_tail:
movups (%rdi),%xmm2
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lecb_dec_one
movups 16(%rdi),%xmm3
je .Lecb_dec_two
movups 32(%rdi),%xmm4
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lecb_dec_three
movups 48(%rdi),%xmm5
je .Lecb_dec_four
movups 64(%rdi),%xmm6
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
jb .Lecb_dec_five
movups 80(%rdi),%xmm7
je .Lecb_dec_six
@@ -1607,7 +1607,7 @@ aesni_xts_encrypt:
movdqa .Lxts_magic(%rip),%xmm8
movdqa %xmm2,%xmm15
- pshufd $95,%xmm2,%xmm9
+ pshufd $0x5f,%xmm2,%xmm9
pxor %xmm0,%xmm1
movdqa %xmm9,%xmm14
paddd %xmm9,%xmm9
@@ -1706,7 +1706,7 @@ aesni_xts_encrypt:
.byte 102,15,56,220,248
movups 64(%r11),%xmm0
movdqa %xmm8,80(%rsp)
- pshufd $95,%xmm15,%xmm9
+ pshufd $0x5f,%xmm15,%xmm9
jmp .Lxts_enc_loop6
.align 32
.Lxts_enc_loop6:
@@ -1845,13 +1845,13 @@ aesni_xts_encrypt:
jz .Lxts_enc_done
pxor %xmm0,%xmm11
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lxts_enc_one
pxor %xmm0,%xmm12
je .Lxts_enc_two
pxor %xmm0,%xmm13
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lxts_enc_three
pxor %xmm0,%xmm14
je .Lxts_enc_four
@@ -2079,7 +2079,7 @@ aesni_xts_decrypt:
movdqa .Lxts_magic(%rip),%xmm8
movdqa %xmm2,%xmm15
- pshufd $95,%xmm2,%xmm9
+ pshufd $0x5f,%xmm2,%xmm9
pxor %xmm0,%xmm1
movdqa %xmm9,%xmm14
paddd %xmm9,%xmm9
@@ -2178,7 +2178,7 @@ aesni_xts_decrypt:
.byte 102,15,56,222,248
movups 64(%r11),%xmm0
movdqa %xmm8,80(%rsp)
- pshufd $95,%xmm15,%xmm9
+ pshufd $0x5f,%xmm15,%xmm9
jmp .Lxts_dec_loop6
.align 32
.Lxts_dec_loop6:
@@ -2318,13 +2318,13 @@ aesni_xts_decrypt:
jz .Lxts_dec_done
pxor %xmm0,%xmm12
- cmpq $32,%rdx
+ cmpq $0x20,%rdx
jb .Lxts_dec_one
pxor %xmm0,%xmm13
je .Lxts_dec_two
pxor %xmm0,%xmm14
- cmpq $64,%rdx
+ cmpq $0x40,%rdx
jb .Lxts_dec_three
je .Lxts_dec_four
@@ -2355,7 +2355,7 @@ aesni_xts_decrypt:
pcmpgtd %xmm15,%xmm14
movdqu %xmm6,64(%rsi)
leaq 80(%rsi),%rsi
- pshufd $19,%xmm14,%xmm11
+ pshufd $0x13,%xmm14,%xmm11
andq $15,%r9
jz .Lxts_dec_ret
@@ -2645,7 +2645,7 @@ aesni_cbc_encrypt:
leaq -8(%rax),%rbp
movups (%r8),%xmm10
movl %r10d,%eax
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe .Lcbc_dec_tail
movups (%rcx),%xmm0
@@ -2661,14 +2661,14 @@ aesni_cbc_encrypt:
movdqu 80(%rdi),%xmm7
movdqa %xmm6,%xmm15
movl OPENSSL_ia32cap_P+4(%rip),%r9d
- cmpq $112,%rdx
+ cmpq $0x70,%rdx
jbe .Lcbc_dec_six_or_seven
andl $71303168,%r9d
- subq $80,%rdx
+ subq $0x50,%rdx
cmpl $4194304,%r9d
je .Lcbc_dec_loop6_enter
- subq $32,%rdx
+ subq $0x20,%rdx
leaq 112(%rcx),%rcx
jmp .Lcbc_dec_loop8_enter
.align 16
@@ -2683,7 +2683,7 @@ aesni_cbc_encrypt:
movups 16-112(%rcx),%xmm1
pxor %xmm0,%xmm4
xorq %r11,%r11
- cmpq $112,%rdx
+ cmpq $0x70,%rdx
pxor %xmm0,%xmm5
pxor %xmm0,%xmm6
pxor %xmm0,%xmm7
@@ -2868,21 +2868,21 @@ aesni_cbc_encrypt:
movups %xmm8,96(%rsi)
leaq 112(%rsi),%rsi
- subq $128,%rdx
+ subq $0x80,%rdx
ja .Lcbc_dec_loop8
movaps %xmm9,%xmm2
leaq -112(%rcx),%rcx
- addq $112,%rdx
+ addq $0x70,%rdx
jle .Lcbc_dec_clear_tail_collected
movups %xmm9,(%rsi)
leaq 16(%rsi),%rsi
- cmpq $80,%rdx
+ cmpq $0x50,%rdx
jbe .Lcbc_dec_tail
movaps %xmm11,%xmm2
.Lcbc_dec_six_or_seven:
- cmpq $96,%rdx
+ cmpq $0x60,%rdx
ja .Lcbc_dec_seven
movaps %xmm7,%xmm8
@@ -2975,33 +2975,33 @@ aesni_cbc_encrypt:
movl %r10d,%eax
movdqu %xmm6,64(%rsi)
leaq 80(%rsi),%rsi
- subq $96,%rdx
+ subq $0x60,%rdx
ja .Lcbc_dec_loop6
movdqa %xmm7,%xmm2
- addq $80,%rdx
+ addq $0x50,%rdx
jle .Lcbc_dec_clear_tail_collected
movups %xmm7,(%rsi)
leaq 16(%rsi),%rsi
.Lcbc_dec_tail:
movups (%rdi),%xmm2
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_one
movups 16(%rdi),%xmm3
movaps %xmm2,%xmm11
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_two
movups 32(%rdi),%xmm4
movaps %xmm3,%xmm12
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_three
movups 48(%rdi),%xmm5
movaps %xmm4,%xmm13
- subq $16,%rdx
+ subq $0x10,%rdx
jbe .Lcbc_dec_four
movups 64(%rdi),%xmm6
@@ -3026,7 +3026,7 @@ aesni_cbc_encrypt:
movdqa %xmm6,%xmm2
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
- subq $16,%rdx
+ subq $0x10,%rdx
jmp .Lcbc_dec_tail_collected
.align 16
@@ -3345,7 +3345,7 @@ __aesni_set_encrypt_key:
pslldq $4,%xmm0
pxor %xmm3,%xmm0
- pshufd $255,%xmm0,%xmm3
+ pshufd $0xff,%xmm0,%xmm3
pxor %xmm1,%xmm3
pslldq $4,%xmm1
pxor %xmm1,%xmm3
@@ -3432,7 +3432,7 @@ __aesni_set_encrypt_key:
decl %r10d
jz .Ldone_key256
- pshufd $255,%xmm0,%xmm2
+ pshufd $0xff,%xmm0,%xmm2
pxor %xmm3,%xmm3
.byte 102,15,56,221,211
diff --git a/linux-x86_64/crypto/aes/bsaes-x86_64.S b/linux-x86_64/crypto/aes/bsaes-x86_64.S
index 8cfa4df5..c5491ce4 100644
--- a/linux-x86_64/crypto/aes/bsaes-x86_64.S
+++ b/linux-x86_64/crypto/aes/bsaes-x86_64.S
@@ -327,45 +327,45 @@ _bsaes_encrypt8_bitslice:
pxor %xmm2,%xmm5
decl %r10d
jl .Lenc_done
- pshufd $147,%xmm15,%xmm7
- pshufd $147,%xmm0,%xmm8
+ pshufd $0x93,%xmm15,%xmm7
+ pshufd $0x93,%xmm0,%xmm8
pxor %xmm7,%xmm15
- pshufd $147,%xmm3,%xmm9
+ pshufd $0x93,%xmm3,%xmm9
pxor %xmm8,%xmm0
- pshufd $147,%xmm5,%xmm10
+ pshufd $0x93,%xmm5,%xmm10
pxor %xmm9,%xmm3
- pshufd $147,%xmm2,%xmm11
+ pshufd $0x93,%xmm2,%xmm11
pxor %xmm10,%xmm5
- pshufd $147,%xmm6,%xmm12
+ pshufd $0x93,%xmm6,%xmm12
pxor %xmm11,%xmm2
- pshufd $147,%xmm1,%xmm13
+ pshufd $0x93,%xmm1,%xmm13
pxor %xmm12,%xmm6
- pshufd $147,%xmm4,%xmm14
+ pshufd $0x93,%xmm4,%xmm14
pxor %xmm13,%xmm1
pxor %xmm14,%xmm4
pxor %xmm15,%xmm8
pxor %xmm4,%xmm7
pxor %xmm4,%xmm8
- pshufd $78,%xmm15,%xmm15
+ pshufd $0x4E,%xmm15,%xmm15
pxor %xmm0,%xmm9
- pshufd $78,%xmm0,%xmm0
+ pshufd $0x4E,%xmm0,%xmm0
pxor %xmm2,%xmm12
pxor %xmm7,%xmm15
pxor %xmm6,%xmm13
pxor %xmm8,%xmm0
pxor %xmm5,%xmm11
- pshufd $78,%xmm2,%xmm7
+ pshufd $0x4E,%xmm2,%xmm7
pxor %xmm1,%xmm14
- pshufd $78,%xmm6,%xmm8
+ pshufd $0x4E,%xmm6,%xmm8
pxor %xmm3,%xmm10
- pshufd $78,%xmm5,%xmm2
+ pshufd $0x4E,%xmm5,%xmm2
pxor %xmm4,%xmm10
- pshufd $78,%xmm4,%xmm6
+ pshufd $0x4E,%xmm4,%xmm6
pxor %xmm4,%xmm11
- pshufd $78,%xmm1,%xmm5
+ pshufd $0x4E,%xmm1,%xmm5
pxor %xmm11,%xmm7
- pshufd $78,%xmm3,%xmm1
+ pshufd $0x4E,%xmm3,%xmm1
pxor %xmm12,%xmm8
pxor %xmm10,%xmm2
pxor %xmm14,%xmm6
@@ -799,24 +799,24 @@ _bsaes_decrypt8:
decl %r10d
jl .Ldec_done
- pshufd $78,%xmm15,%xmm7
- pshufd $78,%xmm2,%xmm13
+ pshufd $0x4E,%xmm15,%xmm7
+ pshufd $0x4E,%xmm2,%xmm13
pxor %xmm15,%xmm7
- pshufd $78,%xmm4,%xmm14
+ pshufd $0x4E,%xmm4,%xmm14
pxor %xmm2,%xmm13
- pshufd $78,%xmm0,%xmm8
+ pshufd $0x4E,%xmm0,%xmm8
pxor %xmm4,%xmm14
- pshufd $78,%xmm5,%xmm9
+ pshufd $0x4E,%xmm5,%xmm9
pxor %xmm0,%xmm8
- pshufd $78,%xmm3,%xmm10
+ pshufd $0x4E,%xmm3,%xmm10
pxor %xmm5,%xmm9
pxor %xmm13,%xmm15
pxor %xmm13,%xmm0
- pshufd $78,%xmm1,%xmm11
+ pshufd $0x4E,%xmm1,%xmm11
pxor %xmm3,%xmm10
pxor %xmm7,%xmm5
pxor %xmm8,%xmm3
- pshufd $78,%xmm6,%xmm12
+ pshufd $0x4E,%xmm6,%xmm12
pxor %xmm1,%xmm11
pxor %xmm14,%xmm0
pxor %xmm9,%xmm1
@@ -830,45 +830,45 @@ _bsaes_decrypt8:
pxor %xmm14,%xmm1
pxor %xmm14,%xmm6
pxor %xmm12,%xmm4
- pshufd $147,%xmm15,%xmm7
- pshufd $147,%xmm0,%xmm8
+ pshufd $0x93,%xmm15,%xmm7
+ pshufd $0x93,%xmm0,%xmm8
pxor %xmm7,%xmm15
- pshufd $147,%xmm5,%xmm9
+ pshufd $0x93,%xmm5,%xmm9
pxor %xmm8,%xmm0
- pshufd $147,%xmm3,%xmm10
+ pshufd $0x93,%xmm3,%xmm10
pxor %xmm9,%xmm5
- pshufd $147,%xmm1,%xmm11
+ pshufd $0x93,%xmm1,%xmm11
pxor %xmm10,%xmm3
- pshufd $147,%xmm6,%xmm12
+ pshufd $0x93,%xmm6,%xmm12
pxor %xmm11,%xmm1
- pshufd $147,%xmm2,%xmm13
+ pshufd $0x93,%xmm2,%xmm13
pxor %xmm12,%xmm6
- pshufd $147,%xmm4,%xmm14
+ pshufd $0x93,%xmm4,%xmm14
pxor %xmm13,%xmm2
pxor %xmm14,%xmm4
pxor %xmm15,%xmm8
pxor %xmm4,%xmm7
pxor %xmm4,%xmm8
- pshufd $78,%xmm15,%xmm15
+ pshufd $0x4E,%xmm15,%xmm15
pxor %xmm0,%xmm9
- pshufd $78,%xmm0,%xmm0
+ pshufd $0x4E,%xmm0,%xmm0
pxor %xmm1,%xmm12
pxor %xmm7,%xmm15
pxor %xmm6,%xmm13
pxor %xmm8,%xmm0
pxor %xmm3,%xmm11
- pshufd $78,%xmm1,%xmm7
+ pshufd $0x4E,%xmm1,%xmm7
pxor %xmm2,%xmm14
- pshufd $78,%xmm6,%xmm8
+ pshufd $0x4E,%xmm6,%xmm8
pxor %xmm5,%xmm10
- pshufd $78,%xmm3,%xmm1
+ pshufd $0x4E,%xmm3,%xmm1
pxor %xmm4,%xmm10
- pshufd $78,%xmm4,%xmm6
+ pshufd $0x4E,%xmm4,%xmm6
pxor %xmm4,%xmm11
- pshufd $78,%xmm2,%xmm3
+ pshufd $0x4E,%xmm2,%xmm3
pxor %xmm11,%xmm7
- pshufd $78,%xmm5,%xmm2
+ pshufd $0x4E,%xmm5,%xmm2
pxor %xmm12,%xmm8
pxor %xmm1,%xmm10
pxor %xmm14,%xmm6
@@ -1559,20 +1559,20 @@ bsaes_xts_encrypt:
movdqa %xmm7,(%rax)
andq $-16,%r14
- subq $128,%rsp
+ subq $0x80,%rsp
movdqa 32(%rbp),%xmm6
pxor %xmm14,%xmm14
movdqa .Lxts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- subq $128,%r14
+ subq $0x80,%r14
jc .Lxts_enc_short
jmp .Lxts_enc_loop
.align 16
.Lxts_enc_loop:
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm15
movdqa %xmm6,0(%rsp)
@@ -1580,7 +1580,7 @@ bsaes_xts_encrypt:
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm0
movdqa %xmm6,16(%rsp)
@@ -1589,7 +1589,7 @@ bsaes_xts_encrypt:
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
movdqu 0(%r12),%xmm7
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm1
movdqa %xmm6,32(%rsp)
@@ -1599,7 +1599,7 @@ bsaes_xts_encrypt:
pxor %xmm13,%xmm6
movdqu 16(%r12),%xmm8
pxor %xmm7,%xmm15
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm2
movdqa %xmm6,48(%rsp)
@@ -1609,7 +1609,7 @@ bsaes_xts_encrypt:
pxor %xmm13,%xmm6
movdqu 32(%r12),%xmm9
pxor %xmm8,%xmm0
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm3
movdqa %xmm6,64(%rsp)
@@ -1619,7 +1619,7 @@ bsaes_xts_encrypt:
pxor %xmm13,%xmm6
movdqu 48(%r12),%xmm10
pxor %xmm9,%xmm1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm4
movdqa %xmm6,80(%rsp)
@@ -1629,7 +1629,7 @@ bsaes_xts_encrypt:
pxor %xmm13,%xmm6
movdqu 64(%r12),%xmm11
pxor %xmm10,%xmm2
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm5
movdqa %xmm6,96(%rsp)
@@ -1673,20 +1673,20 @@ bsaes_xts_encrypt:
pxor %xmm14,%xmm14
movdqa .Lxts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
paddq %xmm6,%xmm6
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- subq $128,%r14
+ subq $0x80,%r14
jnc .Lxts_enc_loop
.Lxts_enc_short:
- addq $128,%r14
+ addq $0x80,%r14
jz .Lxts_enc_done
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm15
movdqa %xmm6,0(%rsp)
@@ -1694,7 +1694,7 @@ bsaes_xts_encrypt:
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm0
movdqa %xmm6,16(%rsp)
@@ -1705,7 +1705,7 @@ bsaes_xts_encrypt:
movdqu 0(%r12),%xmm7
cmpq $16,%r14
je .Lxts_enc_1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm1
movdqa %xmm6,32(%rsp)
@@ -1717,7 +1717,7 @@ bsaes_xts_encrypt:
cmpq $32,%r14
je .Lxts_enc_2
pxor %xmm7,%xmm15
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm2
movdqa %xmm6,48(%rsp)
@@ -1729,7 +1729,7 @@ bsaes_xts_encrypt:
cmpq $48,%r14
je .Lxts_enc_3
pxor %xmm8,%xmm0
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm3
movdqa %xmm6,64(%rsp)
@@ -1741,7 +1741,7 @@ bsaes_xts_encrypt:
cmpq $64,%r14
je .Lxts_enc_4
pxor %xmm9,%xmm1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm4
movdqa %xmm6,80(%rsp)
@@ -1753,7 +1753,7 @@ bsaes_xts_encrypt:
cmpq $80,%r14
je .Lxts_enc_5
pxor %xmm10,%xmm2
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm5
movdqa %xmm6,96(%rsp)
@@ -2019,20 +2019,20 @@ bsaes_xts_decrypt:
shlq $4,%rax
subq %rax,%r14
- subq $128,%rsp
+ subq $0x80,%rsp
movdqa 32(%rbp),%xmm6
pxor %xmm14,%xmm14
movdqa .Lxts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- subq $128,%r14
+ subq $0x80,%r14
jc .Lxts_dec_short
jmp .Lxts_dec_loop
.align 16
.Lxts_dec_loop:
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm15
movdqa %xmm6,0(%rsp)
@@ -2040,7 +2040,7 @@ bsaes_xts_decrypt:
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm0
movdqa %xmm6,16(%rsp)
@@ -2049,7 +2049,7 @@ bsaes_xts_decrypt:
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
movdqu 0(%r12),%xmm7
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm1
movdqa %xmm6,32(%rsp)
@@ -2059,7 +2059,7 @@ bsaes_xts_decrypt:
pxor %xmm13,%xmm6
movdqu 16(%r12),%xmm8
pxor %xmm7,%xmm15
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm2
movdqa %xmm6,48(%rsp)
@@ -2069,7 +2069,7 @@ bsaes_xts_decrypt:
pxor %xmm13,%xmm6
movdqu 32(%r12),%xmm9
pxor %xmm8,%xmm0
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm3
movdqa %xmm6,64(%rsp)
@@ -2079,7 +2079,7 @@ bsaes_xts_decrypt:
pxor %xmm13,%xmm6
movdqu 48(%r12),%xmm10
pxor %xmm9,%xmm1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm4
movdqa %xmm6,80(%rsp)
@@ -2089,7 +2089,7 @@ bsaes_xts_decrypt:
pxor %xmm13,%xmm6
movdqu 64(%r12),%xmm11
pxor %xmm10,%xmm2
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm5
movdqa %xmm6,96(%rsp)
@@ -2133,20 +2133,20 @@ bsaes_xts_decrypt:
pxor %xmm14,%xmm14
movdqa .Lxts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
paddq %xmm6,%xmm6
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- subq $128,%r14
+ subq $0x80,%r14
jnc .Lxts_dec_loop
.Lxts_dec_short:
- addq $128,%r14
+ addq $0x80,%r14
jz .Lxts_dec_done
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm15
movdqa %xmm6,0(%rsp)
@@ -2154,7 +2154,7 @@ bsaes_xts_decrypt:
pand %xmm12,%xmm13
pcmpgtd %xmm6,%xmm14
pxor %xmm13,%xmm6
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm0
movdqa %xmm6,16(%rsp)
@@ -2165,7 +2165,7 @@ bsaes_xts_decrypt:
movdqu 0(%r12),%xmm7
cmpq $16,%r14
je .Lxts_dec_1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm1
movdqa %xmm6,32(%rsp)
@@ -2177,7 +2177,7 @@ bsaes_xts_decrypt:
cmpq $32,%r14
je .Lxts_dec_2
pxor %xmm7,%xmm15
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm2
movdqa %xmm6,48(%rsp)
@@ -2189,7 +2189,7 @@ bsaes_xts_decrypt:
cmpq $48,%r14
je .Lxts_dec_3
pxor %xmm8,%xmm0
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm3
movdqa %xmm6,64(%rsp)
@@ -2201,7 +2201,7 @@ bsaes_xts_decrypt:
cmpq $64,%r14
je .Lxts_dec_4
pxor %xmm9,%xmm1
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm4
movdqa %xmm6,80(%rsp)
@@ -2213,7 +2213,7 @@ bsaes_xts_decrypt:
cmpq $80,%r14
je .Lxts_dec_5
pxor %xmm10,%xmm2
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
pxor %xmm14,%xmm14
movdqa %xmm6,%xmm5
movdqa %xmm6,96(%rsp)
@@ -2390,7 +2390,7 @@ bsaes_xts_decrypt:
pxor %xmm14,%xmm14
movdqa .Lxts_magic(%rip),%xmm12
pcmpgtd %xmm6,%xmm14
- pshufd $19,%xmm14,%xmm13
+ pshufd $0x13,%xmm14,%xmm13
movdqa %xmm6,%xmm5
paddq %xmm6,%xmm6
pand %xmm12,%xmm13
diff --git a/linux-x86_64/crypto/aes/vpaes-x86_64.S b/linux-x86_64/crypto/aes/vpaes-x86_64.S
index 1d124246..4dfafa97 100644
--- a/linux-x86_64/crypto/aes/vpaes-x86_64.S
+++ b/linux-x86_64/crypto/aes/vpaes-x86_64.S
@@ -61,7 +61,7 @@ _vpaes_encrypt_core:
addq $16,%r11
pxor %xmm0,%xmm3
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
subq $1,%rax
pxor %xmm3,%xmm0
@@ -121,10 +121,10 @@ _vpaes_decrypt_core:
pand %xmm9,%xmm0
.byte 102,15,56,0,208
movdqa .Lk_dipt+16(%rip),%xmm0
- xorq $48,%r11
+ xorq $0x30,%r11
leaq .Lk_dsbd(%rip),%r10
.byte 102,15,56,0,193
- andq $48,%r11
+ andq $0x30,%r11
pxor %xmm5,%xmm2
movdqa .Lk_mc_forward+48(%rip),%xmm5
pxor %xmm2,%xmm0
@@ -243,7 +243,7 @@ _vpaes_schedule_core:
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
movdqu %xmm3,(%rdx)
- xorq $48,%r8
+ xorq $0x30,%r8
.Lschedule_go:
cmpl $192,%esi
@@ -333,7 +333,7 @@ _vpaes_schedule_core:
call _vpaes_schedule_mangle
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
movdqa %xmm7,%xmm5
movdqa %xmm6,%xmm7
call _vpaes_schedule_low_round
@@ -400,8 +400,8 @@ _vpaes_schedule_core:
.type _vpaes_schedule_192_smear,@function
.align 16
_vpaes_schedule_192_smear:
- pshufd $128,%xmm6,%xmm1
- pshufd $254,%xmm7,%xmm0
+ pshufd $0x80,%xmm6,%xmm1
+ pshufd $0xFE,%xmm7,%xmm0
pxor %xmm1,%xmm6
pxor %xmm1,%xmm1
pxor %xmm0,%xmm6
@@ -438,7 +438,7 @@ _vpaes_schedule_round:
pxor %xmm1,%xmm7
- pshufd $255,%xmm0,%xmm0
+ pshufd $0xFF,%xmm0,%xmm0
.byte 102,15,58,15,192,1
@@ -597,7 +597,7 @@ _vpaes_schedule_mangle:
movdqa (%r8,%r10,1),%xmm1
.byte 102,15,56,0,217
addq $-16,%r8
- andq $48,%r8
+ andq $0x30,%r8
movdqu %xmm3,(%rdx)
.byte 0xf3,0xc3
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
@@ -616,7 +616,7 @@ vpaes_set_encrypt_key:
movl %eax,240(%rdx)
movl $0,%ecx
- movl $48,%r8d
+ movl $0x30,%r8d
call _vpaes_schedule_core
xorl %eax,%eax
.byte 0xf3,0xc3
diff --git a/linux-x86_64/crypto/bn/rsaz-x86_64.S b/linux-x86_64/crypto/bn/rsaz-x86_64.S
index dd3d3106..21531d1c 100644
--- a/linux-x86_64/crypto/bn/rsaz-x86_64.S
+++ b/linux-x86_64/crypto/bn/rsaz-x86_64.S
@@ -466,48 +466,94 @@ rsaz_512_mul_gather4:
pushq %r14
pushq %r15
- movl %r9d,%r9d
- subq $128+24,%rsp
+ subq $152,%rsp
.Lmul_gather4_body:
- movl 64(%rdx,%r9,4),%eax
-.byte 102,72,15,110,199
- movl (%rdx,%r9,4),%ebx
-.byte 102,72,15,110,201
+ movd %r9d,%xmm8
+ movdqa .Linc+16(%rip),%xmm1
+ movdqa .Linc(%rip),%xmm0
+
+ pshufd $0,%xmm8,%xmm8
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm8,%xmm0
+ movdqa %xmm7,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm8,%xmm1
+ movdqa %xmm7,%xmm4
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm8,%xmm2
+ movdqa %xmm7,%xmm5
+ paddd %xmm3,%xmm4
+ pcmpeqd %xmm8,%xmm3
+ movdqa %xmm7,%xmm6
+ paddd %xmm4,%xmm5
+ pcmpeqd %xmm8,%xmm4
+ paddd %xmm5,%xmm6
+ pcmpeqd %xmm8,%xmm5
+ paddd %xmm6,%xmm7
+ pcmpeqd %xmm8,%xmm6
+ pcmpeqd %xmm8,%xmm7
+
+ movdqa 0(%rdx),%xmm8
+ movdqa 16(%rdx),%xmm9
+ movdqa 32(%rdx),%xmm10
+ movdqa 48(%rdx),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rdx),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rdx),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rdx),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rdx),%xmm15
+ leaq 128(%rdx),%rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+.byte 102,76,15,126,195
+
movq %r8,128(%rsp)
+ movq %rdi,128+8(%rsp)
+ movq %rcx,128+16(%rsp)
- shlq $32,%rax
- orq %rax,%rbx
movq (%rsi),%rax
movq 8(%rsi),%rcx
- leaq 128(%rdx,%r9,4),%rbp
mulq %rbx
movq %rax,(%rsp)
movq %rcx,%rax
movq %rdx,%r8
mulq %rbx
- movd (%rbp),%xmm4
addq %rax,%r8
movq 16(%rsi),%rax
movq %rdx,%r9
adcq $0,%r9
mulq %rbx
- movd 64(%rbp),%xmm5
addq %rax,%r9
movq 24(%rsi),%rax
movq %rdx,%r10
adcq $0,%r10
mulq %rbx
- pslldq $4,%xmm5
addq %rax,%r10
movq 32(%rsi),%rax
movq %rdx,%r11
adcq $0,%r11
mulq %rbx
- por %xmm5,%xmm4
addq %rax,%r11
movq 40(%rsi),%rax
movq %rdx,%r12
@@ -520,14 +566,12 @@ rsaz_512_mul_gather4:
adcq $0,%r13
mulq %rbx
- leaq 128(%rbp),%rbp
addq %rax,%r13
movq 56(%rsi),%rax
movq %rdx,%r14
adcq $0,%r14
mulq %rbx
-.byte 102,72,15,126,227
addq %rax,%r14
movq (%rsi),%rax
movq %rdx,%r15
@@ -539,6 +583,35 @@ rsaz_512_mul_gather4:
.align 32
.Loop_mul_gather:
+ movdqa 0(%rbp),%xmm8
+ movdqa 16(%rbp),%xmm9
+ movdqa 32(%rbp),%xmm10
+ movdqa 48(%rbp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rbp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rbp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rbp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rbp),%xmm15
+ leaq 128(%rbp),%rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+.byte 102,76,15,126,195
+
mulq %rbx
addq %rax,%r8
movq 8(%rsi),%rax
@@ -547,7 +620,6 @@ rsaz_512_mul_gather4:
adcq $0,%r8
mulq %rbx
- movd (%rbp),%xmm4
addq %rax,%r9
movq 16(%rsi),%rax
adcq $0,%rdx
@@ -556,7 +628,6 @@ rsaz_512_mul_gather4:
adcq $0,%r9
mulq %rbx
- movd 64(%rbp),%xmm5
addq %rax,%r10
movq 24(%rsi),%rax
adcq $0,%rdx
@@ -565,7 +636,6 @@ rsaz_512_mul_gather4:
adcq $0,%r10
mulq %rbx
- pslldq $4,%xmm5
addq %rax,%r11
movq 32(%rsi),%rax
adcq $0,%rdx
@@ -574,7 +644,6 @@ rsaz_512_mul_gather4:
adcq $0,%r11
mulq %rbx
- por %xmm5,%xmm4
addq %rax,%r12
movq 40(%rsi),%rax
adcq $0,%rdx
@@ -599,7 +668,6 @@ rsaz_512_mul_gather4:
adcq $0,%r14
mulq %rbx
-.byte 102,72,15,126,227
addq %rax,%r15
movq (%rsi),%rax
adcq $0,%rdx
@@ -607,7 +675,6 @@ rsaz_512_mul_gather4:
movq %rdx,%r15
adcq $0,%r15
- leaq 128(%rbp),%rbp
leaq 8(%rdi),%rdi
decl %ecx
@@ -622,8 +689,8 @@ rsaz_512_mul_gather4:
movq %r14,48(%rdi)
movq %r15,56(%rdi)
-.byte 102,72,15,126,199
-.byte 102,72,15,126,205
+ movq 128+8(%rsp),%rdi
+ movq 128+16(%rsp),%rbp
movq (%rsp),%r8
movq 8(%rsp),%r9
@@ -673,7 +740,7 @@ rsaz_512_mul_scatter4:
movl %r9d,%r9d
subq $128+24,%rsp
.Lmul_scatter4_body:
- leaq (%r8,%r9,4),%r8
+ leaq (%r8,%r9,8),%r8
.byte 102,72,15,110,199
.byte 102,72,15,110,202
.byte 102,73,15,110,208
@@ -709,30 +776,14 @@ rsaz_512_mul_scatter4:
call __rsaz_512_subtract
- movl %r8d,0(%rsi)
- shrq $32,%r8
- movl %r9d,128(%rsi)
- shrq $32,%r9
- movl %r10d,256(%rsi)
- shrq $32,%r10
- movl %r11d,384(%rsi)
- shrq $32,%r11
- movl %r12d,512(%rsi)
- shrq $32,%r12
- movl %r13d,640(%rsi)
- shrq $32,%r13
- movl %r14d,768(%rsi)
- shrq $32,%r14
- movl %r15d,896(%rsi)
- shrq $32,%r15
- movl %r8d,64(%rsi)
- movl %r9d,192(%rsi)
- movl %r10d,320(%rsi)
- movl %r11d,448(%rsi)
- movl %r12d,576(%rsi)
- movl %r13d,704(%rsi)
- movl %r14d,832(%rsi)
- movl %r15d,960(%rsi)
+ movq %r8,0(%rsi)
+ movq %r9,128(%rsi)
+ movq %r10,256(%rsi)
+ movq %r11,384(%rsi)
+ movq %r12,512(%rsi)
+ movq %r13,640(%rsi)
+ movq %r14,768(%rsi)
+ movq %r15,896(%rsi)
leaq 128+24+48(%rsp),%rax
movq -48(%rax),%r15
@@ -1087,16 +1138,14 @@ __rsaz_512_mul:
.type rsaz_512_scatter4,@function
.align 16
rsaz_512_scatter4:
- leaq (%rdi,%rdx,4),%rdi
+ leaq (%rdi,%rdx,8),%rdi
movl $8,%r9d
jmp .Loop_scatter
.align 16
.Loop_scatter:
movq (%rsi),%rax
leaq 8(%rsi),%rsi
- movl %eax,(%rdi)
- shrq $32,%rax
- movl %eax,64(%rdi)
+ movq %rax,(%rdi)
leaq 128(%rdi),%rdi
decl %r9d
jnz .Loop_scatter
@@ -1108,20 +1157,73 @@ rsaz_512_scatter4:
.type rsaz_512_gather4,@function
.align 16
rsaz_512_gather4:
- leaq (%rsi,%rdx,4),%rsi
+ movd %edx,%xmm8
+ movdqa .Linc+16(%rip),%xmm1
+ movdqa .Linc(%rip),%xmm0
+
+ pshufd $0,%xmm8,%xmm8
+ movdqa %xmm1,%xmm7
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm8,%xmm0
+ movdqa %xmm7,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm8,%xmm1
+ movdqa %xmm7,%xmm4
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm8,%xmm2
+ movdqa %xmm7,%xmm5
+ paddd %xmm3,%xmm4
+ pcmpeqd %xmm8,%xmm3
+ movdqa %xmm7,%xmm6
+ paddd %xmm4,%xmm5
+ pcmpeqd %xmm8,%xmm4
+ paddd %xmm5,%xmm6
+ pcmpeqd %xmm8,%xmm5
+ paddd %xmm6,%xmm7
+ pcmpeqd %xmm8,%xmm6
+ pcmpeqd %xmm8,%xmm7
movl $8,%r9d
jmp .Loop_gather
.align 16
.Loop_gather:
- movl (%rsi),%eax
- movl 64(%rsi),%r8d
+ movdqa 0(%rsi),%xmm8
+ movdqa 16(%rsi),%xmm9
+ movdqa 32(%rsi),%xmm10
+ movdqa 48(%rsi),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rsi),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rsi),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rsi),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rsi),%xmm15
leaq 128(%rsi),%rsi
- shlq $32,%r8
- orq %r8,%rax
- movq %rax,(%rdi)
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+ movq %xmm8,(%rdi)
leaq 8(%rdi),%rdi
decl %r9d
jnz .Loop_gather
.byte 0xf3,0xc3
+.LSEH_end_rsaz_512_gather4:
.size rsaz_512_gather4,.-rsaz_512_gather4
+
+.align 64
+.Linc:
+.long 0,0, 1,1
+.long 2,2, 2,2
#endif
diff --git a/linux-x86_64/crypto/bn/x86_64-mont.S b/linux-x86_64/crypto/bn/x86_64-mont.S
index 4d401c67..83926ad7 100644
--- a/linux-x86_64/crypto/bn/x86_64-mont.S
+++ b/linux-x86_64/crypto/bn/x86_64-mont.S
@@ -636,20 +636,20 @@ bn_sqr8x_mont:
- leaq -64(%rsp,%r9,4),%r11
+ leaq -64(%rsp,%r9,2),%r11
movq (%r8),%r8
subq %rsi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lsqr8x_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,4),%rsp
+ leaq -64(%rsp,%r9,2),%rsp
jmp .Lsqr8x_sp_done
.align 32
.Lsqr8x_sp_alt:
- leaq 4096-64(,%r9,4),%r10
- leaq -64(%rsp,%r9,4),%rsp
+ leaq 4096-64(,%r9,2),%r10
+ leaq -64(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -659,58 +659,80 @@ bn_sqr8x_mont:
movq %r9,%r10
negq %r9
- leaq 64(%rsp,%r9,2),%r11
movq %r8,32(%rsp)
movq %rax,40(%rsp)
.Lsqr8x_body:
- movq %r9,%rbp
-.byte 102,73,15,110,211
- shrq $3+2,%rbp
- movl OPENSSL_ia32cap_P+8(%rip),%eax
- jmp .Lsqr8x_copy_n
-
-.align 32
-.Lsqr8x_copy_n:
- movq 0(%rcx),%xmm0
- movq 8(%rcx),%xmm1
- movq 16(%rcx),%xmm3
- movq 24(%rcx),%xmm4
- leaq 32(%rcx),%rcx
- movdqa %xmm0,0(%r11)
- movdqa %xmm1,16(%r11)
- movdqa %xmm3,32(%r11)
- movdqa %xmm4,48(%r11)
- leaq 64(%r11),%r11
- decq %rbp
- jnz .Lsqr8x_copy_n
-
+.byte 102,72,15,110,209
pxor %xmm0,%xmm0
.byte 102,72,15,110,207
.byte 102,73,15,110,218
call bn_sqr8x_internal
+
+
+
+ leaq (%rdi,%r9,1),%rbx
+ movq %r9,%rcx
+ movq %r9,%rdx
+.byte 102,72,15,126,207
+ sarq $3+2,%rcx
+ jmp .Lsqr8x_sub
+
+.align 32
+.Lsqr8x_sub:
+ movq 0(%rbx),%r12
+ movq 8(%rbx),%r13
+ movq 16(%rbx),%r14
+ movq 24(%rbx),%r15
+ leaq 32(%rbx),%rbx
+ sbbq 0(%rbp),%r12
+ sbbq 8(%rbp),%r13
+ sbbq 16(%rbp),%r14
+ sbbq 24(%rbp),%r15
+ leaq 32(%rbp),%rbp
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ leaq 32(%rdi),%rdi
+ incq %rcx
+ jnz .Lsqr8x_sub
+
+ sbbq $0,%rax
+ leaq (%rbx,%r9,1),%rbx
+ leaq (%rdi,%r9,1),%rdi
+
+.byte 102,72,15,110,200
pxor %xmm0,%xmm0
- leaq 48(%rsp),%rax
- leaq 64(%rsp,%r9,2),%rdx
- shrq $3+2,%r9
+ pshufd $0,%xmm1,%xmm1
movq 40(%rsp),%rsi
- jmp .Lsqr8x_zero
+ jmp .Lsqr8x_cond_copy
.align 32
-.Lsqr8x_zero:
- movdqa %xmm0,0(%rax)
- movdqa %xmm0,16(%rax)
- movdqa %xmm0,32(%rax)
- movdqa %xmm0,48(%rax)
- leaq 64(%rax),%rax
- movdqa %xmm0,0(%rdx)
- movdqa %xmm0,16(%rdx)
- movdqa %xmm0,32(%rdx)
- movdqa %xmm0,48(%rdx)
- leaq 64(%rdx),%rdx
- decq %r9
- jnz .Lsqr8x_zero
+.Lsqr8x_cond_copy:
+ movdqa 0(%rbx),%xmm2
+ movdqa 16(%rbx),%xmm3
+ leaq 32(%rbx),%rbx
+ movdqu 0(%rdi),%xmm4
+ movdqu 16(%rdi),%xmm5
+ leaq 32(%rdi),%rdi
+ movdqa %xmm0,-32(%rbx)
+ movdqa %xmm0,-16(%rbx)
+ movdqa %xmm0,-32(%rbx,%rdx,1)
+ movdqa %xmm0,-16(%rbx,%rdx,1)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-32(%rdi)
+ movdqu %xmm5,-16(%rdi)
+ addq $32,%r9
+ jnz .Lsqr8x_cond_copy
movq $1,%rax
movq -48(%rsi),%r15
diff --git a/linux-x86_64/crypto/bn/x86_64-mont5.S b/linux-x86_64/crypto/bn/x86_64-mont5.S
index 214064e6..554df1ff 100644
--- a/linux-x86_64/crypto/bn/x86_64-mont5.S
+++ b/linux-x86_64/crypto/bn/x86_64-mont5.S
@@ -17,46 +17,151 @@ bn_mul_mont_gather5:
.Lmul_enter:
movl %r9d,%r9d
movq %rsp,%rax
- movl 8(%rsp),%r10d
+ movd 8(%rsp),%xmm5
+ leaq .Linc(%rip),%r10
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
+
leaq 2(%r9),%r11
negq %r11
- leaq (%rsp,%r11,8),%rsp
+ leaq -264(%rsp,%r11,8),%rsp
andq $-1024,%rsp
movq %rax,8(%rsp,%r9,8)
.Lmul_body:
- movq %rdx,%r12
- movq %r10,%r11
- shrq $3,%r10
- andq $7,%r11
- notq %r10
- leaq .Lmagic_masks(%rip),%rax
- andq $3,%r10
- leaq 96(%r12,%r11,8),%r12
- movq 0(%rax,%r10,8),%xmm4
- movq 8(%rax,%r10,8),%xmm5
- movq 16(%rax,%r10,8),%xmm6
- movq 24(%rax,%r10,8),%xmm7
-
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
+ leaq 128(%rdx),%r12
+ movdqa 0(%r10),%xmm0
+ movdqa 16(%r10),%xmm1
+ leaq 24-112(%rsp,%r9,8),%r10
+ andq $-16,%r10
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+.byte 0x67
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+.byte 0x67
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
.byte 102,72,15,126,195
movq (%r8),%r8
@@ -65,29 +170,14 @@ bn_mul_mont_gather5:
xorq %r14,%r14
xorq %r15,%r15
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
-
movq %r8,%rbp
mulq %rbx
movq %rax,%r10
movq (%rcx),%rax
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq %r10,%rbp
movq %rdx,%r11
- por %xmm2,%xmm0
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
@@ -120,14 +210,12 @@ bn_mul_mont_gather5:
cmpq %r9,%r15
jne .L1st
-.byte 102,72,15,126,195
addq %rax,%r13
- movq (%rsi),%rax
adcq $0,%rdx
addq %r11,%r13
adcq $0,%rdx
- movq %r13,-16(%rsp,%r15,8)
+ movq %r13,-16(%rsp,%r9,8)
movq %rdx,%r13
movq %r10,%r11
@@ -141,33 +229,78 @@ bn_mul_mont_gather5:
jmp .Louter
.align 16
.Louter:
+ leaq 24+128(%rsp,%r9,8),%rdx
+ andq $-16,%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+
+ movq (%rsi),%rax
+.byte 102,72,15,126,195
+
xorq %r15,%r15
movq %r8,%rbp
movq (%rsp),%r10
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
-
mulq %rbx
addq %rax,%r10
movq (%rcx),%rax
adcq $0,%rdx
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-
imulq %r10,%rbp
movq %rdx,%r11
- por %xmm2,%xmm0
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
@@ -203,15 +336,12 @@ bn_mul_mont_gather5:
cmpq %r9,%r15
jne .Linner
-.byte 102,72,15,126,195
-
addq %rax,%r13
- movq (%rsi),%rax
adcq $0,%rdx
addq %r10,%r13
- movq (%rsp,%r15,8),%r10
+ movq (%rsp,%r9,8),%r10
adcq $0,%rdx
- movq %r13,-16(%rsp,%r15,8)
+ movq %r13,-16(%rsp,%r9,8)
movq %rdx,%r13
xorq %rdx,%rdx
@@ -257,6 +387,7 @@ bn_mul_mont_gather5:
movq 8(%rsp,%r9,8),%rsi
movq $1,%rax
+
movq -48(%rsi),%r15
movq -40(%rsi),%r14
movq -32(%rsi),%r13
@@ -279,10 +410,10 @@ bn_mul4x_mont_gather5:
pushq %r13
pushq %r14
pushq %r15
+
.byte 0x67
- movl %r9d,%r10d
shll $3,%r9d
- shll $3+2,%r10d
+ leaq (%r9,%r9,2),%r10
negq %r9
@@ -292,19 +423,21 @@ bn_mul4x_mont_gather5:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lmul4xsp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp .Lmul4xsp_done
.align 32
.Lmul4xsp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -320,6 +453,7 @@ bn_mul4x_mont_gather5:
movq 40(%rsp),%rsi
movq $1,%rax
+
movq -48(%rsi),%r15
movq -40(%rsi),%r14
movq -32(%rsi),%r13
@@ -335,47 +469,141 @@ bn_mul4x_mont_gather5:
.align 32
mul4x_internal:
shlq $5,%r9
- movl 8(%rax),%r10d
- leaq 256(%rdx,%r9,1),%r13
+ movd 8(%rax),%xmm5
+ leaq .Linc(%rip),%rax
+ leaq 128(%rdx,%r9,1),%r13
shrq $5,%r9
- movq %r10,%r11
- shrq $3,%r10
- andq $7,%r11
- notq %r10
- leaq .Lmagic_masks(%rip),%rax
- andq $3,%r10
- leaq 96(%rdx,%r11,8),%r12
- movq 0(%rax,%r10,8),%xmm4
- movq 8(%rax,%r10,8),%xmm5
- addq $7,%r11
- movq 16(%rax,%r10,8),%xmm6
- movq 24(%rax,%r10,8),%xmm7
- andq $7,%r11
-
- movq -96(%r12),%xmm0
- leaq 256(%r12),%r14
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
- pand %xmm6,%xmm2
-.byte 0x67
- por %xmm1,%xmm0
- movq -96(%r14),%xmm1
-.byte 0x67
- pand %xmm7,%xmm3
-.byte 0x67
- por %xmm2,%xmm0
- movq -32(%r14),%xmm2
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 88-112(%rsp,%r9,1),%r10
+ leaq 128(%rdx),%r12
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+.byte 0x67,0x67
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
.byte 0x67
- pand %xmm4,%xmm1
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
.byte 0x67
- por %xmm3,%xmm0
- movq 32(%r14),%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ leaq 256(%r12),%r12
.byte 102,72,15,126,195
- movq 96(%r14),%xmm0
+
movq %r13,16+8(%rsp)
movq %rdi,56+8(%rsp)
@@ -389,26 +617,10 @@ mul4x_internal:
movq %rax,%r10
movq (%rcx),%rax
- pand %xmm5,%xmm2
- pand %xmm6,%xmm3
- por %xmm2,%xmm1
-
imulq %r10,%rbp
-
-
-
-
-
-
-
- leaq 64+8(%rsp,%r11,8),%r14
+ leaq 64+8(%rsp),%r14
movq %rdx,%r11
- pand %xmm7,%xmm0
- por %xmm3,%xmm1
- leaq 512(%r12),%r12
- por %xmm1,%xmm0
-
mulq %rbp
addq %rax,%r10
movq 8(%rsi,%r9,1),%rax
@@ -417,7 +629,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -427,7 +639,7 @@ mul4x_internal:
adcq $0,%rdx
addq %r11,%rdi
leaq 32(%r9),%r15
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdi,(%r14)
movq %rdx,%r13
@@ -437,7 +649,7 @@ mul4x_internal:
.L1st4x:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
leaq 32(%r14),%r14
adcq $0,%rdx
movq %rdx,%r11
@@ -453,7 +665,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -483,7 +695,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -492,7 +704,7 @@ mul4x_internal:
movq 16(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r11,%rdi
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdi,(%r14)
movq %rdx,%r13
@@ -502,7 +714,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
leaq 32(%r14),%r14
adcq $0,%rdx
movq %rdx,%r11
@@ -518,7 +730,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
@@ -531,8 +743,7 @@ mul4x_internal:
movq %rdi,-16(%r14)
movq %rdx,%r13
-.byte 102,72,15,126,195
- leaq (%rcx,%r9,2),%rcx
+ leaq (%rcx,%r9,1),%rcx
xorq %rdi,%rdi
addq %r10,%r13
@@ -543,6 +754,63 @@ mul4x_internal:
.align 32
.Louter4x:
+ leaq 16+128(%r14),%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+.byte 102,72,15,126,195
+
movq (%r14,%r9,1),%r10
movq %r8,%rbp
mulq %rbx
@@ -550,25 +818,11 @@ mul4x_internal:
movq (%rcx),%rax
adcq $0,%rdx
- movq -96(%r12),%xmm0
- movq -32(%r12),%xmm1
- pand %xmm4,%xmm0
- movq 32(%r12),%xmm2
- pand %xmm5,%xmm1
- movq 96(%r12),%xmm3
-
imulq %r10,%rbp
-.byte 0x67
movq %rdx,%r11
movq %rdi,(%r14)
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
- por %xmm2,%xmm0
leaq (%r14,%r9,1),%r14
- leaq 256(%r12),%r12
- por %xmm3,%xmm0
mulq %rbp
addq %rax,%r10
@@ -578,7 +832,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
addq 8(%r14),%r11
adcq $0,%rdx
@@ -590,7 +844,7 @@ mul4x_internal:
adcq $0,%rdx
addq %r11,%rdi
leaq 32(%r9),%r15
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %rdx,%r13
jmp .Linner4x
@@ -599,7 +853,7 @@ mul4x_internal:
.Linner4x:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
adcq $0,%rdx
addq 16(%r14),%r10
leaq 32(%r14),%r14
@@ -617,7 +871,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq -16(%rcx),%rax
+ movq -8(%rcx),%rax
adcq $0,%rdx
addq -8(%r14),%r11
adcq $0,%rdx
@@ -651,7 +905,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
- movq 16(%rcx),%rax
+ movq 8(%rcx),%rax
adcq $0,%rdx
addq 8(%r14),%r11
adcq $0,%rdx
@@ -662,7 +916,7 @@ mul4x_internal:
movq 16(%rsi,%r15,1),%rax
adcq $0,%rdx
addq %r11,%rdi
- leaq 64(%rcx),%rcx
+ leaq 32(%rcx),%rcx
adcq $0,%rdx
movq %r13,-8(%r14)
movq %rdx,%r13
@@ -672,7 +926,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r10
- movq -32(%rcx),%rax
+ movq -16(%rcx),%rax
adcq $0,%rdx
addq 16(%r14),%r10
leaq 32(%r14),%r14
@@ -691,7 +945,7 @@ mul4x_internal:
mulq %rbx
addq %rax,%r11
movq %rbp,%rax
- movq -16(%rcx),%rbp
+ movq -8(%rcx),%rbp
adcq $0,%rdx
addq -8(%r14),%r11
adcq $0,%rdx
@@ -706,9 +960,8 @@ mul4x_internal:
movq %r13,-24(%r14)
movq %rdx,%r13
-.byte 102,72,15,126,195
movq %rdi,-16(%r14)
- leaq (%rcx,%r9,2),%rcx
+ leaq (%rcx,%r9,1),%rcx
xorq %rdi,%rdi
addq %r10,%r13
@@ -719,16 +972,23 @@ mul4x_internal:
cmpq 16+8(%rsp),%r12
jb .Louter4x
+ xorq %rax,%rax
subq %r13,%rbp
adcq %r15,%r15
orq %r15,%rdi
- xorq $1,%rdi
+ subq %rdi,%rax
leaq (%r14,%r9,1),%rbx
- leaq (%rcx,%rdi,8),%rbp
+ movq (%rcx),%r12
+ leaq (%rcx),%rbp
movq %r9,%rcx
sarq $3+2,%rcx
movq 56+8(%rsp),%rdi
- jmp .Lsqr4x_sub
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqr4x_sub_entry
.size mul4x_internal,.-mul4x_internal
.globl bn_power5
.hidden bn_power5
@@ -742,9 +1002,9 @@ bn_power5:
pushq %r13
pushq %r14
pushq %r15
- movl %r9d,%r10d
+
shll $3,%r9d
- shll $3+2,%r10d
+ leal (%r9,%r9,2),%r10d
negq %r9
movq (%r8),%r8
@@ -754,19 +1014,20 @@ bn_power5:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lpwr_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp .Lpwr_sp_done
.align 32
.Lpwr_sp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -794,10 +1055,15 @@ bn_power5:
.byte 102,72,15,110,226
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
call __bn_sqr8x_internal
+ call __bn_post4x_internal
.byte 102,72,15,126,209
.byte 102,72,15,126,226
@@ -1342,9 +1608,9 @@ __bn_sqr8x_internal:
movq %rbx,-16(%rdi)
movq %r8,-8(%rdi)
.byte 102,72,15,126,213
-sqr8x_reduction:
+__bn_sqr8x_reduction:
xorq %rax,%rax
- leaq (%rbp,%r9,2),%rcx
+ leaq (%r9,%rbp,1),%rcx
leaq 48+8(%rsp,%r9,2),%rdx
movq %rcx,0+8(%rsp)
leaq 48+8(%rsp,%r9,1),%rdi
@@ -1377,14 +1643,14 @@ sqr8x_reduction:
.align 32
.L8x_reduce:
mulq %rbx
- movq 16(%rbp),%rax
+ movq 8(%rbp),%rax
negq %r8
movq %rdx,%r8
adcq $0,%r8
mulq %rbx
addq %rax,%r9
- movq 32(%rbp),%rax
+ movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
movq %rbx,48-8+8(%rsp,%rcx,8)
@@ -1393,7 +1659,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r10
- movq 48(%rbp),%rax
+ movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq 32+8(%rsp),%rsi
@@ -1402,7 +1668,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r11
- movq 64(%rbp),%rax
+ movq 32(%rbp),%rax
adcq $0,%rdx
imulq %r8,%rsi
addq %r11,%r10
@@ -1411,7 +1677,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r12
- movq 80(%rbp),%rax
+ movq 40(%rbp),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
@@ -1419,7 +1685,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r13
- movq 96(%rbp),%rax
+ movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
@@ -1427,7 +1693,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r14
- movq 112(%rbp),%rax
+ movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
@@ -1445,7 +1711,7 @@ sqr8x_reduction:
decl %ecx
jnz .L8x_reduce
- leaq 128(%rbp),%rbp
+ leaq 64(%rbp),%rbp
xorq %rax,%rax
movq 8+8(%rsp),%rdx
cmpq 0+8(%rsp),%rbp
@@ -1471,14 +1737,14 @@ sqr8x_reduction:
.L8x_tail:
mulq %rbx
addq %rax,%r8
- movq 16(%rbp),%rax
+ movq 8(%rbp),%rax
movq %r8,(%rdi)
movq %rdx,%r8
adcq $0,%r8
mulq %rbx
addq %rax,%r9
- movq 32(%rbp),%rax
+ movq 16(%rbp),%rax
adcq $0,%rdx
addq %r9,%r8
leaq 8(%rdi),%rdi
@@ -1487,7 +1753,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r10
- movq 48(%rbp),%rax
+ movq 24(%rbp),%rax
adcq $0,%rdx
addq %r10,%r9
movq %rdx,%r10
@@ -1495,7 +1761,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r11
- movq 64(%rbp),%rax
+ movq 32(%rbp),%rax
adcq $0,%rdx
addq %r11,%r10
movq %rdx,%r11
@@ -1503,7 +1769,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r12
- movq 80(%rbp),%rax
+ movq 40(%rbp),%rax
adcq $0,%rdx
addq %r12,%r11
movq %rdx,%r12
@@ -1511,7 +1777,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r13
- movq 96(%rbp),%rax
+ movq 48(%rbp),%rax
adcq $0,%rdx
addq %r13,%r12
movq %rdx,%r13
@@ -1519,7 +1785,7 @@ sqr8x_reduction:
mulq %rbx
addq %rax,%r14
- movq 112(%rbp),%rax
+ movq 56(%rbp),%rax
adcq $0,%rdx
addq %r14,%r13
movq %rdx,%r14
@@ -1537,7 +1803,7 @@ sqr8x_reduction:
decl %ecx
jnz .L8x_tail
- leaq 128(%rbp),%rbp
+ leaq 64(%rbp),%rbp
movq 8+8(%rsp),%rdx
cmpq 0+8(%rsp),%rbp
jae .L8x_tail_done
@@ -1583,7 +1849,7 @@ sqr8x_reduction:
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
adcq $0,%rax
- movq -16(%rbp),%rcx
+ movq -8(%rbp),%rcx
xorq %rsi,%rsi
.byte 102,72,15,126,213
@@ -1601,44 +1867,62 @@ sqr8x_reduction:
cmpq %rdx,%rdi
jb .L8x_reduction_loop
-
- subq %r15,%rcx
+ .byte 0xf3,0xc3
+.size bn_sqr8x_internal,.-bn_sqr8x_internal
+.type __bn_post4x_internal,@function
+.align 32
+__bn_post4x_internal:
+ movq 0(%rbp),%r12
leaq (%rdi,%r9,1),%rbx
- adcq %rsi,%rsi
movq %r9,%rcx
- orq %rsi,%rax
.byte 102,72,15,126,207
- xorq $1,%rax
+ negq %rax
.byte 102,72,15,126,206
- leaq (%rbp,%rax,8),%rbp
sarq $3+2,%rcx
- jmp .Lsqr4x_sub
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqr4x_sub_entry
-.align 32
+.align 16
.Lsqr4x_sub:
-.byte 0x66
- movq 0(%rbx),%r12
- movq 8(%rbx),%r13
- sbbq 0(%rbp),%r12
- movq 16(%rbx),%r14
- sbbq 16(%rbp),%r13
- movq 24(%rbx),%r15
- leaq 32(%rbx),%rbx
- sbbq 32(%rbp),%r14
+ movq 0(%rbp),%r12
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+.Lsqr4x_sub_entry:
+ leaq 32(%rbp),%rbp
+ notq %r12
+ notq %r13
+ notq %r14
+ notq %r15
+ andq %rax,%r12
+ andq %rax,%r13
+ andq %rax,%r14
+ andq %rax,%r15
+
+ negq %r10
+ adcq 0(%rbx),%r12
+ adcq 8(%rbx),%r13
+ adcq 16(%rbx),%r14
+ adcq 24(%rbx),%r15
movq %r12,0(%rdi)
- sbbq 48(%rbp),%r15
- leaq 64(%rbp),%rbp
+ leaq 32(%rbx),%rbx
movq %r13,8(%rdi)
+ sbbq %r10,%r10
movq %r14,16(%rdi)
movq %r15,24(%rdi)
leaq 32(%rdi),%rdi
incq %rcx
jnz .Lsqr4x_sub
+
movq %r9,%r10
negq %r9
.byte 0xf3,0xc3
-.size bn_sqr8x_internal,.-bn_sqr8x_internal
+.size __bn_post4x_internal,.-__bn_post4x_internal
.globl bn_from_montgomery
.hidden bn_from_montgomery
.type bn_from_montgomery,@function
@@ -1661,10 +1945,9 @@ bn_from_mont8x:
pushq %r13
pushq %r14
pushq %r15
-.byte 0x67
- movl %r9d,%r10d
+
shll $3,%r9d
- shll $3+2,%r10d
+ leaq (%r9,%r9,2),%r10
negq %r9
movq (%r8),%r8
@@ -1674,19 +1957,20 @@ bn_from_mont8x:
- leaq -64(%rsp,%r9,2),%r11
- subq %rsi,%r11
+
+ leaq -320(%rsp,%r9,2),%r11
+ subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lfrom_sp_alt
subq %r11,%rsp
- leaq -64(%rsp,%r9,2),%rsp
+ leaq -320(%rsp,%r9,2),%rsp
jmp .Lfrom_sp_done
.align 32
.Lfrom_sp_alt:
- leaq 4096-64(,%r9,2),%r10
- leaq -64(%rsp,%r9,2),%rsp
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rsp,%r9,2),%rsp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
@@ -1737,7 +2021,8 @@ bn_from_mont8x:
.byte 0x67
movq %rcx,%rbp
.byte 102,73,15,110,218
- call sqr8x_reduction
+ call __bn_sqr8x_reduction
+ call __bn_post4x_internal
pxor %xmm0,%xmm0
leaq 48(%rsp),%rax
@@ -1787,46 +2072,170 @@ bn_scatter5:
.globl bn_gather5
.hidden bn_gather5
.type bn_gather5,@function
-.align 16
+.align 32
bn_gather5:
- movl %ecx,%r11d
- shrl $3,%ecx
- andq $7,%r11
- notl %ecx
- leaq .Lmagic_masks(%rip),%rax
- andl $3,%ecx
- leaq 128(%rdx,%r11,8),%rdx
- movq 0(%rax,%rcx,8),%xmm4
- movq 8(%rax,%rcx,8),%xmm5
- movq 16(%rax,%rcx,8),%xmm6
- movq 24(%rax,%rcx,8),%xmm7
+.LSEH_begin_bn_gather5:
+
+.byte 0x4c,0x8d,0x14,0x24
+.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00
+ leaq .Linc(%rip),%rax
+ andq $-16,%rsp
+
+ movd %ecx,%xmm5
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 128(%rdx),%r11
+ leaq 128(%rsp),%rax
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-128(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-112(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-96(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-80(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-48(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-16(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,16(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,48(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,80(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,96(%rax)
+ movdqa %xmm4,%xmm2
+ movdqa %xmm3,112(%rax)
jmp .Lgather
-.align 16
-.Lgather:
- movq -128(%rdx),%xmm0
- movq -64(%rdx),%xmm1
- pand %xmm4,%xmm0
- movq 0(%rdx),%xmm2
- pand %xmm5,%xmm1
- movq 64(%rdx),%xmm3
- pand %xmm6,%xmm2
- por %xmm1,%xmm0
- pand %xmm7,%xmm3
-.byte 0x67,0x67
- por %xmm2,%xmm0
- leaq 256(%rdx),%rdx
- por %xmm3,%xmm0
+.align 32
+.Lgather:
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r11),%xmm0
+ movdqa -112(%r11),%xmm1
+ movdqa -96(%r11),%xmm2
+ pand -128(%rax),%xmm0
+ movdqa -80(%r11),%xmm3
+ pand -112(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r11),%xmm0
+ movdqa -48(%r11),%xmm1
+ movdqa -32(%r11),%xmm2
+ pand -64(%rax),%xmm0
+ movdqa -16(%r11),%xmm3
+ pand -48(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r11),%xmm0
+ movdqa 16(%r11),%xmm1
+ movdqa 32(%r11),%xmm2
+ pand 0(%rax),%xmm0
+ movdqa 48(%r11),%xmm3
+ pand 16(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r11),%xmm0
+ movdqa 80(%r11),%xmm1
+ movdqa 96(%r11),%xmm2
+ pand 64(%rax),%xmm0
+ movdqa 112(%r11),%xmm3
+ pand 80(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ leaq 256(%r11),%r11
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
movq %xmm0,(%rdi)
leaq 8(%rdi),%rdi
subl $1,%esi
jnz .Lgather
+
+ leaq (%r10),%rsp
.byte 0xf3,0xc3
.LSEH_end_bn_gather5:
.size bn_gather5,.-bn_gather5
.align 64
-.Lmagic_masks:
-.long 0,0, 0,0, 0,0, -1,-1
-.long 0,0, 0,0, 0,0, 0,0
+.Linc:
+.long 0,0, 1,1
+.long 2,2, 2,2
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
#endif
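
The x86_64-mont5.S hunks above replace the old .Lmagic_masks gather in bn_gather5 with an .Linc-based scheme: the requested window index is broadcast with pshufd, compared against an incrementing counter with pcmpeqd to build a full set of selection masks on the stack, and then every table entry is read and combined with pand/por. The point of the rewrite is that the load addresses no longer depend on the (secret) index. For illustration only, a minimal plain-C sketch of that constant-time gather idea, scalar and one 64-bit limb per entry (the function name and table shape are assumptions, not BoringSSL's API):

    #include <stdint.h>

    /* All-ones when v == 0, all-zeros otherwise, without branching. */
    static uint64_t ct_is_zero_mask(uint64_t v) {
        uint64_t m = ~v & (v - 1);       /* MSB set exactly when v == 0 */
        return (uint64_t)0 - (m >> 63);  /* broadcast that MSB to every bit */
    }

    /* Read every entry of the table and keep only the one at idx, so the
     * memory access pattern is independent of the secret index. */
    static uint64_t ct_gather16(const uint64_t table[16], uint64_t idx) {
        uint64_t out = 0;
        for (uint64_t i = 0; i < 16; i++) {
            out |= table[i] & ct_is_zero_mask(i ^ idx);
        }
        return out;
    }
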
diff --git a/linux-x86_64/crypto/chacha/chacha-x86_64.S b/linux-x86_64/crypto/chacha/chacha-x86_64.S
new file mode 100644
index 00000000..e994940a
--- /dev/null
+++ b/linux-x86_64/crypto/chacha/chacha-x86_64.S
@@ -0,0 +1,1585 @@
+#if defined(__x86_64__)
+.text
+
+.extern OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+.align 64
+.Lzero:
+.long 0,0,0,0
+.Lone:
+.long 1,0,0,0
+.Linc:
+.long 0,1,2,3
+.Lfour:
+.long 4,4,4,4
+.Lincy:
+.long 0,2,4,6,1,3,5,7
+.Leight:
+.long 8,8,8,8,8,8,8,8
+.Lrot16:
+.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
+.Lrot24:
+.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
+.Lsigma:
+.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.globl ChaCha20_ctr32
+.hidden ChaCha20_ctr32
+.type ChaCha20_ctr32,@function
+.align 64
+ChaCha20_ctr32:
+ cmpq $0,%rdx
+ je .Lno_data
+ movq OPENSSL_ia32cap_P+4(%rip),%r10
+ testl $512,%r10d
+ jnz .LChaCha20_ssse3
+
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ subq $64+24,%rsp
+
+
+ movdqu (%rcx),%xmm1
+ movdqu 16(%rcx),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa .Lone(%rip),%xmm4
+
+
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ movq %rdx,%rbp
+ jmp .Loop_outer
+
+.align 32
+.Loop_outer:
+ movl $0x61707865,%eax
+ movl $0x3320646e,%ebx
+ movl $0x79622d32,%ecx
+ movl $0x6b206574,%edx
+ movl 16(%rsp),%r8d
+ movl 20(%rsp),%r9d
+ movl 24(%rsp),%r10d
+ movl 28(%rsp),%r11d
+ movd %xmm3,%r12d
+ movl 52(%rsp),%r13d
+ movl 56(%rsp),%r14d
+ movl 60(%rsp),%r15d
+
+ movq %rbp,64+0(%rsp)
+ movl $10,%ebp
+ movq %rsi,64+8(%rsp)
+.byte 102,72,15,126,214
+ movq %rdi,64+16(%rsp)
+ movq %rsi,%rdi
+ shrq $32,%rdi
+ jmp .Loop
+
+.align 32
+.Loop:
+ addl %r8d,%eax
+ xorl %eax,%r12d
+ roll $16,%r12d
+ addl %r9d,%ebx
+ xorl %ebx,%r13d
+ roll $16,%r13d
+ addl %r12d,%esi
+ xorl %esi,%r8d
+ roll $12,%r8d
+ addl %r13d,%edi
+ xorl %edi,%r9d
+ roll $12,%r9d
+ addl %r8d,%eax
+ xorl %eax,%r12d
+ roll $8,%r12d
+ addl %r9d,%ebx
+ xorl %ebx,%r13d
+ roll $8,%r13d
+ addl %r12d,%esi
+ xorl %esi,%r8d
+ roll $7,%r8d
+ addl %r13d,%edi
+ xorl %edi,%r9d
+ roll $7,%r9d
+ movl %esi,32(%rsp)
+ movl %edi,36(%rsp)
+ movl 40(%rsp),%esi
+ movl 44(%rsp),%edi
+ addl %r10d,%ecx
+ xorl %ecx,%r14d
+ roll $16,%r14d
+ addl %r11d,%edx
+ xorl %edx,%r15d
+ roll $16,%r15d
+ addl %r14d,%esi
+ xorl %esi,%r10d
+ roll $12,%r10d
+ addl %r15d,%edi
+ xorl %edi,%r11d
+ roll $12,%r11d
+ addl %r10d,%ecx
+ xorl %ecx,%r14d
+ roll $8,%r14d
+ addl %r11d,%edx
+ xorl %edx,%r15d
+ roll $8,%r15d
+ addl %r14d,%esi
+ xorl %esi,%r10d
+ roll $7,%r10d
+ addl %r15d,%edi
+ xorl %edi,%r11d
+ roll $7,%r11d
+ addl %r9d,%eax
+ xorl %eax,%r15d
+ roll $16,%r15d
+ addl %r10d,%ebx
+ xorl %ebx,%r12d
+ roll $16,%r12d
+ addl %r15d,%esi
+ xorl %esi,%r9d
+ roll $12,%r9d
+ addl %r12d,%edi
+ xorl %edi,%r10d
+ roll $12,%r10d
+ addl %r9d,%eax
+ xorl %eax,%r15d
+ roll $8,%r15d
+ addl %r10d,%ebx
+ xorl %ebx,%r12d
+ roll $8,%r12d
+ addl %r15d,%esi
+ xorl %esi,%r9d
+ roll $7,%r9d
+ addl %r12d,%edi
+ xorl %edi,%r10d
+ roll $7,%r10d
+ movl %esi,40(%rsp)
+ movl %edi,44(%rsp)
+ movl 32(%rsp),%esi
+ movl 36(%rsp),%edi
+ addl %r11d,%ecx
+ xorl %ecx,%r13d
+ roll $16,%r13d
+ addl %r8d,%edx
+ xorl %edx,%r14d
+ roll $16,%r14d
+ addl %r13d,%esi
+ xorl %esi,%r11d
+ roll $12,%r11d
+ addl %r14d,%edi
+ xorl %edi,%r8d
+ roll $12,%r8d
+ addl %r11d,%ecx
+ xorl %ecx,%r13d
+ roll $8,%r13d
+ addl %r8d,%edx
+ xorl %edx,%r14d
+ roll $8,%r14d
+ addl %r13d,%esi
+ xorl %esi,%r11d
+ roll $7,%r11d
+ addl %r14d,%edi
+ xorl %edi,%r8d
+ roll $7,%r8d
+ decl %ebp
+ jnz .Loop
+ movl %edi,36(%rsp)
+ movl %esi,32(%rsp)
+ movq 64(%rsp),%rbp
+ movdqa %xmm2,%xmm1
+ movq 64+8(%rsp),%rsi
+ paddd %xmm4,%xmm3
+ movq 64+16(%rsp),%rdi
+
+ addl $0x61707865,%eax
+ addl $0x3320646e,%ebx
+ addl $0x79622d32,%ecx
+ addl $0x6b206574,%edx
+ addl 16(%rsp),%r8d
+ addl 20(%rsp),%r9d
+ addl 24(%rsp),%r10d
+ addl 28(%rsp),%r11d
+ addl 48(%rsp),%r12d
+ addl 52(%rsp),%r13d
+ addl 56(%rsp),%r14d
+ addl 60(%rsp),%r15d
+ paddd 32(%rsp),%xmm1
+
+ cmpq $64,%rbp
+ jb .Ltail
+
+ xorl 0(%rsi),%eax
+ xorl 4(%rsi),%ebx
+ xorl 8(%rsi),%ecx
+ xorl 12(%rsi),%edx
+ xorl 16(%rsi),%r8d
+ xorl 20(%rsi),%r9d
+ xorl 24(%rsi),%r10d
+ xorl 28(%rsi),%r11d
+ movdqu 32(%rsi),%xmm0
+ xorl 48(%rsi),%r12d
+ xorl 52(%rsi),%r13d
+ xorl 56(%rsi),%r14d
+ xorl 60(%rsi),%r15d
+ leaq 64(%rsi),%rsi
+ pxor %xmm1,%xmm0
+
+ movdqa %xmm2,32(%rsp)
+ movd %xmm3,48(%rsp)
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ movdqu %xmm0,32(%rdi)
+ movl %r12d,48(%rdi)
+ movl %r13d,52(%rdi)
+ movl %r14d,56(%rdi)
+ movl %r15d,60(%rdi)
+ leaq 64(%rdi),%rdi
+
+ subq $64,%rbp
+ jnz .Loop_outer
+
+ jmp .Ldone
+
+.align 16
+.Ltail:
+ movl %eax,0(%rsp)
+ movl %ebx,4(%rsp)
+ xorq %rbx,%rbx
+ movl %ecx,8(%rsp)
+ movl %edx,12(%rsp)
+ movl %r8d,16(%rsp)
+ movl %r9d,20(%rsp)
+ movl %r10d,24(%rsp)
+ movl %r11d,28(%rsp)
+ movdqa %xmm1,32(%rsp)
+ movl %r12d,48(%rsp)
+ movl %r13d,52(%rsp)
+ movl %r14d,56(%rsp)
+ movl %r15d,60(%rsp)
+
+.Loop_tail:
+ movzbl (%rsi,%rbx,1),%eax
+ movzbl (%rsp,%rbx,1),%edx
+ leaq 1(%rbx),%rbx
+ xorl %edx,%eax
+ movb %al,-1(%rdi,%rbx,1)
+ decq %rbp
+ jnz .Loop_tail
+
+.Ldone:
+ addq $64+24,%rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+.Lno_data:
+ .byte 0xf3,0xc3
+.size ChaCha20_ctr32,.-ChaCha20_ctr32
+.type ChaCha20_ssse3,@function
+.align 32
+ChaCha20_ssse3:
+.LChaCha20_ssse3:
+ cmpq $128,%rdx
+ ja .LChaCha20_4x
+
+.Ldo_sse3_after_all:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ subq $64+24,%rsp
+ movdqa .Lsigma(%rip),%xmm0
+ movdqu (%rcx),%xmm1
+ movdqu 16(%rcx),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa .Lrot16(%rip),%xmm6
+ movdqa .Lrot24(%rip),%xmm7
+
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ movl $10,%ebp
+ jmp .Loop_ssse3
+
+.align 32
+.Loop_outer_ssse3:
+ movdqa .Lone(%rip),%xmm3
+ movdqa 0(%rsp),%xmm0
+ movdqa 16(%rsp),%xmm1
+ movdqa 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+ movl $10,%ebp
+ movdqa %xmm3,48(%rsp)
+ jmp .Loop_ssse3
+
+.align 32
+.Loop_ssse3:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decl %ebp
+ jnz .Loop_ssse3
+ paddd 0(%rsp),%xmm0
+ paddd 16(%rsp),%xmm1
+ paddd 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+
+ cmpq $64,%rdx
+ jb .Ltail_ssse3
+
+ movdqu 0(%rsi),%xmm4
+ movdqu 16(%rsi),%xmm5
+ pxor %xmm4,%xmm0
+ movdqu 32(%rsi),%xmm4
+ pxor %xmm5,%xmm1
+ movdqu 48(%rsi),%xmm5
+ leaq 64(%rsi),%rsi
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm3
+
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm1,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ subq $64,%rdx
+ jnz .Loop_outer_ssse3
+
+ jmp .Ldone_ssse3
+
+.align 16
+.Ltail_ssse3:
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ xorq %rbx,%rbx
+
+.Loop_tail_ssse3:
+ movzbl (%rsi,%rbx,1),%eax
+ movzbl (%rsp,%rbx,1),%ecx
+ leaq 1(%rbx),%rbx
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%rbx,1)
+ decq %rdx
+ jnz .Loop_tail_ssse3
+
+.Ldone_ssse3:
+ addq $64+24,%rsp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ .byte 0xf3,0xc3
+.size ChaCha20_ssse3,.-ChaCha20_ssse3
+.type ChaCha20_4x,@function
+.align 32
+ChaCha20_4x:
+.LChaCha20_4x:
+ movq %r10,%r11
+ shrq $32,%r10
+ testq $32,%r10
+ jnz .LChaCha20_8x
+ cmpq $192,%rdx
+ ja .Lproceed4x
+
+ andq $71303168,%r11
+ cmpq $4194304,%r11
+ je .Ldo_sse3_after_all
+
+.Lproceed4x:
+ leaq -120(%rsp),%r11
+ subq $0x148+0,%rsp
+ movdqa .Lsigma(%rip),%xmm11
+ movdqu (%rcx),%xmm15
+ movdqu 16(%rcx),%xmm7
+ movdqu (%r8),%xmm3
+ leaq 256(%rsp),%rcx
+ leaq .Lrot16(%rip),%r10
+ leaq .Lrot24(%rip),%r11
+
+ pshufd $0x00,%xmm11,%xmm8
+ pshufd $0x55,%xmm11,%xmm9
+ movdqa %xmm8,64(%rsp)
+ pshufd $0xaa,%xmm11,%xmm10
+ movdqa %xmm9,80(%rsp)
+ pshufd $0xff,%xmm11,%xmm11
+ movdqa %xmm10,96(%rsp)
+ movdqa %xmm11,112(%rsp)
+
+ pshufd $0x00,%xmm15,%xmm12
+ pshufd $0x55,%xmm15,%xmm13
+ movdqa %xmm12,128-256(%rcx)
+ pshufd $0xaa,%xmm15,%xmm14
+ movdqa %xmm13,144-256(%rcx)
+ pshufd $0xff,%xmm15,%xmm15
+ movdqa %xmm14,160-256(%rcx)
+ movdqa %xmm15,176-256(%rcx)
+
+ pshufd $0x00,%xmm7,%xmm4
+ pshufd $0x55,%xmm7,%xmm5
+ movdqa %xmm4,192-256(%rcx)
+ pshufd $0xaa,%xmm7,%xmm6
+ movdqa %xmm5,208-256(%rcx)
+ pshufd $0xff,%xmm7,%xmm7
+ movdqa %xmm6,224-256(%rcx)
+ movdqa %xmm7,240-256(%rcx)
+
+ pshufd $0x00,%xmm3,%xmm0
+ pshufd $0x55,%xmm3,%xmm1
+ paddd .Linc(%rip),%xmm0
+ pshufd $0xaa,%xmm3,%xmm2
+ movdqa %xmm1,272-256(%rcx)
+ pshufd $0xff,%xmm3,%xmm3
+ movdqa %xmm2,288-256(%rcx)
+ movdqa %xmm3,304-256(%rcx)
+
+ jmp .Loop_enter4x
+
+.align 32
+.Loop_outer4x:
+ movdqa 64(%rsp),%xmm8
+ movdqa 80(%rsp),%xmm9
+ movdqa 96(%rsp),%xmm10
+ movdqa 112(%rsp),%xmm11
+ movdqa 128-256(%rcx),%xmm12
+ movdqa 144-256(%rcx),%xmm13
+ movdqa 160-256(%rcx),%xmm14
+ movdqa 176-256(%rcx),%xmm15
+ movdqa 192-256(%rcx),%xmm4
+ movdqa 208-256(%rcx),%xmm5
+ movdqa 224-256(%rcx),%xmm6
+ movdqa 240-256(%rcx),%xmm7
+ movdqa 256-256(%rcx),%xmm0
+ movdqa 272-256(%rcx),%xmm1
+ movdqa 288-256(%rcx),%xmm2
+ movdqa 304-256(%rcx),%xmm3
+ paddd .Lfour(%rip),%xmm0
+
+.Loop_enter4x:
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm7,48(%rsp)
+ movdqa (%r10),%xmm7
+ movl $10,%eax
+ movdqa %xmm0,256-256(%rcx)
+ jmp .Loop4x
+
+.align 32
+.Loop4x:
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+.byte 102,15,56,0,199
+.byte 102,15,56,0,207
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm6
+ pslld $12,%xmm12
+ psrld $20,%xmm6
+ movdqa %xmm13,%xmm7
+ pslld $12,%xmm13
+ por %xmm6,%xmm12
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm13
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+.byte 102,15,56,0,198
+.byte 102,15,56,0,206
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm7
+ pslld $7,%xmm12
+ psrld $25,%xmm7
+ movdqa %xmm13,%xmm6
+ pslld $7,%xmm13
+ por %xmm7,%xmm12
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm13
+ movdqa %xmm4,0(%rsp)
+ movdqa %xmm5,16(%rsp)
+ movdqa 32(%rsp),%xmm4
+ movdqa 48(%rsp),%xmm5
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+.byte 102,15,56,0,215
+.byte 102,15,56,0,223
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm6
+ pslld $12,%xmm14
+ psrld $20,%xmm6
+ movdqa %xmm15,%xmm7
+ pslld $12,%xmm15
+ por %xmm6,%xmm14
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm15
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+.byte 102,15,56,0,214
+.byte 102,15,56,0,222
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm7
+ pslld $7,%xmm14
+ psrld $25,%xmm7
+ movdqa %xmm15,%xmm6
+ pslld $7,%xmm15
+ por %xmm7,%xmm14
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm15
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+.byte 102,15,56,0,223
+.byte 102,15,56,0,199
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm6
+ pslld $12,%xmm13
+ psrld $20,%xmm6
+ movdqa %xmm14,%xmm7
+ pslld $12,%xmm14
+ por %xmm6,%xmm13
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm14
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+.byte 102,15,56,0,222
+.byte 102,15,56,0,198
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm7
+ pslld $7,%xmm13
+ psrld $25,%xmm7
+ movdqa %xmm14,%xmm6
+ pslld $7,%xmm14
+ por %xmm7,%xmm13
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm14
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm5,48(%rsp)
+ movdqa 0(%rsp),%xmm4
+ movdqa 16(%rsp),%xmm5
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+.byte 102,15,56,0,207
+.byte 102,15,56,0,215
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm6
+ pslld $12,%xmm15
+ psrld $20,%xmm6
+ movdqa %xmm12,%xmm7
+ pslld $12,%xmm12
+ por %xmm6,%xmm15
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm12
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm7
+ pslld $7,%xmm15
+ psrld $25,%xmm7
+ movdqa %xmm12,%xmm6
+ pslld $7,%xmm12
+ por %xmm7,%xmm15
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm12
+ decl %eax
+ jnz .Loop4x
+
+ paddd 64(%rsp),%xmm8
+ paddd 80(%rsp),%xmm9
+ paddd 96(%rsp),%xmm10
+ paddd 112(%rsp),%xmm11
+
+ movdqa %xmm8,%xmm6
+ punpckldq %xmm9,%xmm8
+ movdqa %xmm10,%xmm7
+ punpckldq %xmm11,%xmm10
+ punpckhdq %xmm9,%xmm6
+ punpckhdq %xmm11,%xmm7
+ movdqa %xmm8,%xmm9
+ punpcklqdq %xmm10,%xmm8
+ movdqa %xmm6,%xmm11
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm10,%xmm9
+ punpckhqdq %xmm7,%xmm11
+ paddd 128-256(%rcx),%xmm12
+ paddd 144-256(%rcx),%xmm13
+ paddd 160-256(%rcx),%xmm14
+ paddd 176-256(%rcx),%xmm15
+
+ movdqa %xmm8,0(%rsp)
+ movdqa %xmm9,16(%rsp)
+ movdqa 32(%rsp),%xmm8
+ movdqa 48(%rsp),%xmm9
+
+ movdqa %xmm12,%xmm10
+ punpckldq %xmm13,%xmm12
+ movdqa %xmm14,%xmm7
+ punpckldq %xmm15,%xmm14
+ punpckhdq %xmm13,%xmm10
+ punpckhdq %xmm15,%xmm7
+ movdqa %xmm12,%xmm13
+ punpcklqdq %xmm14,%xmm12
+ movdqa %xmm10,%xmm15
+ punpcklqdq %xmm7,%xmm10
+ punpckhqdq %xmm14,%xmm13
+ punpckhqdq %xmm7,%xmm15
+ paddd 192-256(%rcx),%xmm4
+ paddd 208-256(%rcx),%xmm5
+ paddd 224-256(%rcx),%xmm8
+ paddd 240-256(%rcx),%xmm9
+
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm11,48(%rsp)
+
+ movdqa %xmm4,%xmm14
+ punpckldq %xmm5,%xmm4
+ movdqa %xmm8,%xmm7
+ punpckldq %xmm9,%xmm8
+ punpckhdq %xmm5,%xmm14
+ punpckhdq %xmm9,%xmm7
+ movdqa %xmm4,%xmm5
+ punpcklqdq %xmm8,%xmm4
+ movdqa %xmm14,%xmm9
+ punpcklqdq %xmm7,%xmm14
+ punpckhqdq %xmm8,%xmm5
+ punpckhqdq %xmm7,%xmm9
+ paddd 256-256(%rcx),%xmm0
+ paddd 272-256(%rcx),%xmm1
+ paddd 288-256(%rcx),%xmm2
+ paddd 304-256(%rcx),%xmm3
+
+ movdqa %xmm0,%xmm8
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm8
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm8,%xmm3
+ punpcklqdq %xmm7,%xmm8
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ cmpq $256,%rdx
+ jb .Ltail4x
+
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 48(%rsp),%xmm6
+ pxor %xmm15,%xmm11
+ pxor %xmm9,%xmm2
+ pxor %xmm3,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $256,%rdx
+ jnz .Loop_outer4x
+
+ jmp .Ldone4x
+
+.Ltail4x:
+ cmpq $192,%rdx
+ jae .L192_or_more4x
+ cmpq $128,%rdx
+ jae .L128_or_more4x
+ cmpq $64,%rdx
+ jae .L64_or_more4x
+
+
+ xorq %r10,%r10
+
+ movdqa %xmm12,16(%rsp)
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm0,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L64_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je .Ldone4x
+
+ movdqa 16(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r10,%r10
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm13,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm5,32(%rsp)
+ subq $64,%rdx
+ movdqa %xmm1,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L128_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ je .Ldone4x
+
+ movdqa 32(%rsp),%xmm6
+ leaq 128(%rsi),%rsi
+ xorq %r10,%r10
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm10,16(%rsp)
+ leaq 128(%rdi),%rdi
+ movdqa %xmm14,32(%rsp)
+ subq $128,%rdx
+ movdqa %xmm8,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L192_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je .Ldone4x
+
+ movdqa 48(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r10,%r10
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm15,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm9,32(%rsp)
+ subq $192,%rdx
+ movdqa %xmm3,48(%rsp)
+
+.Loop_tail4x:
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)
+ decq %rdx
+ jnz .Loop_tail4x
+
+.Ldone4x:
+ addq $0x148+0,%rsp
+ .byte 0xf3,0xc3
+.size ChaCha20_4x,.-ChaCha20_4x
+.type ChaCha20_8x,@function
+.align 32
+ChaCha20_8x:
+.LChaCha20_8x:
+ movq %rsp,%r10
+ subq $0x280+8,%rsp
+ andq $-32,%rsp
+ vzeroupper
+ movq %r10,640(%rsp)
+
+
+
+
+
+
+
+
+
+
+ vbroadcasti128 .Lsigma(%rip),%ymm11
+ vbroadcasti128 (%rcx),%ymm3
+ vbroadcasti128 16(%rcx),%ymm15
+ vbroadcasti128 (%r8),%ymm7
+ leaq 256(%rsp),%rcx
+ leaq 512(%rsp),%rax
+ leaq .Lrot16(%rip),%r10
+ leaq .Lrot24(%rip),%r11
+
+ vpshufd $0x00,%ymm11,%ymm8
+ vpshufd $0x55,%ymm11,%ymm9
+ vmovdqa %ymm8,128-256(%rcx)
+ vpshufd $0xaa,%ymm11,%ymm10
+ vmovdqa %ymm9,160-256(%rcx)
+ vpshufd $0xff,%ymm11,%ymm11
+ vmovdqa %ymm10,192-256(%rcx)
+ vmovdqa %ymm11,224-256(%rcx)
+
+ vpshufd $0x00,%ymm3,%ymm0
+ vpshufd $0x55,%ymm3,%ymm1
+ vmovdqa %ymm0,256-256(%rcx)
+ vpshufd $0xaa,%ymm3,%ymm2
+ vmovdqa %ymm1,288-256(%rcx)
+ vpshufd $0xff,%ymm3,%ymm3
+ vmovdqa %ymm2,320-256(%rcx)
+ vmovdqa %ymm3,352-256(%rcx)
+
+ vpshufd $0x00,%ymm15,%ymm12
+ vpshufd $0x55,%ymm15,%ymm13
+ vmovdqa %ymm12,384-512(%rax)
+ vpshufd $0xaa,%ymm15,%ymm14
+ vmovdqa %ymm13,416-512(%rax)
+ vpshufd $0xff,%ymm15,%ymm15
+ vmovdqa %ymm14,448-512(%rax)
+ vmovdqa %ymm15,480-512(%rax)
+
+ vpshufd $0x00,%ymm7,%ymm4
+ vpshufd $0x55,%ymm7,%ymm5
+ vpaddd .Lincy(%rip),%ymm4,%ymm4
+ vpshufd $0xaa,%ymm7,%ymm6
+ vmovdqa %ymm5,544-512(%rax)
+ vpshufd $0xff,%ymm7,%ymm7
+ vmovdqa %ymm6,576-512(%rax)
+ vmovdqa %ymm7,608-512(%rax)
+
+ jmp .Loop_enter8x
+
+.align 32
+.Loop_outer8x:
+ vmovdqa 128-256(%rcx),%ymm8
+ vmovdqa 160-256(%rcx),%ymm9
+ vmovdqa 192-256(%rcx),%ymm10
+ vmovdqa 224-256(%rcx),%ymm11
+ vmovdqa 256-256(%rcx),%ymm0
+ vmovdqa 288-256(%rcx),%ymm1
+ vmovdqa 320-256(%rcx),%ymm2
+ vmovdqa 352-256(%rcx),%ymm3
+ vmovdqa 384-512(%rax),%ymm12
+ vmovdqa 416-512(%rax),%ymm13
+ vmovdqa 448-512(%rax),%ymm14
+ vmovdqa 480-512(%rax),%ymm15
+ vmovdqa 512-512(%rax),%ymm4
+ vmovdqa 544-512(%rax),%ymm5
+ vmovdqa 576-512(%rax),%ymm6
+ vmovdqa 608-512(%rax),%ymm7
+ vpaddd .Leight(%rip),%ymm4,%ymm4
+
+.Loop_enter8x:
+ vmovdqa %ymm14,64(%rsp)
+ vmovdqa %ymm15,96(%rsp)
+ vbroadcasti128 (%r10),%ymm15
+ vmovdqa %ymm4,512-512(%rax)
+ movl $10,%eax
+ jmp .Loop8x
+
+.align 32
+.Loop8x:
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $12,%ymm0,%ymm14
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $12,%ymm1,%ymm15
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $7,%ymm0,%ymm15
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $7,%ymm1,%ymm14
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vmovdqa %ymm12,0(%rsp)
+ vmovdqa %ymm13,32(%rsp)
+ vmovdqa 64(%rsp),%ymm12
+ vmovdqa 96(%rsp),%ymm13
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $12,%ymm2,%ymm14
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $12,%ymm3,%ymm15
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $7,%ymm2,%ymm15
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $7,%ymm3,%ymm14
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $12,%ymm1,%ymm14
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $12,%ymm2,%ymm15
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $7,%ymm1,%ymm15
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $7,%ymm2,%ymm14
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vmovdqa %ymm12,64(%rsp)
+ vmovdqa %ymm13,96(%rsp)
+ vmovdqa 0(%rsp),%ymm12
+ vmovdqa 32(%rsp),%ymm13
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $12,%ymm3,%ymm14
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $12,%ymm0,%ymm15
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $7,%ymm3,%ymm15
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $7,%ymm0,%ymm14
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ decl %eax
+ jnz .Loop8x
+
+ leaq 512(%rsp),%rax
+ vpaddd 128-256(%rcx),%ymm8,%ymm8
+ vpaddd 160-256(%rcx),%ymm9,%ymm9
+ vpaddd 192-256(%rcx),%ymm10,%ymm10
+ vpaddd 224-256(%rcx),%ymm11,%ymm11
+
+ vpunpckldq %ymm9,%ymm8,%ymm14
+ vpunpckldq %ymm11,%ymm10,%ymm15
+ vpunpckhdq %ymm9,%ymm8,%ymm8
+ vpunpckhdq %ymm11,%ymm10,%ymm10
+ vpunpcklqdq %ymm15,%ymm14,%ymm9
+ vpunpckhqdq %ymm15,%ymm14,%ymm14
+ vpunpcklqdq %ymm10,%ymm8,%ymm11
+ vpunpckhqdq %ymm10,%ymm8,%ymm8
+ vpaddd 256-256(%rcx),%ymm0,%ymm0
+ vpaddd 288-256(%rcx),%ymm1,%ymm1
+ vpaddd 320-256(%rcx),%ymm2,%ymm2
+ vpaddd 352-256(%rcx),%ymm3,%ymm3
+
+ vpunpckldq %ymm1,%ymm0,%ymm10
+ vpunpckldq %ymm3,%ymm2,%ymm15
+ vpunpckhdq %ymm1,%ymm0,%ymm0
+ vpunpckhdq %ymm3,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm10,%ymm1
+ vpunpckhqdq %ymm15,%ymm10,%ymm10
+ vpunpcklqdq %ymm2,%ymm0,%ymm3
+ vpunpckhqdq %ymm2,%ymm0,%ymm0
+ vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
+ vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
+ vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
+ vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
+ vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
+ vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
+ vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
+ vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
+ vmovdqa %ymm15,0(%rsp)
+ vmovdqa %ymm9,32(%rsp)
+ vmovdqa 64(%rsp),%ymm15
+ vmovdqa 96(%rsp),%ymm9
+
+ vpaddd 384-512(%rax),%ymm12,%ymm12
+ vpaddd 416-512(%rax),%ymm13,%ymm13
+ vpaddd 448-512(%rax),%ymm15,%ymm15
+ vpaddd 480-512(%rax),%ymm9,%ymm9
+
+ vpunpckldq %ymm13,%ymm12,%ymm2
+ vpunpckldq %ymm9,%ymm15,%ymm8
+ vpunpckhdq %ymm13,%ymm12,%ymm12
+ vpunpckhdq %ymm9,%ymm15,%ymm15
+ vpunpcklqdq %ymm8,%ymm2,%ymm13
+ vpunpckhqdq %ymm8,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm12,%ymm9
+ vpunpckhqdq %ymm15,%ymm12,%ymm12
+ vpaddd 512-512(%rax),%ymm4,%ymm4
+ vpaddd 544-512(%rax),%ymm5,%ymm5
+ vpaddd 576-512(%rax),%ymm6,%ymm6
+ vpaddd 608-512(%rax),%ymm7,%ymm7
+
+ vpunpckldq %ymm5,%ymm4,%ymm15
+ vpunpckldq %ymm7,%ymm6,%ymm8
+ vpunpckhdq %ymm5,%ymm4,%ymm4
+ vpunpckhdq %ymm7,%ymm6,%ymm6
+ vpunpcklqdq %ymm8,%ymm15,%ymm5
+ vpunpckhqdq %ymm8,%ymm15,%ymm15
+ vpunpcklqdq %ymm6,%ymm4,%ymm7
+ vpunpckhqdq %ymm6,%ymm4,%ymm4
+ vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
+ vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
+ vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
+ vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
+ vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
+ vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
+ vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
+ vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
+ vmovdqa 0(%rsp),%ymm6
+ vmovdqa 32(%rsp),%ymm12
+
+ cmpq $512,%rdx
+ jb .Ltail8x
+
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm12,%ymm12
+ vpxor 32(%rsi),%ymm13,%ymm13
+ vpxor 64(%rsi),%ymm10,%ymm10
+ vpxor 96(%rsi),%ymm15,%ymm15
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm12,0(%rdi)
+ vmovdqu %ymm13,32(%rdi)
+ vmovdqu %ymm10,64(%rdi)
+ vmovdqu %ymm15,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm14,%ymm14
+ vpxor 32(%rsi),%ymm2,%ymm2
+ vpxor 64(%rsi),%ymm3,%ymm3
+ vpxor 96(%rsi),%ymm7,%ymm7
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm14,0(%rdi)
+ vmovdqu %ymm2,32(%rdi)
+ vmovdqu %ymm3,64(%rdi)
+ vmovdqu %ymm7,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm11,%ymm11
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vpxor 64(%rsi),%ymm0,%ymm0
+ vpxor 96(%rsi),%ymm4,%ymm4
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm11,0(%rdi)
+ vmovdqu %ymm9,32(%rdi)
+ vmovdqu %ymm0,64(%rdi)
+ vmovdqu %ymm4,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $512,%rdx
+ jnz .Loop_outer8x
+
+ jmp .Ldone8x
+
+.Ltail8x:
+ cmpq $448,%rdx
+ jae .L448_or_more8x
+ cmpq $384,%rdx
+ jae .L384_or_more8x
+ cmpq $320,%rdx
+ jae .L320_or_more8x
+ cmpq $256,%rdx
+ jae .L256_or_more8x
+ cmpq $192,%rdx
+ jae .L192_or_more8x
+ cmpq $128,%rdx
+ jae .L128_or_more8x
+ cmpq $64,%rdx
+ jae .L64_or_more8x
+
+ xorq %r10,%r10
+ vmovdqa %ymm6,0(%rsp)
+ vmovdqa %ymm8,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L64_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ je .Ldone8x
+
+ leaq 64(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm1,0(%rsp)
+ leaq 64(%rdi),%rdi
+ subq $64,%rdx
+ vmovdqa %ymm5,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L128_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ je .Ldone8x
+
+ leaq 128(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm12,0(%rsp)
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ vmovdqa %ymm13,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L192_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ je .Ldone8x
+
+ leaq 192(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm10,0(%rsp)
+ leaq 192(%rdi),%rdi
+ subq $192,%rdx
+ vmovdqa %ymm15,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L256_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ je .Ldone8x
+
+ leaq 256(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm14,0(%rsp)
+ leaq 256(%rdi),%rdi
+ subq $256,%rdx
+ vmovdqa %ymm2,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L320_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ je .Ldone8x
+
+ leaq 320(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm3,0(%rsp)
+ leaq 320(%rdi),%rdi
+ subq $320,%rdx
+ vmovdqa %ymm7,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L384_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ je .Ldone8x
+
+ leaq 384(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm11,0(%rsp)
+ leaq 384(%rdi),%rdi
+ subq $384,%rdx
+ vmovdqa %ymm9,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L448_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vpxor 384(%rsi),%ymm11,%ymm11
+ vpxor 416(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ vmovdqu %ymm11,384(%rdi)
+ vmovdqu %ymm9,416(%rdi)
+ je .Ldone8x
+
+ leaq 448(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm0,0(%rsp)
+ leaq 448(%rdi),%rdi
+ subq $448,%rdx
+ vmovdqa %ymm4,32(%rsp)
+
+.Loop_tail8x:
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)
+ decq %rdx
+ jnz .Loop_tail8x
+
+.Ldone8x:
+ vzeroall
+ movq 640(%rsp),%rsp
+ .byte 0xf3,0xc3
+.size ChaCha20_8x,.-ChaCha20_8x
+#endif
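
The new chacha-x86_64.S provides a scalar path (ChaCha20_ctr32) plus SSSE3, 4-way and 8-way AVX2 paths, all built from the same quarter-round: add, xor, rotate by 16, 12, 8 and 7, repeated for 10 column/diagonal double rounds (the "movl $10,%ebp" and "movl $10,%eax" counters above), starting from the sigma constants 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574. As a reading aid for the unrolled assembly, a standard plain-C rendering of those rounds (a sketch, not the generated code or BoringSSL's C fallback verbatim):

    #include <stdint.h>

    #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

    #define QR(a, b, c, d)                          \
        do {                                        \
            a += b; d ^= a; d = ROTL32(d, 16);      \
            c += d; b ^= c; b = ROTL32(b, 12);      \
            a += b; d ^= a; d = ROTL32(d, 8);       \
            c += d; b ^= c; b = ROTL32(b, 7);       \
        } while (0)

    /* x[] is the 4x4 ChaCha state; 20 rounds = 10 column+diagonal pairs. */
    static void chacha20_core_rounds(uint32_t x[16]) {
        for (int i = 0; i < 10; i++) {
            QR(x[0], x[4], x[8],  x[12]);  /* column rounds */
            QR(x[1], x[5], x[9],  x[13]);
            QR(x[2], x[6], x[10], x[14]);
            QR(x[3], x[7], x[11], x[15]);
            QR(x[0], x[5], x[10], x[15]);  /* diagonal rounds */
            QR(x[1], x[6], x[11], x[12]);
            QR(x[2], x[7], x[8],  x[13]);
            QR(x[3], x[4], x[9],  x[14]);
        }
    }
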
diff --git a/linux-x86_64/crypto/ec/p256-x86_64-asm.S b/linux-x86_64/crypto/ec/p256-x86_64-asm.S
index 2884c69b..4abce6f9 100644
--- a/linux-x86_64/crypto/ec/p256-x86_64-asm.S
+++ b/linux-x86_64/crypto/ec/p256-x86_64-asm.S
@@ -894,6 +894,7 @@ ecp_nistz256_point_double:
pushq %r15
subq $160+8,%rsp
+.Lpoint_double_shortcutq:
movdqu 0(%rsi),%xmm0
movq %rsi,%rbx
movdqu 16(%rsi),%xmm1
@@ -1115,7 +1116,7 @@ ecp_nistz256_point_add:
por %xmm1,%xmm3
movdqu 0(%rsi),%xmm0
- pshufd $177,%xmm3,%xmm5
+ pshufd $0xb1,%xmm3,%xmm5
movdqu 16(%rsi),%xmm1
movdqu 32(%rsi),%xmm2
por %xmm3,%xmm5
@@ -1125,7 +1126,7 @@ ecp_nistz256_point_add:
movq 64+16(%rsi),%r15
movq 64+24(%rsi),%r8
movdqa %xmm0,480(%rsp)
- pshufd $30,%xmm5,%xmm4
+ pshufd $0x1e,%xmm5,%xmm4
movdqa %xmm1,480+16(%rsp)
por %xmm0,%xmm1
.byte 102,72,15,110,199
@@ -1145,10 +1146,10 @@ ecp_nistz256_point_add:
call __ecp_nistz256_sqr_montq
pcmpeqd %xmm4,%xmm5
- pshufd $177,%xmm3,%xmm4
+ pshufd $0xb1,%xmm3,%xmm4
por %xmm3,%xmm4
pshufd $0,%xmm5,%xmm5
- pshufd $30,%xmm4,%xmm3
+ pshufd $0x1e,%xmm4,%xmm3
por %xmm3,%xmm4
pxor %xmm3,%xmm3
pcmpeqd %xmm3,%xmm4
@@ -1157,6 +1158,7 @@ ecp_nistz256_point_add:
movq 64+8(%rbx),%r14
movq 64+16(%rbx),%r15
movq 64+24(%rbx),%r8
+.byte 102,72,15,110,203
leaq 64-0(%rbx),%rsi
leaq 32(%rsp),%rdi
@@ -1248,7 +1250,7 @@ ecp_nistz256_point_add:
testq %r8,%r8
jnz .Ladd_proceedq
testq %r9,%r9
- jz .Ladd_proceedq
+ jz .Ladd_doubleq
.byte 102,72,15,126,199
pxor %xmm0,%xmm0
@@ -1261,6 +1263,13 @@ ecp_nistz256_point_add:
jmp .Ladd_doneq
.align 32
+.Ladd_doubleq:
+.byte 102,72,15,126,206
+.byte 102,72,15,126,199
+ addq $416,%rsp
+ jmp .Lpoint_double_shortcutq
+
+.align 32
.Ladd_proceedq:
movq 0+64(%rsp),%rax
movq 8+64(%rsp),%r14
@@ -1508,13 +1517,13 @@ ecp_nistz256_point_add_affine:
por %xmm1,%xmm3
movdqu 0(%rbx),%xmm0
- pshufd $177,%xmm3,%xmm5
+ pshufd $0xb1,%xmm3,%xmm5
movdqu 16(%rbx),%xmm1
movdqu 32(%rbx),%xmm2
por %xmm3,%xmm5
movdqu 48(%rbx),%xmm3
movdqa %xmm0,416(%rsp)
- pshufd $30,%xmm5,%xmm4
+ pshufd $0x1e,%xmm5,%xmm4
movdqa %xmm1,416+16(%rsp)
por %xmm0,%xmm1
.byte 102,72,15,110,199
@@ -1530,13 +1539,13 @@ ecp_nistz256_point_add_affine:
call __ecp_nistz256_sqr_montq
pcmpeqd %xmm4,%xmm5
- pshufd $177,%xmm3,%xmm4
+ pshufd $0xb1,%xmm3,%xmm4
movq 0(%rbx),%rax
movq %r12,%r9
por %xmm3,%xmm4
pshufd $0,%xmm5,%xmm5
- pshufd $30,%xmm4,%xmm3
+ pshufd $0x1e,%xmm4,%xmm3
movq %r13,%r10
por %xmm3,%xmm4
pxor %xmm3,%xmm3
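
Besides rewriting the pshufd immediates in hex (0xb1 and 0x1e permute the 32-bit lanes so that or-ing a value with its permutations folds every lane together, letting a single pcmpeqd produce an all-or-nothing mask), the p256-x86_64-asm.S hunks add a .Ladd_doubleq path: when ecp_nistz256_point_add sees that both inputs are the same finite point, it now re-enters the doubling code at .Lpoint_double_shortcutq instead of running the general addition formulas, which do not hold for P + P. A rough control-flow sketch of that dispatch (names and the explicit branches are illustrative assumptions; the real code derives the conditions from constant-time masks):

    typedef enum { PATH_ADD, PATH_DOUBLE } add_path;

    /* Equal, non-infinity inputs must take the doubling path. */
    static add_path choose_point_add_path(int inputs_equal, int either_infinity) {
        if (!either_infinity && inputs_equal) {
            return PATH_DOUBLE;  /* the new .Ladd_doubleq shortcut */
        }
        return PATH_ADD;         /* general addition; infinity is resolved later by masks */
    }
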
diff --git a/linux-x86_64/crypto/md5/md5-x86_64.S b/linux-x86_64/crypto/md5/md5-x86_64.S
index 76446896..05369e2a 100644
--- a/linux-x86_64/crypto/md5/md5-x86_64.S
+++ b/linux-x86_64/crypto/md5/md5-x86_64.S
@@ -495,14 +495,14 @@ md5_block_asm_data_order:
movl %ecx,%r11d
addl %ecx,%ebx
movl 0(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
xorl %edx,%r11d
leal -198630844(%rax,%r10,1),%eax
orl %ebx,%r11d
xorl %ecx,%r11d
addl %r11d,%eax
movl 28(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $6,%eax
xorl %ecx,%r11d
addl %ebx,%eax
@@ -511,7 +511,7 @@ md5_block_asm_data_order:
xorl %ebx,%r11d
addl %r11d,%edx
movl 56(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $10,%edx
xorl %ebx,%r11d
addl %eax,%edx
@@ -520,7 +520,7 @@ md5_block_asm_data_order:
xorl %eax,%r11d
addl %r11d,%ecx
movl 20(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $15,%ecx
xorl %eax,%r11d
addl %edx,%ecx
@@ -529,7 +529,7 @@ md5_block_asm_data_order:
xorl %edx,%r11d
addl %r11d,%ebx
movl 48(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $21,%ebx
xorl %edx,%r11d
addl %ecx,%ebx
@@ -538,7 +538,7 @@ md5_block_asm_data_order:
xorl %ecx,%r11d
addl %r11d,%eax
movl 12(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $6,%eax
xorl %ecx,%r11d
addl %ebx,%eax
@@ -547,7 +547,7 @@ md5_block_asm_data_order:
xorl %ebx,%r11d
addl %r11d,%edx
movl 40(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $10,%edx
xorl %ebx,%r11d
addl %eax,%edx
@@ -556,7 +556,7 @@ md5_block_asm_data_order:
xorl %eax,%r11d
addl %r11d,%ecx
movl 4(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $15,%ecx
xorl %eax,%r11d
addl %edx,%ecx
@@ -565,7 +565,7 @@ md5_block_asm_data_order:
xorl %edx,%r11d
addl %r11d,%ebx
movl 32(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $21,%ebx
xorl %edx,%r11d
addl %ecx,%ebx
@@ -574,7 +574,7 @@ md5_block_asm_data_order:
xorl %ecx,%r11d
addl %r11d,%eax
movl 60(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $6,%eax
xorl %ecx,%r11d
addl %ebx,%eax
@@ -583,7 +583,7 @@ md5_block_asm_data_order:
xorl %ebx,%r11d
addl %r11d,%edx
movl 24(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $10,%edx
xorl %ebx,%r11d
addl %eax,%edx
@@ -592,7 +592,7 @@ md5_block_asm_data_order:
xorl %eax,%r11d
addl %r11d,%ecx
movl 52(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $15,%ecx
xorl %eax,%r11d
addl %edx,%ecx
@@ -601,7 +601,7 @@ md5_block_asm_data_order:
xorl %edx,%r11d
addl %r11d,%ebx
movl 16(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $21,%ebx
xorl %edx,%r11d
addl %ecx,%ebx
@@ -610,7 +610,7 @@ md5_block_asm_data_order:
xorl %ecx,%r11d
addl %r11d,%eax
movl 44(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $6,%eax
xorl %ecx,%r11d
addl %ebx,%eax
@@ -619,7 +619,7 @@ md5_block_asm_data_order:
xorl %ebx,%r11d
addl %r11d,%edx
movl 8(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $10,%edx
xorl %ebx,%r11d
addl %eax,%edx
@@ -628,7 +628,7 @@ md5_block_asm_data_order:
xorl %eax,%r11d
addl %r11d,%ecx
movl 36(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $15,%ecx
xorl %eax,%r11d
addl %edx,%ecx
@@ -637,7 +637,7 @@ md5_block_asm_data_order:
xorl %edx,%r11d
addl %r11d,%ebx
movl 0(%rsi),%r10d
- movl $4294967295,%r11d
+ movl $0xffffffff,%r11d
roll $21,%ebx
xorl %edx,%r11d
addl %ecx,%ebx
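
The md5-x86_64.S hunk only rewrites the 4294967295 immediates as 0xffffffff; the value is the all-ones word used to form ~z by xor in the round-4 steps, after which the orl and xorl give I(x,y,z) = y ^ (x | ~z). For reference, one round-4 step in plain C (a sketch of the textbook operation with illustrative names, not the generated code):

    #include <stdint.h>

    #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

    /* One MD5 round-4 step: a = b + rotl(a + I(b,c,d) + msg_word + t_const, s),
     * where I(b,c,d) = c ^ (b | ~d); the assembly forms ~d as d ^ 0xffffffff. */
    static uint32_t md5_step4(uint32_t a, uint32_t b, uint32_t c, uint32_t d,
                              uint32_t msg_word, uint32_t t_const, int s) {
        a += (c ^ (b | ~d)) + msg_word + t_const;
        return b + ROTL32(a, s);
    }
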
diff --git a/linux-x86_64/crypto/modes/ghash-x86_64.S b/linux-x86_64/crypto/modes/ghash-x86_64.S
index 1db7d69d..b47bdc9b 100644
--- a/linux-x86_64/crypto/modes/ghash-x86_64.S
+++ b/linux-x86_64/crypto/modes/ghash-x86_64.S
@@ -23,14 +23,14 @@ gcm_gmult_4bit:
movq $14,%rcx
movq 8(%rsi,%rax,1),%r8
movq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
movq %r8,%rdx
jmp .Loop1
.align 16
.Loop1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
movb (%rdi,%rcx,1),%al
shrq $4,%r9
@@ -46,13 +46,13 @@ gcm_gmult_4bit:
js .Lbreak1
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
@@ -61,19 +61,19 @@ gcm_gmult_4bit:
.align 16
.Lbreak1:
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rax,1),%r8
shlq $60,%r10
xorq (%rsi,%rax,1),%r9
- andb $240,%bl
+ andb $0xf0,%bl
xorq (%r11,%rdx,8),%r9
movq %r8,%rdx
xorq %r10,%r8
shrq $4,%r8
- andq $15,%rdx
+ andq $0xf,%rdx
movq %r9,%r10
shrq $4,%r9
xorq 8(%rsi,%rbx,1),%r8
@@ -881,20 +881,20 @@ gcm_ghash_clmul:
movdqu 32(%rsi),%xmm7
.byte 102,65,15,56,0,194
- subq $16,%rcx
+ subq $0x10,%rcx
jz .Lodd_tail
movdqu 16(%rsi),%xmm6
movl OPENSSL_ia32cap_P+4(%rip),%eax
- cmpq $48,%rcx
+ cmpq $0x30,%rcx
jb .Lskip4x
andl $71303168,%eax
cmpl $4194304,%eax
je .Lskip4x
- subq $48,%rcx
- movq $11547335547999543296,%rax
+ subq $0x30,%rcx
+ movq $0xA040608020C0E000,%rax
movdqu 48(%rsi),%xmm14
movdqu 64(%rsi),%xmm15
@@ -941,7 +941,7 @@ gcm_ghash_clmul:
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jc .Ltail4x
jmp .Lmod4_loop
@@ -1024,7 +1024,7 @@ gcm_ghash_clmul:
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
- subq $64,%rcx
+ subq $0x40,%rcx
jnc .Lmod4_loop
.Ltail4x:
@@ -1068,10 +1068,10 @@ gcm_ghash_clmul:
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
- addq $64,%rcx
+ addq $0x40,%rcx
jz .Ldone
movdqu 32(%rsi),%xmm7
- subq $16,%rcx
+ subq $0x10,%rcx
jz .Lodd_tail
.Lskip4x:
@@ -1094,7 +1094,7 @@ gcm_ghash_clmul:
leaq 32(%rdx),%rdx
nop
- subq $32,%rcx
+ subq $0x20,%rcx
jbe .Leven_tail
nop
jmp .Lmod_loop
@@ -1157,7 +1157,7 @@ gcm_ghash_clmul:
.byte 102,15,58,68,231,0
pxor %xmm1,%xmm0
- subq $32,%rcx
+ subq $0x20,%rcx
ja .Lmod_loop
.Leven_tail: