diff options
27 files changed, 1220 insertions, 8514 deletions
diff --git a/BORINGSSL_REVISION b/BORINGSSL_REVISION index 42ad2f07..6b8d547c 100644 --- a/BORINGSSL_REVISION +++ b/BORINGSSL_REVISION @@ -1 +1 @@ -fdb48f98612e934eab339b4871484b1c987553e2 +df11bed9ee05141b54da7b88cc5b7960ca858164 @@ -348,7 +348,6 @@ linux_x86_64_sources := \ linux-x86_64/crypto/fipsmodule/aes-x86_64.S\ linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S\ linux-x86_64/crypto/fipsmodule/aesni-x86_64.S\ - linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S\ linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S\ linux-x86_64/crypto/fipsmodule/ghash-x86_64.S\ linux-x86_64/crypto/fipsmodule/md5-x86_64.S\ diff --git a/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S deleted file mode 100644 index 5437762f..00000000 --- a/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S +++ /dev/null @@ -1,1622 +0,0 @@ -# This file is generated from a similarly-named Perl script in the BoringSSL -# source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include <boringssl_prefix_symbols_asm.h> -#endif -.text - -.type _bsaes_encrypt8,@function -.align 64 -_bsaes_encrypt8: -.cfi_startproc - leaq .LBS0(%rip),%r11 - - movdqa (%rax),%xmm8 - leaq 16(%rax),%rax - movdqa 80(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 -_bsaes_encrypt8_bitslice: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $1,%xmm3 - pxor %xmm6,%xmm5 - pxor %xmm4,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm6 - psllq $1,%xmm5 - pxor %xmm3,%xmm4 - psllq $1,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm2,%xmm1 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm1 - pand %xmm7,%xmm15 - pxor %xmm1,%xmm2 - psllq $1,%xmm1 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm4,%xmm9 - psrlq $2,%xmm4 - movdqa %xmm3,%xmm10 - psrlq $2,%xmm3 - pxor %xmm6,%xmm4 - pxor %xmm5,%xmm3 - pand %xmm8,%xmm4 - pand %xmm8,%xmm3 - pxor %xmm4,%xmm6 - psllq $2,%xmm4 - pxor %xmm3,%xmm5 - psllq $2,%xmm3 - pxor %xmm9,%xmm4 - pxor %xmm10,%xmm3 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm2,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm2 - psllq $2,%xmm0 - pxor %xmm15,%xmm1 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm2,%xmm9 - psrlq $4,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $4,%xmm1 - pxor %xmm6,%xmm2 - pxor %xmm5,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm6 - psllq $4,%xmm2 - pxor %xmm1,%xmm5 - psllq $4,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm4 - psllq $4,%xmm0 - pxor %xmm15,%xmm3 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - decl %r10d - jmp .Lenc_sbox -.align 16 -.Lenc_loop: - pxor 0(%rax),%xmm15 - pxor 16(%rax),%xmm0 - pxor 32(%rax),%xmm1 - pxor 48(%rax),%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor 64(%rax),%xmm3 - pxor 80(%rax),%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor 96(%rax),%xmm5 - pxor 112(%rax),%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq 128(%rax),%rax -.Lenc_sbox: - pxor %xmm5,%xmm4 - pxor %xmm0,%xmm1 - pxor %xmm15,%xmm2 - pxor %xmm1,%xmm5 - pxor %xmm15,%xmm4 - - pxor %xmm2,%xmm5 - pxor %xmm6,%xmm2 - pxor %xmm4,%xmm6 - pxor %xmm3,%xmm2 - pxor %xmm4,%xmm3 - pxor %xmm0,%xmm2 - - pxor %xmm6,%xmm1 - pxor %xmm4,%xmm0 - movdqa %xmm6,%xmm10 - movdqa %xmm0,%xmm9 - movdqa %xmm4,%xmm8 - movdqa %xmm1,%xmm12 - movdqa %xmm5,%xmm11 - - pxor %xmm3,%xmm10 - pxor %xmm1,%xmm9 - pxor %xmm2,%xmm8 - movdqa %xmm10,%xmm13 - pxor %xmm3,%xmm12 - movdqa %xmm9,%xmm7 - pxor %xmm15,%xmm11 - movdqa %xmm10,%xmm14 - - por %xmm8,%xmm9 - por %xmm11,%xmm10 - pxor %xmm7,%xmm14 - pand %xmm11,%xmm13 - pxor %xmm8,%xmm11 - pand %xmm8,%xmm7 - pand %xmm11,%xmm14 - movdqa %xmm2,%xmm11 - pxor %xmm15,%xmm11 - pand %xmm11,%xmm12 - pxor %xmm12,%xmm10 - pxor %xmm12,%xmm9 - movdqa %xmm6,%xmm12 - movdqa %xmm4,%xmm11 - pxor %xmm0,%xmm12 - pxor %xmm5,%xmm11 - movdqa %xmm12,%xmm8 - pand %xmm11,%xmm12 - por %xmm11,%xmm8 - pxor %xmm12,%xmm7 - pxor %xmm14,%xmm10 - pxor %xmm13,%xmm9 - pxor %xmm14,%xmm8 - movdqa %xmm1,%xmm11 - pxor %xmm13,%xmm7 - movdqa %xmm3,%xmm12 - pxor %xmm13,%xmm8 - movdqa %xmm0,%xmm13 - pand %xmm2,%xmm11 - movdqa %xmm6,%xmm14 - pand %xmm15,%xmm12 - pand %xmm4,%xmm13 - por %xmm5,%xmm14 - pxor %xmm11,%xmm10 - pxor %xmm12,%xmm9 - pxor %xmm13,%xmm8 - pxor %xmm14,%xmm7 - - - - - - movdqa %xmm10,%xmm11 - pand %xmm8,%xmm10 - pxor %xmm9,%xmm11 - - movdqa %xmm7,%xmm13 - movdqa %xmm11,%xmm14 - pxor %xmm10,%xmm13 - pand %xmm13,%xmm14 - - movdqa %xmm8,%xmm12 - pxor %xmm9,%xmm14 - pxor %xmm7,%xmm12 - - pxor %xmm9,%xmm10 - - pand %xmm10,%xmm12 - - movdqa %xmm13,%xmm9 - pxor %xmm7,%xmm12 - - pxor %xmm12,%xmm9 - pxor %xmm12,%xmm8 - - pand %xmm7,%xmm9 - - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm8 - - pand %xmm14,%xmm13 - - pxor %xmm11,%xmm13 - movdqa %xmm5,%xmm11 - movdqa %xmm4,%xmm7 - movdqa %xmm14,%xmm9 - pxor %xmm13,%xmm9 - pand %xmm5,%xmm9 - pxor %xmm4,%xmm5 - pand %xmm14,%xmm4 - pand %xmm13,%xmm5 - pxor %xmm4,%xmm5 - pxor %xmm9,%xmm4 - pxor %xmm15,%xmm11 - pxor %xmm2,%xmm7 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm15,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm2,%xmm15 - pand %xmm14,%xmm7 - pand %xmm12,%xmm2 - pand %xmm13,%xmm11 - pand %xmm8,%xmm15 - pxor %xmm11,%xmm7 - pxor %xmm2,%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm2 - pxor %xmm11,%xmm5 - pxor %xmm11,%xmm15 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm2 - - movdqa %xmm6,%xmm11 - movdqa %xmm0,%xmm7 - pxor %xmm3,%xmm11 - pxor %xmm1,%xmm7 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm3,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm1,%xmm3 - pand %xmm14,%xmm7 - pand %xmm12,%xmm1 - pand %xmm13,%xmm11 - pand %xmm8,%xmm3 - pxor %xmm11,%xmm7 - pxor %xmm1,%xmm3 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm1 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - pxor %xmm13,%xmm10 - pand %xmm6,%xmm10 - pxor %xmm0,%xmm6 - pand %xmm14,%xmm0 - pand %xmm13,%xmm6 - pxor %xmm0,%xmm6 - pxor %xmm10,%xmm0 - pxor %xmm11,%xmm6 - pxor %xmm11,%xmm3 - pxor %xmm7,%xmm0 - pxor %xmm7,%xmm1 - pxor %xmm15,%xmm6 - pxor %xmm5,%xmm0 - pxor %xmm6,%xmm3 - pxor %xmm15,%xmm5 - pxor %xmm0,%xmm15 - - pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 - pxor %xmm2,%xmm1 - pxor %xmm4,%xmm2 - pxor %xmm4,%xmm3 - - pxor %xmm2,%xmm5 - decl %r10d - jl .Lenc_done - pshufd $0x93,%xmm15,%xmm7 - pshufd $0x93,%xmm0,%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x93,%xmm3,%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x93,%xmm5,%xmm10 - pxor %xmm9,%xmm3 - pshufd $0x93,%xmm2,%xmm11 - pxor %xmm10,%xmm5 - pshufd $0x93,%xmm6,%xmm12 - pxor %xmm11,%xmm2 - pshufd $0x93,%xmm1,%xmm13 - pxor %xmm12,%xmm6 - pshufd $0x93,%xmm4,%xmm14 - pxor %xmm13,%xmm1 - pxor %xmm14,%xmm4 - - pxor %xmm15,%xmm8 - pxor %xmm4,%xmm7 - pxor %xmm4,%xmm8 - pshufd $0x4E,%xmm15,%xmm15 - pxor %xmm0,%xmm9 - pshufd $0x4E,%xmm0,%xmm0 - pxor %xmm2,%xmm12 - pxor %xmm7,%xmm15 - pxor %xmm6,%xmm13 - pxor %xmm8,%xmm0 - pxor %xmm5,%xmm11 - pshufd $0x4E,%xmm2,%xmm7 - pxor %xmm1,%xmm14 - pshufd $0x4E,%xmm6,%xmm8 - pxor %xmm3,%xmm10 - pshufd $0x4E,%xmm5,%xmm2 - pxor %xmm4,%xmm10 - pshufd $0x4E,%xmm4,%xmm6 - pxor %xmm4,%xmm11 - pshufd $0x4E,%xmm1,%xmm5 - pxor %xmm11,%xmm7 - pshufd $0x4E,%xmm3,%xmm1 - pxor %xmm12,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm14,%xmm6 - pxor %xmm13,%xmm5 - movdqa %xmm7,%xmm3 - pxor %xmm9,%xmm1 - movdqa %xmm8,%xmm4 - movdqa 48(%r11),%xmm7 - jnz .Lenc_loop - movdqa 64(%r11),%xmm7 - jmp .Lenc_loop -.align 16 -.Lenc_done: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm2,%xmm10 - psrlq $1,%xmm2 - pxor %xmm4,%xmm1 - pxor %xmm6,%xmm2 - pand %xmm7,%xmm1 - pand %xmm7,%xmm2 - pxor %xmm1,%xmm4 - psllq $1,%xmm1 - pxor %xmm2,%xmm6 - psllq $1,%xmm2 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm2 - movdqa %xmm3,%xmm9 - psrlq $1,%xmm3 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm5,%xmm3 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm3 - pand %xmm7,%xmm15 - pxor %xmm3,%xmm5 - psllq $1,%xmm3 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm3 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm6,%xmm9 - psrlq $2,%xmm6 - movdqa %xmm2,%xmm10 - psrlq $2,%xmm2 - pxor %xmm4,%xmm6 - pxor %xmm1,%xmm2 - pand %xmm8,%xmm6 - pand %xmm8,%xmm2 - pxor %xmm6,%xmm4 - psllq $2,%xmm6 - pxor %xmm2,%xmm1 - psllq $2,%xmm2 - pxor %xmm9,%xmm6 - pxor %xmm10,%xmm2 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm5,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm5 - psllq $2,%xmm0 - pxor %xmm15,%xmm3 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm5,%xmm9 - psrlq $4,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $4,%xmm3 - pxor %xmm4,%xmm5 - pxor %xmm1,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm4 - psllq $4,%xmm5 - pxor %xmm3,%xmm1 - psllq $4,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm6,%xmm0 - pxor %xmm2,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm6 - psllq $4,%xmm0 - pxor %xmm15,%xmm2 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa (%rax),%xmm7 - pxor %xmm7,%xmm3 - pxor %xmm7,%xmm5 - pxor %xmm7,%xmm2 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm1 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm15 - pxor %xmm7,%xmm0 - .byte 0xf3,0xc3 -.cfi_endproc -.size _bsaes_encrypt8,.-_bsaes_encrypt8 - -.type _bsaes_decrypt8,@function -.align 64 -_bsaes_decrypt8: -.cfi_startproc - leaq .LBS0(%rip),%r11 - - movdqa (%rax),%xmm8 - leaq 16(%rax),%rax - movdqa -48(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $1,%xmm3 - pxor %xmm6,%xmm5 - pxor %xmm4,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm6 - psllq $1,%xmm5 - pxor %xmm3,%xmm4 - psllq $1,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm2,%xmm1 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm1 - pand %xmm7,%xmm15 - pxor %xmm1,%xmm2 - psllq $1,%xmm1 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm4,%xmm9 - psrlq $2,%xmm4 - movdqa %xmm3,%xmm10 - psrlq $2,%xmm3 - pxor %xmm6,%xmm4 - pxor %xmm5,%xmm3 - pand %xmm8,%xmm4 - pand %xmm8,%xmm3 - pxor %xmm4,%xmm6 - psllq $2,%xmm4 - pxor %xmm3,%xmm5 - psllq $2,%xmm3 - pxor %xmm9,%xmm4 - pxor %xmm10,%xmm3 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm2,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm2 - psllq $2,%xmm0 - pxor %xmm15,%xmm1 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm2,%xmm9 - psrlq $4,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $4,%xmm1 - pxor %xmm6,%xmm2 - pxor %xmm5,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm6 - psllq $4,%xmm2 - pxor %xmm1,%xmm5 - psllq $4,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm4 - psllq $4,%xmm0 - pxor %xmm15,%xmm3 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - decl %r10d - jmp .Ldec_sbox -.align 16 -.Ldec_loop: - pxor 0(%rax),%xmm15 - pxor 16(%rax),%xmm0 - pxor 32(%rax),%xmm1 - pxor 48(%rax),%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor 64(%rax),%xmm3 - pxor 80(%rax),%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor 96(%rax),%xmm5 - pxor 112(%rax),%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq 128(%rax),%rax -.Ldec_sbox: - pxor %xmm3,%xmm2 - - pxor %xmm6,%xmm3 - pxor %xmm6,%xmm1 - pxor %xmm3,%xmm5 - pxor %xmm5,%xmm6 - pxor %xmm6,%xmm0 - - pxor %xmm0,%xmm15 - pxor %xmm4,%xmm1 - pxor %xmm15,%xmm2 - pxor %xmm15,%xmm4 - pxor %xmm2,%xmm0 - movdqa %xmm2,%xmm10 - movdqa %xmm6,%xmm9 - movdqa %xmm0,%xmm8 - movdqa %xmm3,%xmm12 - movdqa %xmm4,%xmm11 - - pxor %xmm15,%xmm10 - pxor %xmm3,%xmm9 - pxor %xmm5,%xmm8 - movdqa %xmm10,%xmm13 - pxor %xmm15,%xmm12 - movdqa %xmm9,%xmm7 - pxor %xmm1,%xmm11 - movdqa %xmm10,%xmm14 - - por %xmm8,%xmm9 - por %xmm11,%xmm10 - pxor %xmm7,%xmm14 - pand %xmm11,%xmm13 - pxor %xmm8,%xmm11 - pand %xmm8,%xmm7 - pand %xmm11,%xmm14 - movdqa %xmm5,%xmm11 - pxor %xmm1,%xmm11 - pand %xmm11,%xmm12 - pxor %xmm12,%xmm10 - pxor %xmm12,%xmm9 - movdqa %xmm2,%xmm12 - movdqa %xmm0,%xmm11 - pxor %xmm6,%xmm12 - pxor %xmm4,%xmm11 - movdqa %xmm12,%xmm8 - pand %xmm11,%xmm12 - por %xmm11,%xmm8 - pxor %xmm12,%xmm7 - pxor %xmm14,%xmm10 - pxor %xmm13,%xmm9 - pxor %xmm14,%xmm8 - movdqa %xmm3,%xmm11 - pxor %xmm13,%xmm7 - movdqa %xmm15,%xmm12 - pxor %xmm13,%xmm8 - movdqa %xmm6,%xmm13 - pand %xmm5,%xmm11 - movdqa %xmm2,%xmm14 - pand %xmm1,%xmm12 - pand %xmm0,%xmm13 - por %xmm4,%xmm14 - pxor %xmm11,%xmm10 - pxor %xmm12,%xmm9 - pxor %xmm13,%xmm8 - pxor %xmm14,%xmm7 - - - - - - movdqa %xmm10,%xmm11 - pand %xmm8,%xmm10 - pxor %xmm9,%xmm11 - - movdqa %xmm7,%xmm13 - movdqa %xmm11,%xmm14 - pxor %xmm10,%xmm13 - pand %xmm13,%xmm14 - - movdqa %xmm8,%xmm12 - pxor %xmm9,%xmm14 - pxor %xmm7,%xmm12 - - pxor %xmm9,%xmm10 - - pand %xmm10,%xmm12 - - movdqa %xmm13,%xmm9 - pxor %xmm7,%xmm12 - - pxor %xmm12,%xmm9 - pxor %xmm12,%xmm8 - - pand %xmm7,%xmm9 - - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm8 - - pand %xmm14,%xmm13 - - pxor %xmm11,%xmm13 - movdqa %xmm4,%xmm11 - movdqa %xmm0,%xmm7 - movdqa %xmm14,%xmm9 - pxor %xmm13,%xmm9 - pand %xmm4,%xmm9 - pxor %xmm0,%xmm4 - pand %xmm14,%xmm0 - pand %xmm13,%xmm4 - pxor %xmm0,%xmm4 - pxor %xmm9,%xmm0 - pxor %xmm1,%xmm11 - pxor %xmm5,%xmm7 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm1,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm5,%xmm1 - pand %xmm14,%xmm7 - pand %xmm12,%xmm5 - pand %xmm13,%xmm11 - pand %xmm8,%xmm1 - pxor %xmm11,%xmm7 - pxor %xmm5,%xmm1 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm5 - pxor %xmm11,%xmm4 - pxor %xmm11,%xmm1 - pxor %xmm7,%xmm0 - pxor %xmm7,%xmm5 - - movdqa %xmm2,%xmm11 - movdqa %xmm6,%xmm7 - pxor %xmm15,%xmm11 - pxor %xmm3,%xmm7 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm15,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm3,%xmm15 - pand %xmm14,%xmm7 - pand %xmm12,%xmm3 - pand %xmm13,%xmm11 - pand %xmm8,%xmm15 - pxor %xmm11,%xmm7 - pxor %xmm3,%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm3 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - pxor %xmm13,%xmm10 - pand %xmm2,%xmm10 - pxor %xmm6,%xmm2 - pand %xmm14,%xmm6 - pand %xmm13,%xmm2 - pxor %xmm6,%xmm2 - pxor %xmm10,%xmm6 - pxor %xmm11,%xmm2 - pxor %xmm11,%xmm15 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm3 - pxor %xmm6,%xmm0 - pxor %xmm4,%xmm5 - - pxor %xmm0,%xmm3 - pxor %xmm6,%xmm1 - pxor %xmm6,%xmm4 - pxor %xmm1,%xmm3 - pxor %xmm15,%xmm6 - pxor %xmm4,%xmm3 - pxor %xmm5,%xmm2 - pxor %xmm0,%xmm5 - pxor %xmm3,%xmm2 - - pxor %xmm15,%xmm3 - pxor %xmm2,%xmm6 - decl %r10d - jl .Ldec_done - - pshufd $0x4E,%xmm15,%xmm7 - pshufd $0x4E,%xmm2,%xmm13 - pxor %xmm15,%xmm7 - pshufd $0x4E,%xmm4,%xmm14 - pxor %xmm2,%xmm13 - pshufd $0x4E,%xmm0,%xmm8 - pxor %xmm4,%xmm14 - pshufd $0x4E,%xmm5,%xmm9 - pxor %xmm0,%xmm8 - pshufd $0x4E,%xmm3,%xmm10 - pxor %xmm5,%xmm9 - pxor %xmm13,%xmm15 - pxor %xmm13,%xmm0 - pshufd $0x4E,%xmm1,%xmm11 - pxor %xmm3,%xmm10 - pxor %xmm7,%xmm5 - pxor %xmm8,%xmm3 - pshufd $0x4E,%xmm6,%xmm12 - pxor %xmm1,%xmm11 - pxor %xmm14,%xmm0 - pxor %xmm9,%xmm1 - pxor %xmm6,%xmm12 - - pxor %xmm14,%xmm5 - pxor %xmm13,%xmm3 - pxor %xmm13,%xmm1 - pxor %xmm10,%xmm6 - pxor %xmm11,%xmm2 - pxor %xmm14,%xmm1 - pxor %xmm14,%xmm6 - pxor %xmm12,%xmm4 - pshufd $0x93,%xmm15,%xmm7 - pshufd $0x93,%xmm0,%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x93,%xmm5,%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x93,%xmm3,%xmm10 - pxor %xmm9,%xmm5 - pshufd $0x93,%xmm1,%xmm11 - pxor %xmm10,%xmm3 - pshufd $0x93,%xmm6,%xmm12 - pxor %xmm11,%xmm1 - pshufd $0x93,%xmm2,%xmm13 - pxor %xmm12,%xmm6 - pshufd $0x93,%xmm4,%xmm14 - pxor %xmm13,%xmm2 - pxor %xmm14,%xmm4 - - pxor %xmm15,%xmm8 - pxor %xmm4,%xmm7 - pxor %xmm4,%xmm8 - pshufd $0x4E,%xmm15,%xmm15 - pxor %xmm0,%xmm9 - pshufd $0x4E,%xmm0,%xmm0 - pxor %xmm1,%xmm12 - pxor %xmm7,%xmm15 - pxor %xmm6,%xmm13 - pxor %xmm8,%xmm0 - pxor %xmm3,%xmm11 - pshufd $0x4E,%xmm1,%xmm7 - pxor %xmm2,%xmm14 - pshufd $0x4E,%xmm6,%xmm8 - pxor %xmm5,%xmm10 - pshufd $0x4E,%xmm3,%xmm1 - pxor %xmm4,%xmm10 - pshufd $0x4E,%xmm4,%xmm6 - pxor %xmm4,%xmm11 - pshufd $0x4E,%xmm2,%xmm3 - pxor %xmm11,%xmm7 - pshufd $0x4E,%xmm5,%xmm2 - pxor %xmm12,%xmm8 - pxor %xmm1,%xmm10 - pxor %xmm14,%xmm6 - pxor %xmm3,%xmm13 - movdqa %xmm7,%xmm3 - pxor %xmm9,%xmm2 - movdqa %xmm13,%xmm5 - movdqa %xmm8,%xmm4 - movdqa %xmm2,%xmm1 - movdqa %xmm10,%xmm2 - movdqa -16(%r11),%xmm7 - jnz .Ldec_loop - movdqa -32(%r11),%xmm7 - jmp .Ldec_loop -.align 16 -.Ldec_done: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm2,%xmm9 - psrlq $1,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $1,%xmm1 - pxor %xmm4,%xmm2 - pxor %xmm6,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm4 - psllq $1,%xmm2 - pxor %xmm1,%xmm6 - psllq $1,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm3,%xmm5 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm5 - pand %xmm7,%xmm15 - pxor %xmm5,%xmm3 - psllq $1,%xmm5 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm6,%xmm9 - psrlq $2,%xmm6 - movdqa %xmm1,%xmm10 - psrlq $2,%xmm1 - pxor %xmm4,%xmm6 - pxor %xmm2,%xmm1 - pand %xmm8,%xmm6 - pand %xmm8,%xmm1 - pxor %xmm6,%xmm4 - psllq $2,%xmm6 - pxor %xmm1,%xmm2 - psllq $2,%xmm1 - pxor %xmm9,%xmm6 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm3,%xmm0 - pxor %xmm5,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm3 - psllq $2,%xmm0 - pxor %xmm15,%xmm5 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm3,%xmm9 - psrlq $4,%xmm3 - movdqa %xmm5,%xmm10 - psrlq $4,%xmm5 - pxor %xmm4,%xmm3 - pxor %xmm2,%xmm5 - pand %xmm7,%xmm3 - pand %xmm7,%xmm5 - pxor %xmm3,%xmm4 - psllq $4,%xmm3 - pxor %xmm5,%xmm2 - psllq $4,%xmm5 - pxor %xmm9,%xmm3 - pxor %xmm10,%xmm5 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm6,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm6 - psllq $4,%xmm0 - pxor %xmm15,%xmm1 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa (%rax),%xmm7 - pxor %xmm7,%xmm5 - pxor %xmm7,%xmm3 - pxor %xmm7,%xmm1 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm2 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm15 - pxor %xmm7,%xmm0 - .byte 0xf3,0xc3 -.cfi_endproc -.size _bsaes_decrypt8,.-_bsaes_decrypt8 -.type _bsaes_key_convert,@function -.align 16 -_bsaes_key_convert: -.cfi_startproc - leaq .Lmasks(%rip),%r11 - movdqu (%rcx),%xmm7 - leaq 16(%rcx),%rcx - movdqa 0(%r11),%xmm0 - movdqa 16(%r11),%xmm1 - movdqa 32(%r11),%xmm2 - movdqa 48(%r11),%xmm3 - movdqa 64(%r11),%xmm4 - pcmpeqd %xmm5,%xmm5 - - movdqu (%rcx),%xmm6 - movdqa %xmm7,(%rax) - leaq 16(%rax),%rax - decl %r10d - jmp .Lkey_loop -.align 16 -.Lkey_loop: -.byte 102,15,56,0,244 - - movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - - pand %xmm6,%xmm8 - pand %xmm6,%xmm9 - movdqa %xmm2,%xmm10 - pcmpeqb %xmm0,%xmm8 - psllq $4,%xmm0 - movdqa %xmm3,%xmm11 - pcmpeqb %xmm1,%xmm9 - psllq $4,%xmm1 - - pand %xmm6,%xmm10 - pand %xmm6,%xmm11 - movdqa %xmm0,%xmm12 - pcmpeqb %xmm2,%xmm10 - psllq $4,%xmm2 - movdqa %xmm1,%xmm13 - pcmpeqb %xmm3,%xmm11 - psllq $4,%xmm3 - - movdqa %xmm2,%xmm14 - movdqa %xmm3,%xmm15 - pxor %xmm5,%xmm8 - pxor %xmm5,%xmm9 - - pand %xmm6,%xmm12 - pand %xmm6,%xmm13 - movdqa %xmm8,0(%rax) - pcmpeqb %xmm0,%xmm12 - psrlq $4,%xmm0 - movdqa %xmm9,16(%rax) - pcmpeqb %xmm1,%xmm13 - psrlq $4,%xmm1 - leaq 16(%rcx),%rcx - - pand %xmm6,%xmm14 - pand %xmm6,%xmm15 - movdqa %xmm10,32(%rax) - pcmpeqb %xmm2,%xmm14 - psrlq $4,%xmm2 - movdqa %xmm11,48(%rax) - pcmpeqb %xmm3,%xmm15 - psrlq $4,%xmm3 - movdqu (%rcx),%xmm6 - - pxor %xmm5,%xmm13 - pxor %xmm5,%xmm14 - movdqa %xmm12,64(%rax) - movdqa %xmm13,80(%rax) - movdqa %xmm14,96(%rax) - movdqa %xmm15,112(%rax) - leaq 128(%rax),%rax - decl %r10d - jnz .Lkey_loop - - movdqa 80(%r11),%xmm7 - - .byte 0xf3,0xc3 -.cfi_endproc -.size _bsaes_key_convert,.-_bsaes_key_convert -.globl bsaes_cbc_encrypt -.hidden bsaes_cbc_encrypt -.type bsaes_cbc_encrypt,@function -.align 16 -bsaes_cbc_encrypt: -.cfi_startproc - - - - movq %rsp,%rax -.Lcbc_dec_prologue: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - leaq -72(%rsp),%rsp -.cfi_adjust_cfa_offset 0x48 - movq %rsp,%rbp -.cfi_def_cfa_register %rbp - movl 240(%rcx),%eax - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - movq %r8,%rbx - shrq $4,%r14 - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %edx,%r10d - call _bsaes_key_convert - pxor (%rsp),%xmm7 - movdqa %xmm6,(%rax) - movdqa %xmm7,(%rsp) - - movdqu (%rbx),%xmm14 - subq $8,%r14 - jc .Lcbc_dec_loop_done - -.Lcbc_dec_loop: - movdqu 0(%r12),%xmm15 - movdqu 16(%r12),%xmm0 - movdqu 32(%r12),%xmm1 - movdqu 48(%r12),%xmm2 - movdqu 64(%r12),%xmm3 - movdqu 80(%r12),%xmm4 - movq %rsp,%rax - movdqu 96(%r12),%xmm5 - movl %edx,%r10d - movdqu 112(%r12),%xmm6 - movdqa %xmm14,32(%rbp) - - call _bsaes_decrypt8 - - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm1 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm6 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm2 - movdqu 112(%r12),%xmm14 - pxor %xmm13,%xmm4 - movdqu %xmm15,0(%r13) - leaq 128(%r12),%r12 - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - subq $8,%r14 - jnc .Lcbc_dec_loop - -.Lcbc_dec_loop_done: - addq $8,%r14 - jz .Lcbc_dec_done - - movdqu 0(%r12),%xmm15 - movq %rsp,%rax - movl %edx,%r10d - cmpq $2,%r14 - jb .Lcbc_dec_one - movdqu 16(%r12),%xmm0 - je .Lcbc_dec_two - movdqu 32(%r12),%xmm1 - cmpq $4,%r14 - jb .Lcbc_dec_three - movdqu 48(%r12),%xmm2 - je .Lcbc_dec_four - movdqu 64(%r12),%xmm3 - cmpq $6,%r14 - jb .Lcbc_dec_five - movdqu 80(%r12),%xmm4 - je .Lcbc_dec_six - movdqu 96(%r12),%xmm5 - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm1 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm6 - movdqu 96(%r12),%xmm14 - pxor %xmm12,%xmm2 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_six: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm1 - movdqu 80(%r12),%xmm14 - pxor %xmm11,%xmm6 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_five: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm14 - pxor %xmm10,%xmm1 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_four: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm14 - pxor %xmm9,%xmm3 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_three: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm14 - pxor %xmm8,%xmm5 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_two: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm14 - pxor %xmm7,%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_one: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm14 - movdqu %xmm15,0(%r13) - jmp .Lcbc_dec_done - -.Lcbc_dec_done: - movdqu %xmm14,(%rbx) - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -.Lcbc_dec_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja .Lcbc_dec_bzero - - leaq 120(%rbp),%rax -.cfi_def_cfa %rax,8 - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbx -.cfi_restore %rbx - movq -8(%rax),%rbp -.cfi_restore %rbp - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lcbc_dec_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt - -.globl bsaes_ctr32_encrypt_blocks -.hidden bsaes_ctr32_encrypt_blocks -.type bsaes_ctr32_encrypt_blocks,@function -.align 16 -bsaes_ctr32_encrypt_blocks: -.cfi_startproc -#ifndef NDEBUG -#ifndef BORINGSSL_FIPS -.extern BORINGSSL_function_hit -.hidden BORINGSSL_function_hit - movb $1,BORINGSSL_function_hit+6(%rip) -#endif -#endif - movq %rsp,%rax -.Lctr_enc_prologue: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - leaq -72(%rsp),%rsp -.cfi_adjust_cfa_offset 0x48 - movq %rsp,%rbp -.cfi_def_cfa_register %rbp - movdqu (%r8),%xmm0 - movl 240(%rcx),%eax - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - movdqa %xmm0,32(%rbp) - - - - movl %eax,%ebx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %ebx,%r10d - call _bsaes_key_convert - pxor %xmm6,%xmm7 - movdqa %xmm7,(%rax) - - movdqa (%rsp),%xmm8 - leaq .LADD1(%rip),%r11 - movdqa 32(%rbp),%xmm15 - movdqa -32(%r11),%xmm7 -.byte 102,68,15,56,0,199 -.byte 102,68,15,56,0,255 - movdqa %xmm8,(%rsp) - jmp .Lctr_enc_loop -.align 16 -.Lctr_enc_loop: - movdqa %xmm15,32(%rbp) - movdqa %xmm15,%xmm0 - movdqa %xmm15,%xmm1 - paddd 0(%r11),%xmm0 - movdqa %xmm15,%xmm2 - paddd 16(%r11),%xmm1 - movdqa %xmm15,%xmm3 - paddd 32(%r11),%xmm2 - movdqa %xmm15,%xmm4 - paddd 48(%r11),%xmm3 - movdqa %xmm15,%xmm5 - paddd 64(%r11),%xmm4 - movdqa %xmm15,%xmm6 - paddd 80(%r11),%xmm5 - paddd 96(%r11),%xmm6 - - - - movdqa (%rsp),%xmm8 - leaq 16(%rsp),%rax - movdqa -16(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq .LBS0(%rip),%r11 - movl %ebx,%r10d - - call _bsaes_encrypt8_bitslice - - subq $8,%r14 - jc .Lctr_enc_loop_done - - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - movdqu 32(%r12),%xmm9 - movdqu 48(%r12),%xmm10 - movdqu 64(%r12),%xmm11 - movdqu 80(%r12),%xmm12 - movdqu 96(%r12),%xmm13 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - pxor %xmm15,%xmm7 - movdqa 32(%rbp),%xmm15 - pxor %xmm8,%xmm0 - movdqu %xmm7,0(%r13) - pxor %xmm9,%xmm3 - movdqu %xmm0,16(%r13) - pxor %xmm10,%xmm5 - movdqu %xmm3,32(%r13) - pxor %xmm11,%xmm2 - movdqu %xmm5,48(%r13) - pxor %xmm12,%xmm6 - movdqu %xmm2,64(%r13) - pxor %xmm13,%xmm1 - movdqu %xmm6,80(%r13) - pxor %xmm14,%xmm4 - movdqu %xmm1,96(%r13) - leaq .LADD1(%rip),%r11 - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - paddd 112(%r11),%xmm15 - jnz .Lctr_enc_loop - - jmp .Lctr_enc_done -.align 16 -.Lctr_enc_loop_done: - addq $8,%r14 - movdqu 0(%r12),%xmm7 - pxor %xmm7,%xmm15 - movdqu %xmm15,0(%r13) - cmpq $2,%r14 - jb .Lctr_enc_done - movdqu 16(%r12),%xmm8 - pxor %xmm8,%xmm0 - movdqu %xmm0,16(%r13) - je .Lctr_enc_done - movdqu 32(%r12),%xmm9 - pxor %xmm9,%xmm3 - movdqu %xmm3,32(%r13) - cmpq $4,%r14 - jb .Lctr_enc_done - movdqu 48(%r12),%xmm10 - pxor %xmm10,%xmm5 - movdqu %xmm5,48(%r13) - je .Lctr_enc_done - movdqu 64(%r12),%xmm11 - pxor %xmm11,%xmm2 - movdqu %xmm2,64(%r13) - cmpq $6,%r14 - jb .Lctr_enc_done - movdqu 80(%r12),%xmm12 - pxor %xmm12,%xmm6 - movdqu %xmm6,80(%r13) - je .Lctr_enc_done - movdqu 96(%r12),%xmm13 - pxor %xmm13,%xmm1 - movdqu %xmm1,96(%r13) - - - -.Lctr_enc_done: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -.Lctr_enc_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja .Lctr_enc_bzero - - leaq 120(%rbp),%rax -.cfi_def_cfa %rax,8 - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbx -.cfi_restore %rbx - movq -8(%rax),%rbp -.cfi_restore %rbp - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lctr_enc_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks -.type _bsaes_const,@object -.align 64 -_bsaes_const: -.LM0ISR: -.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 -.LISRM0: -.quad 0x01040b0e0205080f, 0x0306090c00070a0d -.LISR: -.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 -.LBS0: -.quad 0x5555555555555555, 0x5555555555555555 -.LBS1: -.quad 0x3333333333333333, 0x3333333333333333 -.LBS2: -.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f -.LSR: -.quad 0x0504070600030201, 0x0f0e0d0c0a09080b -.LSRM0: -.quad 0x0304090e00050a0f, 0x01060b0c0207080d -.LM0SR: -.quad 0x0a0e02060f03070b, 0x0004080c05090d01 -.LSWPUP: -.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 -.LSWPUPM0SR: -.quad 0x0a0d02060c03070b, 0x0004080f05090e01 -.LADD1: -.quad 0x0000000000000000, 0x0000000100000000 -.LADD2: -.quad 0x0000000000000000, 0x0000000200000000 -.LADD3: -.quad 0x0000000000000000, 0x0000000300000000 -.LADD4: -.quad 0x0000000000000000, 0x0000000400000000 -.LADD5: -.quad 0x0000000000000000, 0x0000000500000000 -.LADD6: -.quad 0x0000000000000000, 0x0000000600000000 -.LADD7: -.quad 0x0000000000000000, 0x0000000700000000 -.LADD8: -.quad 0x0000000000000000, 0x0000000800000000 -.Lxts_magic: -.long 0x87,0,1,0 -.Lmasks: -.quad 0x0101010101010101, 0x0101010101010101 -.quad 0x0202020202020202, 0x0202020202020202 -.quad 0x0404040404040404, 0x0404040404040404 -.quad 0x0808080808080808, 0x0808080808080808 -.LM0: -.quad 0x02060a0e03070b0f, 0x0004080c0105090d -.L63: -.quad 0x6363636363636363, 0x6363636363636363 -.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0 -.align 64 -.size _bsaes_const,.-_bsaes_const -#endif diff --git a/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S b/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S index 4355438e..0fc93f9a 100644 --- a/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S +++ b/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S @@ -120,6 +120,181 @@ _vpaes_encrypt_core: + + + + + + + + + + + + + + + + + + + + + + + + +.type _vpaes_encrypt_core_2x,@function +.align 16 +_vpaes_encrypt_core_2x: +.cfi_startproc + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa .Lk_ipt(%rip),%xmm2 + movdqa %xmm2,%xmm8 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + movdqu (%r9),%xmm5 + + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,208 +.byte 102,68,15,56,0,198 + movdqa .Lk_ipt+16(%rip),%xmm0 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,247 + pxor %xmm5,%xmm2 + pxor %xmm5,%xmm8 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 + leaq .Lk_mc_backward(%rip),%r10 + jmp .Lenc2x_entry + +.align 16 +.Lenc2x_loop: + + movdqa .Lk_sb1(%rip),%xmm4 + movdqa .Lk_sb1+16(%rip),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 + movdqa .Lk_sb2(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 + movdqa -64(%r11,%r10,1),%xmm1 + +.byte 102,15,56,0,234 +.byte 102,69,15,56,0,232 + movdqa (%r11,%r10,1),%xmm4 + + movdqa .Lk_sb2+16(%rip),%xmm2 + movdqa %xmm2,%xmm8 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm0,%xmm3 + movdqa %xmm6,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm13,%xmm8 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 +.byte 102,15,56,0,220 +.byte 102,68,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 + pxor %xmm6,%xmm11 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + pxor %xmm11,%xmm6 + +.Lenc2x_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa .Lk_inv+16(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,232 +.byte 102,68,15,56,0,238 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm1,%xmm0 + pxor %xmm7,%xmm6 +.byte 102,15,56,0,217 +.byte 102,68,15,56,0,223 + movdqa %xmm10,%xmm4 + movdqa %xmm10,%xmm12 + pxor %xmm5,%xmm3 + pxor %xmm13,%xmm11 +.byte 102,15,56,0,224 +.byte 102,68,15,56,0,230 + movdqa %xmm10,%xmm2 + movdqa %xmm10,%xmm8 + pxor %xmm5,%xmm4 + pxor %xmm13,%xmm12 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm0,%xmm2 + pxor %xmm6,%xmm8 +.byte 102,15,56,0,220 +.byte 102,69,15,56,0,220 + movdqu (%r9),%xmm5 + + pxor %xmm1,%xmm3 + pxor %xmm7,%xmm11 + jnz .Lenc2x_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + movdqa 64(%r11,%r10,1),%xmm1 + + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + .byte 0xf3,0xc3 +.cfi_endproc +.size _vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x + + + + + + .type _vpaes_decrypt_core,@function .align 16 _vpaes_decrypt_core: @@ -759,6 +934,69 @@ vpaes_cbc_encrypt: .byte 0xf3,0xc3 .cfi_endproc .size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt +.globl vpaes_ctr32_encrypt_blocks +.hidden vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,@function +.align 16 +vpaes_ctr32_encrypt_blocks: +.cfi_startproc + + xchgq %rcx,%rdx + testq %rcx,%rcx + jz .Lctr32_abort + movdqu (%r8),%xmm0 + movdqa .Lctr_add_one(%rip),%xmm8 + subq %rdi,%rsi + call _vpaes_preheat + movdqa %xmm0,%xmm6 + pshufb .Lrev_ctr(%rip),%xmm6 + + testq $1,%rcx + jz .Lctr32_prep_loop + + + + movdqu (%rdi),%xmm7 + call _vpaes_encrypt_core + pxor %xmm7,%xmm0 + paddd %xmm8,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + subq $1,%rcx + leaq 16(%rdi),%rdi + jz .Lctr32_done + +.Lctr32_prep_loop: + + + movdqa %xmm6,%xmm14 + movdqa %xmm6,%xmm15 + paddd %xmm8,%xmm15 + +.Lctr32_loop: + movdqa .Lrev_ctr(%rip),%xmm1 + movdqa %xmm14,%xmm0 + movdqa %xmm15,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + call _vpaes_encrypt_core_2x + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa .Lctr_add_two(%rip),%xmm3 + pxor %xmm1,%xmm0 + pxor %xmm2,%xmm6 + paddd %xmm3,%xmm14 + paddd %xmm3,%xmm15 + movdqu %xmm0,(%rsi,%rdi,1) + movdqu %xmm6,16(%rsi,%rdi,1) + subq $2,%rcx + leaq 32(%rdi),%rdi + jnz .Lctr32_loop + +.Lctr32_done: +.Lctr32_abort: + .byte 0xf3,0xc3 +.cfi_endproc +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks @@ -881,6 +1119,17 @@ _vpaes_consts: .Lk_dsbo: .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C + + +.Lrev_ctr: +.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 + + +.Lctr_add_one: +.quad 0x0000000000000000, 0x0000000100000000 +.Lctr_add_two: +.quad 0x0000000000000000, 0x0000000200000000 + .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .align 64 .size _vpaes_consts,.-_vpaes_consts diff --git a/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S deleted file mode 100644 index c2807e38..00000000 --- a/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S +++ /dev/null @@ -1,1609 +0,0 @@ -# This file is generated from a similarly-named Perl script in the BoringSSL -# source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include <boringssl_prefix_symbols_asm.h> -#endif -.text - - -.p2align 6 -_bsaes_encrypt8: - - leaq L$BS0(%rip),%r11 - - movdqa (%rax),%xmm8 - leaq 16(%rax),%rax - movdqa 80(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 -_bsaes_encrypt8_bitslice: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $1,%xmm3 - pxor %xmm6,%xmm5 - pxor %xmm4,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm6 - psllq $1,%xmm5 - pxor %xmm3,%xmm4 - psllq $1,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm2,%xmm1 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm1 - pand %xmm7,%xmm15 - pxor %xmm1,%xmm2 - psllq $1,%xmm1 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm4,%xmm9 - psrlq $2,%xmm4 - movdqa %xmm3,%xmm10 - psrlq $2,%xmm3 - pxor %xmm6,%xmm4 - pxor %xmm5,%xmm3 - pand %xmm8,%xmm4 - pand %xmm8,%xmm3 - pxor %xmm4,%xmm6 - psllq $2,%xmm4 - pxor %xmm3,%xmm5 - psllq $2,%xmm3 - pxor %xmm9,%xmm4 - pxor %xmm10,%xmm3 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm2,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm2 - psllq $2,%xmm0 - pxor %xmm15,%xmm1 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm2,%xmm9 - psrlq $4,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $4,%xmm1 - pxor %xmm6,%xmm2 - pxor %xmm5,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm6 - psllq $4,%xmm2 - pxor %xmm1,%xmm5 - psllq $4,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm4 - psllq $4,%xmm0 - pxor %xmm15,%xmm3 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - decl %r10d - jmp L$enc_sbox -.p2align 4 -L$enc_loop: - pxor 0(%rax),%xmm15 - pxor 16(%rax),%xmm0 - pxor 32(%rax),%xmm1 - pxor 48(%rax),%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor 64(%rax),%xmm3 - pxor 80(%rax),%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor 96(%rax),%xmm5 - pxor 112(%rax),%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq 128(%rax),%rax -L$enc_sbox: - pxor %xmm5,%xmm4 - pxor %xmm0,%xmm1 - pxor %xmm15,%xmm2 - pxor %xmm1,%xmm5 - pxor %xmm15,%xmm4 - - pxor %xmm2,%xmm5 - pxor %xmm6,%xmm2 - pxor %xmm4,%xmm6 - pxor %xmm3,%xmm2 - pxor %xmm4,%xmm3 - pxor %xmm0,%xmm2 - - pxor %xmm6,%xmm1 - pxor %xmm4,%xmm0 - movdqa %xmm6,%xmm10 - movdqa %xmm0,%xmm9 - movdqa %xmm4,%xmm8 - movdqa %xmm1,%xmm12 - movdqa %xmm5,%xmm11 - - pxor %xmm3,%xmm10 - pxor %xmm1,%xmm9 - pxor %xmm2,%xmm8 - movdqa %xmm10,%xmm13 - pxor %xmm3,%xmm12 - movdqa %xmm9,%xmm7 - pxor %xmm15,%xmm11 - movdqa %xmm10,%xmm14 - - por %xmm8,%xmm9 - por %xmm11,%xmm10 - pxor %xmm7,%xmm14 - pand %xmm11,%xmm13 - pxor %xmm8,%xmm11 - pand %xmm8,%xmm7 - pand %xmm11,%xmm14 - movdqa %xmm2,%xmm11 - pxor %xmm15,%xmm11 - pand %xmm11,%xmm12 - pxor %xmm12,%xmm10 - pxor %xmm12,%xmm9 - movdqa %xmm6,%xmm12 - movdqa %xmm4,%xmm11 - pxor %xmm0,%xmm12 - pxor %xmm5,%xmm11 - movdqa %xmm12,%xmm8 - pand %xmm11,%xmm12 - por %xmm11,%xmm8 - pxor %xmm12,%xmm7 - pxor %xmm14,%xmm10 - pxor %xmm13,%xmm9 - pxor %xmm14,%xmm8 - movdqa %xmm1,%xmm11 - pxor %xmm13,%xmm7 - movdqa %xmm3,%xmm12 - pxor %xmm13,%xmm8 - movdqa %xmm0,%xmm13 - pand %xmm2,%xmm11 - movdqa %xmm6,%xmm14 - pand %xmm15,%xmm12 - pand %xmm4,%xmm13 - por %xmm5,%xmm14 - pxor %xmm11,%xmm10 - pxor %xmm12,%xmm9 - pxor %xmm13,%xmm8 - pxor %xmm14,%xmm7 - - - - - - movdqa %xmm10,%xmm11 - pand %xmm8,%xmm10 - pxor %xmm9,%xmm11 - - movdqa %xmm7,%xmm13 - movdqa %xmm11,%xmm14 - pxor %xmm10,%xmm13 - pand %xmm13,%xmm14 - - movdqa %xmm8,%xmm12 - pxor %xmm9,%xmm14 - pxor %xmm7,%xmm12 - - pxor %xmm9,%xmm10 - - pand %xmm10,%xmm12 - - movdqa %xmm13,%xmm9 - pxor %xmm7,%xmm12 - - pxor %xmm12,%xmm9 - pxor %xmm12,%xmm8 - - pand %xmm7,%xmm9 - - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm8 - - pand %xmm14,%xmm13 - - pxor %xmm11,%xmm13 - movdqa %xmm5,%xmm11 - movdqa %xmm4,%xmm7 - movdqa %xmm14,%xmm9 - pxor %xmm13,%xmm9 - pand %xmm5,%xmm9 - pxor %xmm4,%xmm5 - pand %xmm14,%xmm4 - pand %xmm13,%xmm5 - pxor %xmm4,%xmm5 - pxor %xmm9,%xmm4 - pxor %xmm15,%xmm11 - pxor %xmm2,%xmm7 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm15,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm2,%xmm15 - pand %xmm14,%xmm7 - pand %xmm12,%xmm2 - pand %xmm13,%xmm11 - pand %xmm8,%xmm15 - pxor %xmm11,%xmm7 - pxor %xmm2,%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm2 - pxor %xmm11,%xmm5 - pxor %xmm11,%xmm15 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm2 - - movdqa %xmm6,%xmm11 - movdqa %xmm0,%xmm7 - pxor %xmm3,%xmm11 - pxor %xmm1,%xmm7 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm3,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm1,%xmm3 - pand %xmm14,%xmm7 - pand %xmm12,%xmm1 - pand %xmm13,%xmm11 - pand %xmm8,%xmm3 - pxor %xmm11,%xmm7 - pxor %xmm1,%xmm3 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm1 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - pxor %xmm13,%xmm10 - pand %xmm6,%xmm10 - pxor %xmm0,%xmm6 - pand %xmm14,%xmm0 - pand %xmm13,%xmm6 - pxor %xmm0,%xmm6 - pxor %xmm10,%xmm0 - pxor %xmm11,%xmm6 - pxor %xmm11,%xmm3 - pxor %xmm7,%xmm0 - pxor %xmm7,%xmm1 - pxor %xmm15,%xmm6 - pxor %xmm5,%xmm0 - pxor %xmm6,%xmm3 - pxor %xmm15,%xmm5 - pxor %xmm0,%xmm15 - - pxor %xmm4,%xmm0 - pxor %xmm1,%xmm4 - pxor %xmm2,%xmm1 - pxor %xmm4,%xmm2 - pxor %xmm4,%xmm3 - - pxor %xmm2,%xmm5 - decl %r10d - jl L$enc_done - pshufd $0x93,%xmm15,%xmm7 - pshufd $0x93,%xmm0,%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x93,%xmm3,%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x93,%xmm5,%xmm10 - pxor %xmm9,%xmm3 - pshufd $0x93,%xmm2,%xmm11 - pxor %xmm10,%xmm5 - pshufd $0x93,%xmm6,%xmm12 - pxor %xmm11,%xmm2 - pshufd $0x93,%xmm1,%xmm13 - pxor %xmm12,%xmm6 - pshufd $0x93,%xmm4,%xmm14 - pxor %xmm13,%xmm1 - pxor %xmm14,%xmm4 - - pxor %xmm15,%xmm8 - pxor %xmm4,%xmm7 - pxor %xmm4,%xmm8 - pshufd $0x4E,%xmm15,%xmm15 - pxor %xmm0,%xmm9 - pshufd $0x4E,%xmm0,%xmm0 - pxor %xmm2,%xmm12 - pxor %xmm7,%xmm15 - pxor %xmm6,%xmm13 - pxor %xmm8,%xmm0 - pxor %xmm5,%xmm11 - pshufd $0x4E,%xmm2,%xmm7 - pxor %xmm1,%xmm14 - pshufd $0x4E,%xmm6,%xmm8 - pxor %xmm3,%xmm10 - pshufd $0x4E,%xmm5,%xmm2 - pxor %xmm4,%xmm10 - pshufd $0x4E,%xmm4,%xmm6 - pxor %xmm4,%xmm11 - pshufd $0x4E,%xmm1,%xmm5 - pxor %xmm11,%xmm7 - pshufd $0x4E,%xmm3,%xmm1 - pxor %xmm12,%xmm8 - pxor %xmm10,%xmm2 - pxor %xmm14,%xmm6 - pxor %xmm13,%xmm5 - movdqa %xmm7,%xmm3 - pxor %xmm9,%xmm1 - movdqa %xmm8,%xmm4 - movdqa 48(%r11),%xmm7 - jnz L$enc_loop - movdqa 64(%r11),%xmm7 - jmp L$enc_loop -.p2align 4 -L$enc_done: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm2,%xmm10 - psrlq $1,%xmm2 - pxor %xmm4,%xmm1 - pxor %xmm6,%xmm2 - pand %xmm7,%xmm1 - pand %xmm7,%xmm2 - pxor %xmm1,%xmm4 - psllq $1,%xmm1 - pxor %xmm2,%xmm6 - psllq $1,%xmm2 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm2 - movdqa %xmm3,%xmm9 - psrlq $1,%xmm3 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm5,%xmm3 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm3 - pand %xmm7,%xmm15 - pxor %xmm3,%xmm5 - psllq $1,%xmm3 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm3 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm6,%xmm9 - psrlq $2,%xmm6 - movdqa %xmm2,%xmm10 - psrlq $2,%xmm2 - pxor %xmm4,%xmm6 - pxor %xmm1,%xmm2 - pand %xmm8,%xmm6 - pand %xmm8,%xmm2 - pxor %xmm6,%xmm4 - psllq $2,%xmm6 - pxor %xmm2,%xmm1 - psllq $2,%xmm2 - pxor %xmm9,%xmm6 - pxor %xmm10,%xmm2 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm5,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm5 - psllq $2,%xmm0 - pxor %xmm15,%xmm3 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm5,%xmm9 - psrlq $4,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $4,%xmm3 - pxor %xmm4,%xmm5 - pxor %xmm1,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm4 - psllq $4,%xmm5 - pxor %xmm3,%xmm1 - psllq $4,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm6,%xmm0 - pxor %xmm2,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm6 - psllq $4,%xmm0 - pxor %xmm15,%xmm2 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa (%rax),%xmm7 - pxor %xmm7,%xmm3 - pxor %xmm7,%xmm5 - pxor %xmm7,%xmm2 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm1 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm15 - pxor %xmm7,%xmm0 - .byte 0xf3,0xc3 - - - - -.p2align 6 -_bsaes_decrypt8: - - leaq L$BS0(%rip),%r11 - - movdqa (%rax),%xmm8 - leaq 16(%rax),%rax - movdqa -48(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm3,%xmm10 - psrlq $1,%xmm3 - pxor %xmm6,%xmm5 - pxor %xmm4,%xmm3 - pand %xmm7,%xmm5 - pand %xmm7,%xmm3 - pxor %xmm5,%xmm6 - psllq $1,%xmm5 - pxor %xmm3,%xmm4 - psllq $1,%xmm3 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm3 - movdqa %xmm1,%xmm9 - psrlq $1,%xmm1 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm2,%xmm1 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm1 - pand %xmm7,%xmm15 - pxor %xmm1,%xmm2 - psllq $1,%xmm1 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm1 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm4,%xmm9 - psrlq $2,%xmm4 - movdqa %xmm3,%xmm10 - psrlq $2,%xmm3 - pxor %xmm6,%xmm4 - pxor %xmm5,%xmm3 - pand %xmm8,%xmm4 - pand %xmm8,%xmm3 - pxor %xmm4,%xmm6 - psllq $2,%xmm4 - pxor %xmm3,%xmm5 - psllq $2,%xmm3 - pxor %xmm9,%xmm4 - pxor %xmm10,%xmm3 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm2,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm2 - psllq $2,%xmm0 - pxor %xmm15,%xmm1 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm2,%xmm9 - psrlq $4,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $4,%xmm1 - pxor %xmm6,%xmm2 - pxor %xmm5,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm6 - psllq $4,%xmm2 - pxor %xmm1,%xmm5 - psllq $4,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm4 - psllq $4,%xmm0 - pxor %xmm15,%xmm3 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - decl %r10d - jmp L$dec_sbox -.p2align 4 -L$dec_loop: - pxor 0(%rax),%xmm15 - pxor 16(%rax),%xmm0 - pxor 32(%rax),%xmm1 - pxor 48(%rax),%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor 64(%rax),%xmm3 - pxor 80(%rax),%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor 96(%rax),%xmm5 - pxor 112(%rax),%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq 128(%rax),%rax -L$dec_sbox: - pxor %xmm3,%xmm2 - - pxor %xmm6,%xmm3 - pxor %xmm6,%xmm1 - pxor %xmm3,%xmm5 - pxor %xmm5,%xmm6 - pxor %xmm6,%xmm0 - - pxor %xmm0,%xmm15 - pxor %xmm4,%xmm1 - pxor %xmm15,%xmm2 - pxor %xmm15,%xmm4 - pxor %xmm2,%xmm0 - movdqa %xmm2,%xmm10 - movdqa %xmm6,%xmm9 - movdqa %xmm0,%xmm8 - movdqa %xmm3,%xmm12 - movdqa %xmm4,%xmm11 - - pxor %xmm15,%xmm10 - pxor %xmm3,%xmm9 - pxor %xmm5,%xmm8 - movdqa %xmm10,%xmm13 - pxor %xmm15,%xmm12 - movdqa %xmm9,%xmm7 - pxor %xmm1,%xmm11 - movdqa %xmm10,%xmm14 - - por %xmm8,%xmm9 - por %xmm11,%xmm10 - pxor %xmm7,%xmm14 - pand %xmm11,%xmm13 - pxor %xmm8,%xmm11 - pand %xmm8,%xmm7 - pand %xmm11,%xmm14 - movdqa %xmm5,%xmm11 - pxor %xmm1,%xmm11 - pand %xmm11,%xmm12 - pxor %xmm12,%xmm10 - pxor %xmm12,%xmm9 - movdqa %xmm2,%xmm12 - movdqa %xmm0,%xmm11 - pxor %xmm6,%xmm12 - pxor %xmm4,%xmm11 - movdqa %xmm12,%xmm8 - pand %xmm11,%xmm12 - por %xmm11,%xmm8 - pxor %xmm12,%xmm7 - pxor %xmm14,%xmm10 - pxor %xmm13,%xmm9 - pxor %xmm14,%xmm8 - movdqa %xmm3,%xmm11 - pxor %xmm13,%xmm7 - movdqa %xmm15,%xmm12 - pxor %xmm13,%xmm8 - movdqa %xmm6,%xmm13 - pand %xmm5,%xmm11 - movdqa %xmm2,%xmm14 - pand %xmm1,%xmm12 - pand %xmm0,%xmm13 - por %xmm4,%xmm14 - pxor %xmm11,%xmm10 - pxor %xmm12,%xmm9 - pxor %xmm13,%xmm8 - pxor %xmm14,%xmm7 - - - - - - movdqa %xmm10,%xmm11 - pand %xmm8,%xmm10 - pxor %xmm9,%xmm11 - - movdqa %xmm7,%xmm13 - movdqa %xmm11,%xmm14 - pxor %xmm10,%xmm13 - pand %xmm13,%xmm14 - - movdqa %xmm8,%xmm12 - pxor %xmm9,%xmm14 - pxor %xmm7,%xmm12 - - pxor %xmm9,%xmm10 - - pand %xmm10,%xmm12 - - movdqa %xmm13,%xmm9 - pxor %xmm7,%xmm12 - - pxor %xmm12,%xmm9 - pxor %xmm12,%xmm8 - - pand %xmm7,%xmm9 - - pxor %xmm9,%xmm13 - pxor %xmm9,%xmm8 - - pand %xmm14,%xmm13 - - pxor %xmm11,%xmm13 - movdqa %xmm4,%xmm11 - movdqa %xmm0,%xmm7 - movdqa %xmm14,%xmm9 - pxor %xmm13,%xmm9 - pand %xmm4,%xmm9 - pxor %xmm0,%xmm4 - pand %xmm14,%xmm0 - pand %xmm13,%xmm4 - pxor %xmm0,%xmm4 - pxor %xmm9,%xmm0 - pxor %xmm1,%xmm11 - pxor %xmm5,%xmm7 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm1,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm5,%xmm1 - pand %xmm14,%xmm7 - pand %xmm12,%xmm5 - pand %xmm13,%xmm11 - pand %xmm8,%xmm1 - pxor %xmm11,%xmm7 - pxor %xmm5,%xmm1 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm5 - pxor %xmm11,%xmm4 - pxor %xmm11,%xmm1 - pxor %xmm7,%xmm0 - pxor %xmm7,%xmm5 - - movdqa %xmm2,%xmm11 - movdqa %xmm6,%xmm7 - pxor %xmm15,%xmm11 - pxor %xmm3,%xmm7 - movdqa %xmm14,%xmm10 - movdqa %xmm12,%xmm9 - pxor %xmm13,%xmm10 - pxor %xmm8,%xmm9 - pand %xmm11,%xmm10 - pand %xmm15,%xmm9 - pxor %xmm7,%xmm11 - pxor %xmm3,%xmm15 - pand %xmm14,%xmm7 - pand %xmm12,%xmm3 - pand %xmm13,%xmm11 - pand %xmm8,%xmm15 - pxor %xmm11,%xmm7 - pxor %xmm3,%xmm15 - pxor %xmm10,%xmm11 - pxor %xmm9,%xmm3 - pxor %xmm12,%xmm14 - pxor %xmm8,%xmm13 - movdqa %xmm14,%xmm10 - pxor %xmm13,%xmm10 - pand %xmm2,%xmm10 - pxor %xmm6,%xmm2 - pand %xmm14,%xmm6 - pand %xmm13,%xmm2 - pxor %xmm6,%xmm2 - pxor %xmm10,%xmm6 - pxor %xmm11,%xmm2 - pxor %xmm11,%xmm15 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm3 - pxor %xmm6,%xmm0 - pxor %xmm4,%xmm5 - - pxor %xmm0,%xmm3 - pxor %xmm6,%xmm1 - pxor %xmm6,%xmm4 - pxor %xmm1,%xmm3 - pxor %xmm15,%xmm6 - pxor %xmm4,%xmm3 - pxor %xmm5,%xmm2 - pxor %xmm0,%xmm5 - pxor %xmm3,%xmm2 - - pxor %xmm15,%xmm3 - pxor %xmm2,%xmm6 - decl %r10d - jl L$dec_done - - pshufd $0x4E,%xmm15,%xmm7 - pshufd $0x4E,%xmm2,%xmm13 - pxor %xmm15,%xmm7 - pshufd $0x4E,%xmm4,%xmm14 - pxor %xmm2,%xmm13 - pshufd $0x4E,%xmm0,%xmm8 - pxor %xmm4,%xmm14 - pshufd $0x4E,%xmm5,%xmm9 - pxor %xmm0,%xmm8 - pshufd $0x4E,%xmm3,%xmm10 - pxor %xmm5,%xmm9 - pxor %xmm13,%xmm15 - pxor %xmm13,%xmm0 - pshufd $0x4E,%xmm1,%xmm11 - pxor %xmm3,%xmm10 - pxor %xmm7,%xmm5 - pxor %xmm8,%xmm3 - pshufd $0x4E,%xmm6,%xmm12 - pxor %xmm1,%xmm11 - pxor %xmm14,%xmm0 - pxor %xmm9,%xmm1 - pxor %xmm6,%xmm12 - - pxor %xmm14,%xmm5 - pxor %xmm13,%xmm3 - pxor %xmm13,%xmm1 - pxor %xmm10,%xmm6 - pxor %xmm11,%xmm2 - pxor %xmm14,%xmm1 - pxor %xmm14,%xmm6 - pxor %xmm12,%xmm4 - pshufd $0x93,%xmm15,%xmm7 - pshufd $0x93,%xmm0,%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x93,%xmm5,%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x93,%xmm3,%xmm10 - pxor %xmm9,%xmm5 - pshufd $0x93,%xmm1,%xmm11 - pxor %xmm10,%xmm3 - pshufd $0x93,%xmm6,%xmm12 - pxor %xmm11,%xmm1 - pshufd $0x93,%xmm2,%xmm13 - pxor %xmm12,%xmm6 - pshufd $0x93,%xmm4,%xmm14 - pxor %xmm13,%xmm2 - pxor %xmm14,%xmm4 - - pxor %xmm15,%xmm8 - pxor %xmm4,%xmm7 - pxor %xmm4,%xmm8 - pshufd $0x4E,%xmm15,%xmm15 - pxor %xmm0,%xmm9 - pshufd $0x4E,%xmm0,%xmm0 - pxor %xmm1,%xmm12 - pxor %xmm7,%xmm15 - pxor %xmm6,%xmm13 - pxor %xmm8,%xmm0 - pxor %xmm3,%xmm11 - pshufd $0x4E,%xmm1,%xmm7 - pxor %xmm2,%xmm14 - pshufd $0x4E,%xmm6,%xmm8 - pxor %xmm5,%xmm10 - pshufd $0x4E,%xmm3,%xmm1 - pxor %xmm4,%xmm10 - pshufd $0x4E,%xmm4,%xmm6 - pxor %xmm4,%xmm11 - pshufd $0x4E,%xmm2,%xmm3 - pxor %xmm11,%xmm7 - pshufd $0x4E,%xmm5,%xmm2 - pxor %xmm12,%xmm8 - pxor %xmm1,%xmm10 - pxor %xmm14,%xmm6 - pxor %xmm3,%xmm13 - movdqa %xmm7,%xmm3 - pxor %xmm9,%xmm2 - movdqa %xmm13,%xmm5 - movdqa %xmm8,%xmm4 - movdqa %xmm2,%xmm1 - movdqa %xmm10,%xmm2 - movdqa -16(%r11),%xmm7 - jnz L$dec_loop - movdqa -32(%r11),%xmm7 - jmp L$dec_loop -.p2align 4 -L$dec_done: - movdqa 0(%r11),%xmm7 - movdqa 16(%r11),%xmm8 - movdqa %xmm2,%xmm9 - psrlq $1,%xmm2 - movdqa %xmm1,%xmm10 - psrlq $1,%xmm1 - pxor %xmm4,%xmm2 - pxor %xmm6,%xmm1 - pand %xmm7,%xmm2 - pand %xmm7,%xmm1 - pxor %xmm2,%xmm4 - psllq $1,%xmm2 - pxor %xmm1,%xmm6 - psllq $1,%xmm1 - pxor %xmm9,%xmm2 - pxor %xmm10,%xmm1 - movdqa %xmm5,%xmm9 - psrlq $1,%xmm5 - movdqa %xmm15,%xmm10 - psrlq $1,%xmm15 - pxor %xmm3,%xmm5 - pxor %xmm0,%xmm15 - pand %xmm7,%xmm5 - pand %xmm7,%xmm15 - pxor %xmm5,%xmm3 - psllq $1,%xmm5 - pxor %xmm15,%xmm0 - psllq $1,%xmm15 - pxor %xmm9,%xmm5 - pxor %xmm10,%xmm15 - movdqa 32(%r11),%xmm7 - movdqa %xmm6,%xmm9 - psrlq $2,%xmm6 - movdqa %xmm1,%xmm10 - psrlq $2,%xmm1 - pxor %xmm4,%xmm6 - pxor %xmm2,%xmm1 - pand %xmm8,%xmm6 - pand %xmm8,%xmm1 - pxor %xmm6,%xmm4 - psllq $2,%xmm6 - pxor %xmm1,%xmm2 - psllq $2,%xmm1 - pxor %xmm9,%xmm6 - pxor %xmm10,%xmm1 - movdqa %xmm0,%xmm9 - psrlq $2,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $2,%xmm15 - pxor %xmm3,%xmm0 - pxor %xmm5,%xmm15 - pand %xmm8,%xmm0 - pand %xmm8,%xmm15 - pxor %xmm0,%xmm3 - psllq $2,%xmm0 - pxor %xmm15,%xmm5 - psllq $2,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa %xmm3,%xmm9 - psrlq $4,%xmm3 - movdqa %xmm5,%xmm10 - psrlq $4,%xmm5 - pxor %xmm4,%xmm3 - pxor %xmm2,%xmm5 - pand %xmm7,%xmm3 - pand %xmm7,%xmm5 - pxor %xmm3,%xmm4 - psllq $4,%xmm3 - pxor %xmm5,%xmm2 - psllq $4,%xmm5 - pxor %xmm9,%xmm3 - pxor %xmm10,%xmm5 - movdqa %xmm0,%xmm9 - psrlq $4,%xmm0 - movdqa %xmm15,%xmm10 - psrlq $4,%xmm15 - pxor %xmm6,%xmm0 - pxor %xmm1,%xmm15 - pand %xmm7,%xmm0 - pand %xmm7,%xmm15 - pxor %xmm0,%xmm6 - psllq $4,%xmm0 - pxor %xmm15,%xmm1 - psllq $4,%xmm15 - pxor %xmm9,%xmm0 - pxor %xmm10,%xmm15 - movdqa (%rax),%xmm7 - pxor %xmm7,%xmm5 - pxor %xmm7,%xmm3 - pxor %xmm7,%xmm1 - pxor %xmm7,%xmm6 - pxor %xmm7,%xmm2 - pxor %xmm7,%xmm4 - pxor %xmm7,%xmm15 - pxor %xmm7,%xmm0 - .byte 0xf3,0xc3 - - - -.p2align 4 -_bsaes_key_convert: - - leaq L$masks(%rip),%r11 - movdqu (%rcx),%xmm7 - leaq 16(%rcx),%rcx - movdqa 0(%r11),%xmm0 - movdqa 16(%r11),%xmm1 - movdqa 32(%r11),%xmm2 - movdqa 48(%r11),%xmm3 - movdqa 64(%r11),%xmm4 - pcmpeqd %xmm5,%xmm5 - - movdqu (%rcx),%xmm6 - movdqa %xmm7,(%rax) - leaq 16(%rax),%rax - decl %r10d - jmp L$key_loop -.p2align 4 -L$key_loop: -.byte 102,15,56,0,244 - - movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - - pand %xmm6,%xmm8 - pand %xmm6,%xmm9 - movdqa %xmm2,%xmm10 - pcmpeqb %xmm0,%xmm8 - psllq $4,%xmm0 - movdqa %xmm3,%xmm11 - pcmpeqb %xmm1,%xmm9 - psllq $4,%xmm1 - - pand %xmm6,%xmm10 - pand %xmm6,%xmm11 - movdqa %xmm0,%xmm12 - pcmpeqb %xmm2,%xmm10 - psllq $4,%xmm2 - movdqa %xmm1,%xmm13 - pcmpeqb %xmm3,%xmm11 - psllq $4,%xmm3 - - movdqa %xmm2,%xmm14 - movdqa %xmm3,%xmm15 - pxor %xmm5,%xmm8 - pxor %xmm5,%xmm9 - - pand %xmm6,%xmm12 - pand %xmm6,%xmm13 - movdqa %xmm8,0(%rax) - pcmpeqb %xmm0,%xmm12 - psrlq $4,%xmm0 - movdqa %xmm9,16(%rax) - pcmpeqb %xmm1,%xmm13 - psrlq $4,%xmm1 - leaq 16(%rcx),%rcx - - pand %xmm6,%xmm14 - pand %xmm6,%xmm15 - movdqa %xmm10,32(%rax) - pcmpeqb %xmm2,%xmm14 - psrlq $4,%xmm2 - movdqa %xmm11,48(%rax) - pcmpeqb %xmm3,%xmm15 - psrlq $4,%xmm3 - movdqu (%rcx),%xmm6 - - pxor %xmm5,%xmm13 - pxor %xmm5,%xmm14 - movdqa %xmm12,64(%rax) - movdqa %xmm13,80(%rax) - movdqa %xmm14,96(%rax) - movdqa %xmm15,112(%rax) - leaq 128(%rax),%rax - decl %r10d - jnz L$key_loop - - movdqa 80(%r11),%xmm7 - - .byte 0xf3,0xc3 - - -.globl _bsaes_cbc_encrypt -.private_extern _bsaes_cbc_encrypt - -.p2align 4 -_bsaes_cbc_encrypt: - - - - - movq %rsp,%rax -L$cbc_dec_prologue: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - leaq -72(%rsp),%rsp - - movq %rsp,%rbp - - movl 240(%rcx),%eax - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - movq %r8,%rbx - shrq $4,%r14 - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %edx,%r10d - call _bsaes_key_convert - pxor (%rsp),%xmm7 - movdqa %xmm6,(%rax) - movdqa %xmm7,(%rsp) - - movdqu (%rbx),%xmm14 - subq $8,%r14 - jc L$cbc_dec_loop_done - -L$cbc_dec_loop: - movdqu 0(%r12),%xmm15 - movdqu 16(%r12),%xmm0 - movdqu 32(%r12),%xmm1 - movdqu 48(%r12),%xmm2 - movdqu 64(%r12),%xmm3 - movdqu 80(%r12),%xmm4 - movq %rsp,%rax - movdqu 96(%r12),%xmm5 - movl %edx,%r10d - movdqu 112(%r12),%xmm6 - movdqa %xmm14,32(%rbp) - - call _bsaes_decrypt8 - - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm1 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm6 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm2 - movdqu 112(%r12),%xmm14 - pxor %xmm13,%xmm4 - movdqu %xmm15,0(%r13) - leaq 128(%r12),%r12 - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - subq $8,%r14 - jnc L$cbc_dec_loop - -L$cbc_dec_loop_done: - addq $8,%r14 - jz L$cbc_dec_done - - movdqu 0(%r12),%xmm15 - movq %rsp,%rax - movl %edx,%r10d - cmpq $2,%r14 - jb L$cbc_dec_one - movdqu 16(%r12),%xmm0 - je L$cbc_dec_two - movdqu 32(%r12),%xmm1 - cmpq $4,%r14 - jb L$cbc_dec_three - movdqu 48(%r12),%xmm2 - je L$cbc_dec_four - movdqu 64(%r12),%xmm3 - cmpq $6,%r14 - jb L$cbc_dec_five - movdqu 80(%r12),%xmm4 - je L$cbc_dec_six - movdqu 96(%r12),%xmm5 - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm1 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm6 - movdqu 96(%r12),%xmm14 - pxor %xmm12,%xmm2 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_six: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm1 - movdqu 80(%r12),%xmm14 - pxor %xmm11,%xmm6 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_five: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm3 - movdqu 64(%r12),%xmm14 - pxor %xmm10,%xmm1 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_four: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm5 - movdqu 48(%r12),%xmm14 - pxor %xmm9,%xmm3 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_three: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm0 - movdqu 32(%r12),%xmm14 - pxor %xmm8,%xmm5 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_two: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm14 - pxor %xmm7,%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_one: - movdqa %xmm14,32(%rbp) - call _bsaes_decrypt8 - pxor 32(%rbp),%xmm15 - movdqu 0(%r12),%xmm14 - movdqu %xmm15,0(%r13) - jmp L$cbc_dec_done - -L$cbc_dec_done: - movdqu %xmm14,(%rbx) - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -L$cbc_dec_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja L$cbc_dec_bzero - - leaq 120(%rbp),%rax - - movq -48(%rax),%r15 - - movq -40(%rax),%r14 - - movq -32(%rax),%r13 - - movq -24(%rax),%r12 - - movq -16(%rax),%rbx - - movq -8(%rax),%rbp - - leaq (%rax),%rsp - -L$cbc_dec_epilogue: - .byte 0xf3,0xc3 - - - -.globl _bsaes_ctr32_encrypt_blocks -.private_extern _bsaes_ctr32_encrypt_blocks - -.p2align 4 -_bsaes_ctr32_encrypt_blocks: - -#ifndef NDEBUG -#ifndef BORINGSSL_FIPS - - movb $1,_BORINGSSL_function_hit+6(%rip) -#endif -#endif - movq %rsp,%rax -L$ctr_enc_prologue: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - leaq -72(%rsp),%rsp - - movq %rsp,%rbp - - movdqu (%r8),%xmm0 - movl 240(%rcx),%eax - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - movdqa %xmm0,32(%rbp) - - - - movl %eax,%ebx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %ebx,%r10d - call _bsaes_key_convert - pxor %xmm6,%xmm7 - movdqa %xmm7,(%rax) - - movdqa (%rsp),%xmm8 - leaq L$ADD1(%rip),%r11 - movdqa 32(%rbp),%xmm15 - movdqa -32(%r11),%xmm7 -.byte 102,68,15,56,0,199 -.byte 102,68,15,56,0,255 - movdqa %xmm8,(%rsp) - jmp L$ctr_enc_loop -.p2align 4 -L$ctr_enc_loop: - movdqa %xmm15,32(%rbp) - movdqa %xmm15,%xmm0 - movdqa %xmm15,%xmm1 - paddd 0(%r11),%xmm0 - movdqa %xmm15,%xmm2 - paddd 16(%r11),%xmm1 - movdqa %xmm15,%xmm3 - paddd 32(%r11),%xmm2 - movdqa %xmm15,%xmm4 - paddd 48(%r11),%xmm3 - movdqa %xmm15,%xmm5 - paddd 64(%r11),%xmm4 - movdqa %xmm15,%xmm6 - paddd 80(%r11),%xmm5 - paddd 96(%r11),%xmm6 - - - - movdqa (%rsp),%xmm8 - leaq 16(%rsp),%rax - movdqa -16(%r11),%xmm7 - pxor %xmm8,%xmm15 - pxor %xmm8,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm8,%xmm2 -.byte 102,68,15,56,0,255 -.byte 102,15,56,0,199 - pxor %xmm8,%xmm3 - pxor %xmm8,%xmm4 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - pxor %xmm8,%xmm5 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,223 -.byte 102,15,56,0,231 -.byte 102,15,56,0,239 -.byte 102,15,56,0,247 - leaq L$BS0(%rip),%r11 - movl %ebx,%r10d - - call _bsaes_encrypt8_bitslice - - subq $8,%r14 - jc L$ctr_enc_loop_done - - movdqu 0(%r12),%xmm7 - movdqu 16(%r12),%xmm8 - movdqu 32(%r12),%xmm9 - movdqu 48(%r12),%xmm10 - movdqu 64(%r12),%xmm11 - movdqu 80(%r12),%xmm12 - movdqu 96(%r12),%xmm13 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - pxor %xmm15,%xmm7 - movdqa 32(%rbp),%xmm15 - pxor %xmm8,%xmm0 - movdqu %xmm7,0(%r13) - pxor %xmm9,%xmm3 - movdqu %xmm0,16(%r13) - pxor %xmm10,%xmm5 - movdqu %xmm3,32(%r13) - pxor %xmm11,%xmm2 - movdqu %xmm5,48(%r13) - pxor %xmm12,%xmm6 - movdqu %xmm2,64(%r13) - pxor %xmm13,%xmm1 - movdqu %xmm6,80(%r13) - pxor %xmm14,%xmm4 - movdqu %xmm1,96(%r13) - leaq L$ADD1(%rip),%r11 - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - paddd 112(%r11),%xmm15 - jnz L$ctr_enc_loop - - jmp L$ctr_enc_done -.p2align 4 -L$ctr_enc_loop_done: - addq $8,%r14 - movdqu 0(%r12),%xmm7 - pxor %xmm7,%xmm15 - movdqu %xmm15,0(%r13) - cmpq $2,%r14 - jb L$ctr_enc_done - movdqu 16(%r12),%xmm8 - pxor %xmm8,%xmm0 - movdqu %xmm0,16(%r13) - je L$ctr_enc_done - movdqu 32(%r12),%xmm9 - pxor %xmm9,%xmm3 - movdqu %xmm3,32(%r13) - cmpq $4,%r14 - jb L$ctr_enc_done - movdqu 48(%r12),%xmm10 - pxor %xmm10,%xmm5 - movdqu %xmm5,48(%r13) - je L$ctr_enc_done - movdqu 64(%r12),%xmm11 - pxor %xmm11,%xmm2 - movdqu %xmm2,64(%r13) - cmpq $6,%r14 - jb L$ctr_enc_done - movdqu 80(%r12),%xmm12 - pxor %xmm12,%xmm6 - movdqu %xmm6,80(%r13) - je L$ctr_enc_done - movdqu 96(%r12),%xmm13 - pxor %xmm13,%xmm1 - movdqu %xmm1,96(%r13) - - - -L$ctr_enc_done: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -L$ctr_enc_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja L$ctr_enc_bzero - - leaq 120(%rbp),%rax - - movq -48(%rax),%r15 - - movq -40(%rax),%r14 - - movq -32(%rax),%r13 - - movq -24(%rax),%r12 - - movq -16(%rax),%rbx - - movq -8(%rax),%rbp - - leaq (%rax),%rsp - -L$ctr_enc_epilogue: - .byte 0xf3,0xc3 - - - -.p2align 6 -_bsaes_const: -L$M0ISR: -.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 -L$ISRM0: -.quad 0x01040b0e0205080f, 0x0306090c00070a0d -L$ISR: -.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 -L$BS0: -.quad 0x5555555555555555, 0x5555555555555555 -L$BS1: -.quad 0x3333333333333333, 0x3333333333333333 -L$BS2: -.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f -L$SR: -.quad 0x0504070600030201, 0x0f0e0d0c0a09080b -L$SRM0: -.quad 0x0304090e00050a0f, 0x01060b0c0207080d -L$M0SR: -.quad 0x0a0e02060f03070b, 0x0004080c05090d01 -L$SWPUP: -.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 -L$SWPUPM0SR: -.quad 0x0a0d02060c03070b, 0x0004080f05090e01 -L$ADD1: -.quad 0x0000000000000000, 0x0000000100000000 -L$ADD2: -.quad 0x0000000000000000, 0x0000000200000000 -L$ADD3: -.quad 0x0000000000000000, 0x0000000300000000 -L$ADD4: -.quad 0x0000000000000000, 0x0000000400000000 -L$ADD5: -.quad 0x0000000000000000, 0x0000000500000000 -L$ADD6: -.quad 0x0000000000000000, 0x0000000600000000 -L$ADD7: -.quad 0x0000000000000000, 0x0000000700000000 -L$ADD8: -.quad 0x0000000000000000, 0x0000000800000000 -L$xts_magic: -.long 0x87,0,1,0 -L$masks: -.quad 0x0101010101010101, 0x0101010101010101 -.quad 0x0202020202020202, 0x0202020202020202 -.quad 0x0404040404040404, 0x0404040404040404 -.quad 0x0808080808080808, 0x0808080808080808 -L$M0: -.quad 0x02060a0e03070b0f, 0x0004080c0105090d -L$63: -.quad 0x6363636363636363, 0x6363636363636363 -.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44,32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32,65,110,100,121,32,80,111,108,121,97,107,111,118,0 -.p2align 6 - -#endif diff --git a/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S b/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S index 85916188..2f60a22c 100644 --- a/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S +++ b/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S @@ -121,6 +121,181 @@ L$enc_entry: + + + + + + + + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_encrypt_core_2x: + + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa L$k_ipt(%rip),%xmm2 + movdqa %xmm2,%xmm8 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + movdqu (%r9),%xmm5 + + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,208 +.byte 102,68,15,56,0,198 + movdqa L$k_ipt+16(%rip),%xmm0 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,247 + pxor %xmm5,%xmm2 + pxor %xmm5,%xmm8 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 + leaq L$k_mc_backward(%rip),%r10 + jmp L$enc2x_entry + +.p2align 4 +L$enc2x_loop: + + movdqa L$k_sb1(%rip),%xmm4 + movdqa L$k_sb1+16(%rip),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 + movdqa L$k_sb2(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 + movdqa -64(%r11,%r10,1),%xmm1 + +.byte 102,15,56,0,234 +.byte 102,69,15,56,0,232 + movdqa (%r11,%r10,1),%xmm4 + + movdqa L$k_sb2+16(%rip),%xmm2 + movdqa %xmm2,%xmm8 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm0,%xmm3 + movdqa %xmm6,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm13,%xmm8 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 +.byte 102,15,56,0,220 +.byte 102,68,15,56,0,220 + addq $16,%r11 + pxor %xmm0,%xmm3 + pxor %xmm6,%xmm11 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + pxor %xmm11,%xmm6 + +L$enc2x_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa L$k_inv+16(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 +.byte 102,15,56,0,232 +.byte 102,68,15,56,0,238 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm1,%xmm0 + pxor %xmm7,%xmm6 +.byte 102,15,56,0,217 +.byte 102,68,15,56,0,223 + movdqa %xmm10,%xmm4 + movdqa %xmm10,%xmm12 + pxor %xmm5,%xmm3 + pxor %xmm13,%xmm11 +.byte 102,15,56,0,224 +.byte 102,68,15,56,0,230 + movdqa %xmm10,%xmm2 + movdqa %xmm10,%xmm8 + pxor %xmm5,%xmm4 + pxor %xmm13,%xmm12 +.byte 102,15,56,0,211 +.byte 102,69,15,56,0,195 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm0,%xmm2 + pxor %xmm6,%xmm8 +.byte 102,15,56,0,220 +.byte 102,69,15,56,0,220 + movdqu (%r9),%xmm5 + + pxor %xmm1,%xmm3 + pxor %xmm7,%xmm11 + jnz L$enc2x_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 +.byte 102,15,56,0,226 +.byte 102,69,15,56,0,224 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 +.byte 102,15,56,0,195 +.byte 102,65,15,56,0,243 + movdqa 64(%r11,%r10,1),%xmm1 + + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + .byte 0xf3,0xc3 + + + + + + + + + .p2align 4 _vpaes_decrypt_core: @@ -757,6 +932,69 @@ L$cbc_abort: .byte 0xf3,0xc3 +.globl _vpaes_ctr32_encrypt_blocks +.private_extern _vpaes_ctr32_encrypt_blocks + +.p2align 4 +_vpaes_ctr32_encrypt_blocks: + + + xchgq %rcx,%rdx + testq %rcx,%rcx + jz L$ctr32_abort + movdqu (%r8),%xmm0 + movdqa L$ctr_add_one(%rip),%xmm8 + subq %rdi,%rsi + call _vpaes_preheat + movdqa %xmm0,%xmm6 + pshufb L$rev_ctr(%rip),%xmm6 + + testq $1,%rcx + jz L$ctr32_prep_loop + + + + movdqu (%rdi),%xmm7 + call _vpaes_encrypt_core + pxor %xmm7,%xmm0 + paddd %xmm8,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + subq $1,%rcx + leaq 16(%rdi),%rdi + jz L$ctr32_done + +L$ctr32_prep_loop: + + + movdqa %xmm6,%xmm14 + movdqa %xmm6,%xmm15 + paddd %xmm8,%xmm15 + +L$ctr32_loop: + movdqa L$rev_ctr(%rip),%xmm1 + movdqa %xmm14,%xmm0 + movdqa %xmm15,%xmm6 +.byte 102,15,56,0,193 +.byte 102,15,56,0,241 + call _vpaes_encrypt_core_2x + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa L$ctr_add_two(%rip),%xmm3 + pxor %xmm1,%xmm0 + pxor %xmm2,%xmm6 + paddd %xmm3,%xmm14 + paddd %xmm3,%xmm15 + movdqu %xmm0,(%rsi,%rdi,1) + movdqu %xmm6,16(%rsi,%rdi,1) + subq $2,%rcx + leaq 32(%rdi),%rdi + jnz L$ctr32_loop + +L$ctr32_done: +L$ctr32_abort: + .byte 0xf3,0xc3 + + @@ -879,6 +1117,17 @@ L$k_dsbe: L$k_dsbo: .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C + + +L$rev_ctr: +.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 + + +L$ctr_add_one: +.quad 0x0000000000000000, 0x0000000100000000 +L$ctr_add_two: +.quad 0x0000000000000000, 0x0000000200000000 + .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 .p2align 6 @@ -299,7 +299,6 @@ cc_defaults { "linux-x86_64/crypto/fipsmodule/aes-x86_64.S", "linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S", "linux-x86_64/crypto/fipsmodule/aesni-x86_64.S", - "linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S", "linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S", "linux-x86_64/crypto/fipsmodule/ghash-x86_64.S", "linux-x86_64/crypto/fipsmodule/md5-x86_64.S", @@ -293,7 +293,6 @@ linux_x86_64_sources := \ linux-x86_64/crypto/fipsmodule/aes-x86_64.S\ linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S\ linux-x86_64/crypto/fipsmodule/aesni-x86_64.S\ - linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S\ linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S\ linux-x86_64/crypto/fipsmodule/ghash-x86_64.S\ linux-x86_64/crypto/fipsmodule/md5-x86_64.S\ diff --git a/src/crypto/asn1/a_int.c b/src/crypto/asn1/a_int.c index 6dc18bad..7b483f2d 100644 --- a/src/crypto/asn1/a_int.c +++ b/src/crypto/asn1/a_int.c @@ -195,6 +195,16 @@ ASN1_INTEGER *c2i_ASN1_INTEGER(ASN1_INTEGER **a, const unsigned char **pp, unsigned char *to, *s; int i; + /* + * This function can handle lengths up to INT_MAX - 1, but the rest of the + * legacy ASN.1 code mixes integer types, so avoid exposing it to + * ASN1_INTEGERS with larger lengths. + */ + if (len < 0 || len > INT_MAX / 2) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_TOO_LONG); + return NULL; + } + if ((a == NULL) || ((*a) == NULL)) { if ((ret = M_ASN1_INTEGER_new()) == NULL) return (NULL); @@ -276,75 +286,6 @@ ASN1_INTEGER *c2i_ASN1_INTEGER(ASN1_INTEGER **a, const unsigned char **pp, return (NULL); } -/* - * This is a version of d2i_ASN1_INTEGER that ignores the sign bit of ASN1 - * integers: some broken software can encode a positive INTEGER with its MSB - * set as negative (it doesn't add a padding zero). - */ - -ASN1_INTEGER *d2i_ASN1_UINTEGER(ASN1_INTEGER **a, const unsigned char **pp, - long length) -{ - ASN1_INTEGER *ret = NULL; - const unsigned char *p; - unsigned char *s; - long len; - int inf, tag, xclass; - int i; - - if ((a == NULL) || ((*a) == NULL)) { - if ((ret = M_ASN1_INTEGER_new()) == NULL) - return (NULL); - ret->type = V_ASN1_INTEGER; - } else - ret = (*a); - - p = *pp; - inf = ASN1_get_object(&p, &len, &tag, &xclass, length); - if (inf & 0x80) { - i = ASN1_R_BAD_OBJECT_HEADER; - goto err; - } - - if (tag != V_ASN1_INTEGER) { - i = ASN1_R_EXPECTING_AN_INTEGER; - goto err; - } - - /* - * We must OPENSSL_malloc stuff, even for 0 bytes otherwise it signifies - * a missing NULL parameter. - */ - s = (unsigned char *)OPENSSL_malloc((int)len + 1); - if (s == NULL) { - i = ERR_R_MALLOC_FAILURE; - goto err; - } - ret->type = V_ASN1_INTEGER; - if (len) { - if ((*p == 0) && (len != 1)) { - p++; - len--; - } - OPENSSL_memcpy(s, p, (int)len); - p += len; - } - - if (ret->data != NULL) - OPENSSL_free(ret->data); - ret->data = s; - ret->length = (int)len; - if (a != NULL) - (*a) = ret; - *pp = p; - return (ret); - err: - OPENSSL_PUT_ERROR(ASN1, i); - if ((ret != NULL) && ((a == NULL) || (*a != ret))) - M_ASN1_INTEGER_free(ret); - return (NULL); -} - int ASN1_INTEGER_set(ASN1_INTEGER *a, long v) { if (v >= 0) { diff --git a/src/crypto/asn1/asn1_lib.c b/src/crypto/asn1/asn1_lib.c index ea727f33..8526aba3 100644 --- a/src/crypto/asn1/asn1_lib.c +++ b/src/crypto/asn1/asn1_lib.c @@ -205,7 +205,11 @@ static int asn1_get_length(const unsigned char **pp, int *inf, long *rl, } else ret = i; } - if (ret > LONG_MAX) + /* + * Bound the length to comfortably fit in an int. Lengths in this module + * often switch between int and long without overflow checks. + */ + if (ret > INT_MAX / 2) return 0; *pp = p; *rl = (long)ret; diff --git a/src/crypto/cipher_extra/e_aesccm.c b/src/crypto/cipher_extra/e_aesccm.c index 4e6668c0..3e186593 100644 --- a/src/crypto/cipher_extra/e_aesccm.c +++ b/src/crypto/cipher_extra/e_aesccm.c @@ -66,8 +66,7 @@ static int aead_aes_ccm_init(EVP_AEAD_CTX *ctx, const uint8_t *key, struct aead_aes_ccm_ctx *ccm_ctx = (struct aead_aes_ccm_ctx *)&ctx->state; block128_f block; - ctr128_f ctr = aes_ctr_set_key(&ccm_ctx->ks.ks, NULL, &block, key, key_len, - 0 /* small inputs */); + ctr128_f ctr = aes_ctr_set_key(&ccm_ctx->ks.ks, NULL, &block, key, key_len); ctx->tag_len = tag_len; if (!CRYPTO_ccm128_init(&ccm_ctx->ccm, &ccm_ctx->ks.ks, block, ctr, M, L)) { OPENSSL_PUT_ERROR(CIPHER, ERR_R_INTERNAL_ERROR); diff --git a/src/crypto/cipher_extra/e_aesctrhmac.c b/src/crypto/cipher_extra/e_aesctrhmac.c index 0834bd1d..8c45c811 100644 --- a/src/crypto/cipher_extra/e_aesctrhmac.c +++ b/src/crypto/cipher_extra/e_aesctrhmac.c @@ -94,8 +94,8 @@ static int aead_aes_ctr_hmac_sha256_init(EVP_AEAD_CTX *ctx, const uint8_t *key, return 0; } - aes_ctx->ctr = aes_ctr_set_key(&aes_ctx->ks.ks, NULL, &aes_ctx->block, key, - aes_key_len, 1 /* large inputs */); + aes_ctx->ctr = + aes_ctr_set_key(&aes_ctx->ks.ks, NULL, &aes_ctx->block, key, aes_key_len); ctx->tag_len = tag_len; hmac_init(&aes_ctx->inner_init_state, &aes_ctx->outer_init_state, key + aes_key_len); diff --git a/src/crypto/cipher_extra/e_aesgcmsiv.c b/src/crypto/cipher_extra/e_aesgcmsiv.c index fb08a428..71a71fac 100644 --- a/src/crypto/cipher_extra/e_aesgcmsiv.c +++ b/src/crypto/cipher_extra/e_aesgcmsiv.c @@ -595,7 +595,7 @@ static int aead_aes_gcm_siv_init(EVP_AEAD_CTX *ctx, const uint8_t *key, OPENSSL_memset(gcm_siv_ctx, 0, sizeof(struct aead_aes_gcm_siv_ctx)); aes_ctr_set_key(&gcm_siv_ctx->ks.ks, NULL, &gcm_siv_ctx->kgk_block, key, - key_len, 0 /* small inputs */); + key_len); gcm_siv_ctx->is_256 = (key_len == 32); ctx->tag_len = tag_len; @@ -719,8 +719,7 @@ static void gcm_siv_keys( OPENSSL_memcpy(out_keys->auth_key, key_material, 16); aes_ctr_set_key(&out_keys->enc_key.ks, NULL, &out_keys->enc_block, - key_material + 16, gcm_siv_ctx->is_256 ? 32 : 16, - 0 /* small inputs */); + key_material + 16, gcm_siv_ctx->is_256 ? 32 : 16); } static int aead_aes_gcm_siv_seal_scatter( diff --git a/src/crypto/fipsmodule/CMakeLists.txt b/src/crypto/fipsmodule/CMakeLists.txt index fbf25ac8..d1e2cb9d 100644 --- a/src/crypto/fipsmodule/CMakeLists.txt +++ b/src/crypto/fipsmodule/CMakeLists.txt @@ -7,7 +7,6 @@ if(${ARCH} STREQUAL "x86_64") aesni-gcm-x86_64.${ASM_EXT} aesni-x86_64.${ASM_EXT} aes-x86_64.${ASM_EXT} - bsaes-x86_64.${ASM_EXT} ghash-ssse3-x86_64.${ASM_EXT} ghash-x86_64.${ASM_EXT} md5-x86_64.${ASM_EXT} @@ -95,7 +94,6 @@ perlasm(armv4-mont.${ASM_EXT} bn/asm/armv4-mont.pl) perlasm(armv8-mont.${ASM_EXT} bn/asm/armv8-mont.pl) perlasm(bn-586.${ASM_EXT} bn/asm/bn-586.pl) perlasm(bsaes-armv7.${ASM_EXT} aes/asm/bsaes-armv7.pl) -perlasm(bsaes-x86_64.${ASM_EXT} aes/asm/bsaes-x86_64.pl) perlasm(co-586.${ASM_EXT} bn/asm/co-586.pl) perlasm(ghash-armv4.${ASM_EXT} modes/asm/ghash-armv4.pl) perlasm(ghashp8-ppc.${ASM_EXT} modes/asm/ghashp8-ppc.pl) diff --git a/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl b/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl deleted file mode 100644 index 3bb28190..00000000 --- a/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl +++ /dev/null @@ -1,3227 +0,0 @@ -#! /usr/bin/env perl -# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. -# -# Licensed under the OpenSSL license (the "License"). You may not use -# this file except in compliance with the License. You can obtain a copy -# in the file LICENSE in the source distribution or at -# https://www.openssl.org/source/license.html - - -################################################################### -### AES-128 [originally in CTR mode] ### -### bitsliced implementation for Intel Core 2 processors ### -### requires support of SSE extensions up to SSSE3 ### -### Author: Emilia Käsper and Peter Schwabe ### -### Date: 2009-03-19 ### -### Public domain ### -### ### -### See http://homes.esat.kuleuven.be/~ekasper/#software for ### -### further information. ### -################################################################### -# -# September 2011. -# -# Started as transliteration to "perlasm" the original code has -# undergone following changes: -# -# - code was made position-independent; -# - rounds were folded into a loop resulting in >5x size reduction -# from 12.5KB to 2.2KB; -# - above was possibile thanks to mixcolumns() modification that -# allowed to feed its output back to aesenc[last], this was -# achieved at cost of two additional inter-registers moves; -# - some instruction reordering and interleaving; -# - this module doesn't implement key setup subroutine, instead it -# relies on conversion of "conventional" key schedule as returned -# by AES_set_encrypt_key (see discussion below); -# - first and last round keys are treated differently, which allowed -# to skip one shiftrows(), reduce bit-sliced key schedule and -# speed-up conversion by 22%; -# - support for 192- and 256-bit keys was added; -# -# Resulting performance in CPU cycles spent to encrypt one byte out -# of 4096-byte buffer with 128-bit key is: -# -# Emilia's this(*) difference -# -# Core 2 9.30 8.69 +7% -# Nehalem(**) 7.63 6.88 +11% -# Atom 17.1 16.4 +4% -# Silvermont - 12.9 -# Goldmont - 8.85 -# -# (*) Comparison is not completely fair, because "this" is ECB, -# i.e. no extra processing such as counter values calculation -# and xor-ing input as in Emilia's CTR implementation is -# performed. However, the CTR calculations stand for not more -# than 1% of total time, so comparison is *rather* fair. -# -# (**) Results were collected on Westmere, which is considered to -# be equivalent to Nehalem for this code. -# -# As for key schedule conversion subroutine. Interface to OpenSSL -# relies on per-invocation on-the-fly conversion. This naturally -# has impact on performance, especially for short inputs. Conversion -# time in CPU cycles and its ratio to CPU cycles spent in 8x block -# function is: -# -# conversion conversion/8x block -# Core 2 240 0.22 -# Nehalem 180 0.20 -# Atom 430 0.20 -# -# The ratio values mean that 128-byte blocks will be processed -# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, -# etc. Then keep in mind that input sizes not divisible by 128 are -# *effectively* slower, especially shortest ones, e.g. consecutive -# 144-byte blocks are processed 44% slower than one would expect, -# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" -# it's still faster than ["hyper-threading-safe" code path in] -# aes-x86_64.pl on all lengths above 64 bytes... -# -# October 2011. -# -# Add decryption procedure. Performance in CPU cycles spent to decrypt -# one byte out of 4096-byte buffer with 128-bit key is: -# -# Core 2 9.98 -# Nehalem 7.80 -# Atom 17.9 -# Silvermont 14.0 -# Goldmont 10.2 -# -# November 2011. -# -# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is -# suboptimal, but XTS is meant to be used with larger blocks... -# -# <appro@openssl.org> - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; -*STDOUT=*OUT; - -my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); -my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) -my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... -my $xts=0; # Also patch out the XTS subroutines. - -{ -my ($key,$rounds,$const)=("%rax","%r10d","%r11"); - -sub Sbox { -# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb -# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb -my @b=@_[0..7]; -my @t=@_[8..11]; -my @s=@_[12..15]; - &InBasisChange (@b); - &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); - &OutBasisChange (@b[7,1,4,2,6,5,0,3]); -} - -sub InBasisChange { -# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb -# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb -my @b=@_[0..7]; -$code.=<<___; - pxor @b[6], @b[5] - pxor @b[1], @b[2] - pxor @b[0], @b[3] - pxor @b[2], @b[6] - pxor @b[0], @b[5] - - pxor @b[3], @b[6] - pxor @b[7], @b[3] - pxor @b[5], @b[7] - pxor @b[4], @b[3] - pxor @b[5], @b[4] - pxor @b[1], @b[3] - - pxor @b[7], @b[2] - pxor @b[5], @b[1] -___ -} - -sub OutBasisChange { -# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb -# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb -my @b=@_[0..7]; -$code.=<<___; - pxor @b[6], @b[0] - pxor @b[4], @b[1] - pxor @b[0], @b[2] - pxor @b[6], @b[4] - pxor @b[1], @b[6] - - pxor @b[5], @b[1] - pxor @b[3], @b[5] - pxor @b[7], @b[3] - pxor @b[5], @b[7] - pxor @b[5], @b[2] - - pxor @b[7], @b[4] -___ -} - -sub InvSbox { -# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb -# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb -my @b=@_[0..7]; -my @t=@_[8..11]; -my @s=@_[12..15]; - &InvInBasisChange (@b); - &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); - &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); -} - -sub InvInBasisChange { # OutBasisChange in reverse -my @b=@_[5,1,2,6,3,7,0,4]; -$code.=<<___ - pxor @b[7], @b[4] - - pxor @b[5], @b[7] - pxor @b[5], @b[2] - pxor @b[7], @b[3] - pxor @b[3], @b[5] - pxor @b[5], @b[1] - - pxor @b[1], @b[6] - pxor @b[0], @b[2] - pxor @b[6], @b[4] - pxor @b[6], @b[0] - pxor @b[4], @b[1] -___ -} - -sub InvOutBasisChange { # InBasisChange in reverse -my @b=@_[2,5,7,3,6,1,0,4]; -$code.=<<___; - pxor @b[5], @b[1] - pxor @b[7], @b[2] - - pxor @b[1], @b[3] - pxor @b[5], @b[4] - pxor @b[5], @b[7] - pxor @b[4], @b[3] - pxor @b[0], @b[5] - pxor @b[7], @b[3] - pxor @b[2], @b[6] - pxor @b[1], @b[2] - pxor @b[3], @b[6] - - pxor @b[0], @b[3] - pxor @b[6], @b[5] -___ -} - -sub Mul_GF4 { -#;************************************************************* -#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * -#;************************************************************* -my ($x0,$x1,$y0,$y1,$t0)=@_; -$code.=<<___; - movdqa $y0, $t0 - pxor $y1, $t0 - pand $x0, $t0 - pxor $x1, $x0 - pand $y0, $x1 - pand $y1, $x0 - pxor $x1, $x0 - pxor $t0, $x1 -___ -} - -sub Mul_GF4_N { # not used, see next subroutine -# multiply and scale by N -my ($x0,$x1,$y0,$y1,$t0)=@_; -$code.=<<___; - movdqa $y0, $t0 - pxor $y1, $t0 - pand $x0, $t0 - pxor $x1, $x0 - pand $y0, $x1 - pand $y1, $x0 - pxor $x0, $x1 - pxor $t0, $x0 -___ -} - -sub Mul_GF4_N_GF4 { -# interleaved Mul_GF4_N and Mul_GF4 -my ($x0,$x1,$y0,$y1,$t0, - $x2,$x3,$y2,$y3,$t1)=@_; -$code.=<<___; - movdqa $y0, $t0 - movdqa $y2, $t1 - pxor $y1, $t0 - pxor $y3, $t1 - pand $x0, $t0 - pand $x2, $t1 - pxor $x1, $x0 - pxor $x3, $x2 - pand $y0, $x1 - pand $y2, $x3 - pand $y1, $x0 - pand $y3, $x2 - pxor $x0, $x1 - pxor $x3, $x2 - pxor $t0, $x0 - pxor $t1, $x3 -___ -} -sub Mul_GF16_2 { -my @x=@_[0..7]; -my @y=@_[8..11]; -my @t=@_[12..15]; -$code.=<<___; - movdqa @x[0], @t[0] - movdqa @x[1], @t[1] -___ - &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); -$code.=<<___; - pxor @x[2], @t[0] - pxor @x[3], @t[1] - pxor @y[2], @y[0] - pxor @y[3], @y[1] -___ - Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], - @x[2], @x[3], @y[2], @y[3], @t[2]); -$code.=<<___; - pxor @t[0], @x[0] - pxor @t[0], @x[2] - pxor @t[1], @x[1] - pxor @t[1], @x[3] - - movdqa @x[4], @t[0] - movdqa @x[5], @t[1] - pxor @x[6], @t[0] - pxor @x[7], @t[1] -___ - &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], - @x[6], @x[7], @y[2], @y[3], @t[2]); -$code.=<<___; - pxor @y[2], @y[0] - pxor @y[3], @y[1] -___ - &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); -$code.=<<___; - pxor @t[0], @x[4] - pxor @t[0], @x[6] - pxor @t[1], @x[5] - pxor @t[1], @x[7] -___ -} -sub Inv_GF256 { -#;******************************************************************** -#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * -#;******************************************************************** -my @x=@_[0..7]; -my @t=@_[8..11]; -my @s=@_[12..15]; -# direct optimizations from hardware -$code.=<<___; - movdqa @x[4], @t[3] - movdqa @x[5], @t[2] - movdqa @x[1], @t[1] - movdqa @x[7], @s[1] - movdqa @x[0], @s[0] - - pxor @x[6], @t[3] - pxor @x[7], @t[2] - pxor @x[3], @t[1] - movdqa @t[3], @s[2] - pxor @x[6], @s[1] - movdqa @t[2], @t[0] - pxor @x[2], @s[0] - movdqa @t[3], @s[3] - - por @t[1], @t[2] - por @s[0], @t[3] - pxor @t[0], @s[3] - pand @s[0], @s[2] - pxor @t[1], @s[0] - pand @t[1], @t[0] - pand @s[0], @s[3] - movdqa @x[3], @s[0] - pxor @x[2], @s[0] - pand @s[0], @s[1] - pxor @s[1], @t[3] - pxor @s[1], @t[2] - movdqa @x[4], @s[1] - movdqa @x[1], @s[0] - pxor @x[5], @s[1] - pxor @x[0], @s[0] - movdqa @s[1], @t[1] - pand @s[0], @s[1] - por @s[0], @t[1] - pxor @s[1], @t[0] - pxor @s[3], @t[3] - pxor @s[2], @t[2] - pxor @s[3], @t[1] - movdqa @x[7], @s[0] - pxor @s[2], @t[0] - movdqa @x[6], @s[1] - pxor @s[2], @t[1] - movdqa @x[5], @s[2] - pand @x[3], @s[0] - movdqa @x[4], @s[3] - pand @x[2], @s[1] - pand @x[1], @s[2] - por @x[0], @s[3] - pxor @s[0], @t[3] - pxor @s[1], @t[2] - pxor @s[2], @t[1] - pxor @s[3], @t[0] - - #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 - - # new smaller inversion - - movdqa @t[3], @s[0] - pand @t[1], @t[3] - pxor @t[2], @s[0] - - movdqa @t[0], @s[2] - movdqa @s[0], @s[3] - pxor @t[3], @s[2] - pand @s[2], @s[3] - - movdqa @t[1], @s[1] - pxor @t[2], @s[3] - pxor @t[0], @s[1] - - pxor @t[2], @t[3] - - pand @t[3], @s[1] - - movdqa @s[2], @t[2] - pxor @t[0], @s[1] - - pxor @s[1], @t[2] - pxor @s[1], @t[1] - - pand @t[0], @t[2] - - pxor @t[2], @s[2] - pxor @t[2], @t[1] - - pand @s[3], @s[2] - - pxor @s[0], @s[2] -___ -# output in s3, s2, s1, t1 - -# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 - -# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 - &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); - -### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb -} - -# AES linear components - -sub ShiftRows { -my @x=@_[0..7]; -my $mask=pop; -$code.=<<___; - pxor 0x00($key),@x[0] - pxor 0x10($key),@x[1] - pxor 0x20($key),@x[2] - pxor 0x30($key),@x[3] - pshufb $mask,@x[0] - pshufb $mask,@x[1] - pxor 0x40($key),@x[4] - pxor 0x50($key),@x[5] - pshufb $mask,@x[2] - pshufb $mask,@x[3] - pxor 0x60($key),@x[6] - pxor 0x70($key),@x[7] - pshufb $mask,@x[4] - pshufb $mask,@x[5] - pshufb $mask,@x[6] - pshufb $mask,@x[7] - lea 0x80($key),$key -___ -} - -sub MixColumns { -# modified to emit output in order suitable for feeding back to aesenc[last] -my @x=@_[0..7]; -my @t=@_[8..15]; -my $inv=@_[16]; # optional -$code.=<<___; - pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 - pshufd \$0x93, @x[1], @t[1] - pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) - pshufd \$0x93, @x[2], @t[2] - pxor @t[1], @x[1] - pshufd \$0x93, @x[3], @t[3] - pxor @t[2], @x[2] - pshufd \$0x93, @x[4], @t[4] - pxor @t[3], @x[3] - pshufd \$0x93, @x[5], @t[5] - pxor @t[4], @x[4] - pshufd \$0x93, @x[6], @t[6] - pxor @t[5], @x[5] - pshufd \$0x93, @x[7], @t[7] - pxor @t[6], @x[6] - pxor @t[7], @x[7] - - pxor @x[0], @t[1] - pxor @x[7], @t[0] - pxor @x[7], @t[1] - pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) - pxor @x[1], @t[2] - pshufd \$0x4E, @x[1], @x[1] - pxor @x[4], @t[5] - pxor @t[0], @x[0] - pxor @x[5], @t[6] - pxor @t[1], @x[1] - pxor @x[3], @t[4] - pshufd \$0x4E, @x[4], @t[0] - pxor @x[6], @t[7] - pshufd \$0x4E, @x[5], @t[1] - pxor @x[2], @t[3] - pshufd \$0x4E, @x[3], @x[4] - pxor @x[7], @t[3] - pshufd \$0x4E, @x[7], @x[5] - pxor @x[7], @t[4] - pshufd \$0x4E, @x[6], @x[3] - pxor @t[4], @t[0] - pshufd \$0x4E, @x[2], @x[6] - pxor @t[5], @t[1] -___ -$code.=<<___ if (!$inv); - pxor @t[3], @x[4] - pxor @t[7], @x[5] - pxor @t[6], @x[3] - movdqa @t[0], @x[2] - pxor @t[2], @x[6] - movdqa @t[1], @x[7] -___ -$code.=<<___ if ($inv); - pxor @x[4], @t[3] - pxor @t[7], @x[5] - pxor @x[3], @t[6] - movdqa @t[0], @x[3] - pxor @t[2], @x[6] - movdqa @t[6], @x[2] - movdqa @t[1], @x[7] - movdqa @x[6], @x[4] - movdqa @t[3], @x[6] -___ -} - -sub InvMixColumns_orig { -my @x=@_[0..7]; -my @t=@_[8..15]; - -$code.=<<___; - # multiplication by 0x0e - pshufd \$0x93, @x[7], @t[7] - movdqa @x[2], @t[2] - pxor @x[5], @x[7] # 7 5 - pxor @x[5], @x[2] # 2 5 - pshufd \$0x93, @x[0], @t[0] - movdqa @x[5], @t[5] - pxor @x[0], @x[5] # 5 0 [1] - pxor @x[1], @x[0] # 0 1 - pshufd \$0x93, @x[1], @t[1] - pxor @x[2], @x[1] # 1 25 - pxor @x[6], @x[0] # 01 6 [2] - pxor @x[3], @x[1] # 125 3 [4] - pshufd \$0x93, @x[3], @t[3] - pxor @x[0], @x[2] # 25 016 [3] - pxor @x[7], @x[3] # 3 75 - pxor @x[6], @x[7] # 75 6 [0] - pshufd \$0x93, @x[6], @t[6] - movdqa @x[4], @t[4] - pxor @x[4], @x[6] # 6 4 - pxor @x[3], @x[4] # 4 375 [6] - pxor @x[7], @x[3] # 375 756=36 - pxor @t[5], @x[6] # 64 5 [7] - pxor @t[2], @x[3] # 36 2 - pxor @t[4], @x[3] # 362 4 [5] - pshufd \$0x93, @t[5], @t[5] -___ - my @y = @x[7,5,0,2,1,3,4,6]; -$code.=<<___; - # multiplication by 0x0b - pxor @y[0], @y[1] - pxor @t[0], @y[0] - pxor @t[1], @y[1] - pshufd \$0x93, @t[2], @t[2] - pxor @t[5], @y[0] - pxor @t[6], @y[1] - pxor @t[7], @y[0] - pshufd \$0x93, @t[4], @t[4] - pxor @t[6], @t[7] # clobber t[7] - pxor @y[0], @y[1] - - pxor @t[0], @y[3] - pshufd \$0x93, @t[0], @t[0] - pxor @t[1], @y[2] - pxor @t[1], @y[4] - pxor @t[2], @y[2] - pshufd \$0x93, @t[1], @t[1] - pxor @t[2], @y[3] - pxor @t[2], @y[5] - pxor @t[7], @y[2] - pshufd \$0x93, @t[2], @t[2] - pxor @t[3], @y[3] - pxor @t[3], @y[6] - pxor @t[3], @y[4] - pshufd \$0x93, @t[3], @t[3] - pxor @t[4], @y[7] - pxor @t[4], @y[5] - pxor @t[7], @y[7] - pxor @t[5], @y[3] - pxor @t[4], @y[4] - pxor @t[5], @t[7] # clobber t[7] even more - - pxor @t[7], @y[5] - pshufd \$0x93, @t[4], @t[4] - pxor @t[7], @y[6] - pxor @t[7], @y[4] - - pxor @t[5], @t[7] - pshufd \$0x93, @t[5], @t[5] - pxor @t[6], @t[7] # restore t[7] - - # multiplication by 0x0d - pxor @y[7], @y[4] - pxor @t[4], @y[7] - pshufd \$0x93, @t[6], @t[6] - pxor @t[0], @y[2] - pxor @t[5], @y[7] - pxor @t[2], @y[2] - pshufd \$0x93, @t[7], @t[7] - - pxor @y[1], @y[3] - pxor @t[1], @y[1] - pxor @t[0], @y[0] - pxor @t[0], @y[3] - pxor @t[5], @y[1] - pxor @t[5], @y[0] - pxor @t[7], @y[1] - pshufd \$0x93, @t[0], @t[0] - pxor @t[6], @y[0] - pxor @y[1], @y[3] - pxor @t[1], @y[4] - pshufd \$0x93, @t[1], @t[1] - - pxor @t[7], @y[7] - pxor @t[2], @y[4] - pxor @t[2], @y[5] - pshufd \$0x93, @t[2], @t[2] - pxor @t[6], @y[2] - pxor @t[3], @t[6] # clobber t[6] - pxor @y[7], @y[4] - pxor @t[6], @y[3] - - pxor @t[6], @y[6] - pxor @t[5], @y[5] - pxor @t[4], @y[6] - pshufd \$0x93, @t[4], @t[4] - pxor @t[6], @y[5] - pxor @t[7], @y[6] - pxor @t[3], @t[6] # restore t[6] - - pshufd \$0x93, @t[5], @t[5] - pshufd \$0x93, @t[6], @t[6] - pshufd \$0x93, @t[7], @t[7] - pshufd \$0x93, @t[3], @t[3] - - # multiplication by 0x09 - pxor @y[1], @y[4] - pxor @y[1], @t[1] # t[1]=y[1] - pxor @t[5], @t[0] # clobber t[0] - pxor @t[5], @t[1] - pxor @t[0], @y[3] - pxor @y[0], @t[0] # t[0]=y[0] - pxor @t[6], @t[1] - pxor @t[7], @t[6] # clobber t[6] - pxor @t[1], @y[4] - pxor @t[4], @y[7] - pxor @y[4], @t[4] # t[4]=y[4] - pxor @t[3], @y[6] - pxor @y[3], @t[3] # t[3]=y[3] - pxor @t[2], @y[5] - pxor @y[2], @t[2] # t[2]=y[2] - pxor @t[7], @t[3] - pxor @y[5], @t[5] # t[5]=y[5] - pxor @t[6], @t[2] - pxor @t[6], @t[5] - pxor @y[6], @t[6] # t[6]=y[6] - pxor @y[7], @t[7] # t[7]=y[7] - - movdqa @t[0],@XMM[0] - movdqa @t[1],@XMM[1] - movdqa @t[2],@XMM[2] - movdqa @t[3],@XMM[3] - movdqa @t[4],@XMM[4] - movdqa @t[5],@XMM[5] - movdqa @t[6],@XMM[6] - movdqa @t[7],@XMM[7] -___ -} - -sub InvMixColumns { -my @x=@_[0..7]; -my @t=@_[8..15]; - -# Thanks to Jussi Kivilinna for providing pointer to -# -# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | -# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | -# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | -# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | - -$code.=<<___; - # multiplication by 0x05-0x00-0x04-0x00 - pshufd \$0x4E, @x[0], @t[0] - pshufd \$0x4E, @x[6], @t[6] - pxor @x[0], @t[0] - pshufd \$0x4E, @x[7], @t[7] - pxor @x[6], @t[6] - pshufd \$0x4E, @x[1], @t[1] - pxor @x[7], @t[7] - pshufd \$0x4E, @x[2], @t[2] - pxor @x[1], @t[1] - pshufd \$0x4E, @x[3], @t[3] - pxor @x[2], @t[2] - pxor @t[6], @x[0] - pxor @t[6], @x[1] - pshufd \$0x4E, @x[4], @t[4] - pxor @x[3], @t[3] - pxor @t[0], @x[2] - pxor @t[1], @x[3] - pshufd \$0x4E, @x[5], @t[5] - pxor @x[4], @t[4] - pxor @t[7], @x[1] - pxor @t[2], @x[4] - pxor @x[5], @t[5] - - pxor @t[7], @x[2] - pxor @t[6], @x[3] - pxor @t[6], @x[4] - pxor @t[3], @x[5] - pxor @t[4], @x[6] - pxor @t[7], @x[4] - pxor @t[7], @x[5] - pxor @t[5], @x[7] -___ - &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 -} - -sub aesenc { # not used -my @b=@_[0..7]; -my @t=@_[8..15]; -$code.=<<___; - movdqa 0x30($const),@t[0] # .LSR -___ - &ShiftRows (@b,@t[0]); - &Sbox (@b,@t); - &MixColumns (@b[0,1,4,6,3,7,2,5],@t); -} - -sub aesenclast { # not used -my @b=@_[0..7]; -my @t=@_[8..15]; -$code.=<<___; - movdqa 0x40($const),@t[0] # .LSRM0 -___ - &ShiftRows (@b,@t[0]); - &Sbox (@b,@t); -$code.=<<___ - pxor 0x00($key),@b[0] - pxor 0x10($key),@b[1] - pxor 0x20($key),@b[4] - pxor 0x30($key),@b[6] - pxor 0x40($key),@b[3] - pxor 0x50($key),@b[7] - pxor 0x60($key),@b[2] - pxor 0x70($key),@b[5] -___ -} - -sub swapmove { -my ($a,$b,$n,$mask,$t)=@_; -$code.=<<___; - movdqa $b,$t - psrlq \$$n,$b - pxor $a,$b - pand $mask,$b - pxor $b,$a - psllq \$$n,$b - pxor $t,$b -___ -} -sub swapmove2x { -my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; -$code.=<<___; - movdqa $b0,$t0 - psrlq \$$n,$b0 - movdqa $b1,$t1 - psrlq \$$n,$b1 - pxor $a0,$b0 - pxor $a1,$b1 - pand $mask,$b0 - pand $mask,$b1 - pxor $b0,$a0 - psllq \$$n,$b0 - pxor $b1,$a1 - psllq \$$n,$b1 - pxor $t0,$b0 - pxor $t1,$b1 -___ -} - -sub bitslice { -my @x=reverse(@_[0..7]); -my ($t0,$t1,$t2,$t3)=@_[8..11]; -$code.=<<___; - movdqa 0x00($const),$t0 # .LBS0 - movdqa 0x10($const),$t1 # .LBS1 -___ - &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); - &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); -$code.=<<___; - movdqa 0x20($const),$t0 # .LBS2 -___ - &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); - &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); - - &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); - &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); -} - -$code.=<<___; -.text - -.type _bsaes_encrypt8,\@abi-omnipotent -.align 64 -_bsaes_encrypt8: -.cfi_startproc - lea .LBS0(%rip), $const # constants table - - movdqa ($key), @XMM[9] # round 0 key - lea 0x10($key), $key - movdqa 0x50($const), @XMM[8] # .LM0SR - pxor @XMM[9], @XMM[0] # xor with round0 key - pxor @XMM[9], @XMM[1] - pxor @XMM[9], @XMM[2] - pxor @XMM[9], @XMM[3] - pshufb @XMM[8], @XMM[0] - pshufb @XMM[8], @XMM[1] - pxor @XMM[9], @XMM[4] - pxor @XMM[9], @XMM[5] - pshufb @XMM[8], @XMM[2] - pshufb @XMM[8], @XMM[3] - pxor @XMM[9], @XMM[6] - pxor @XMM[9], @XMM[7] - pshufb @XMM[8], @XMM[4] - pshufb @XMM[8], @XMM[5] - pshufb @XMM[8], @XMM[6] - pshufb @XMM[8], @XMM[7] -_bsaes_encrypt8_bitslice: -___ - &bitslice (@XMM[0..7, 8..11]); -$code.=<<___; - dec $rounds - jmp .Lenc_sbox -.align 16 -.Lenc_loop: -___ - &ShiftRows (@XMM[0..7, 8]); -$code.=".Lenc_sbox:\n"; - &Sbox (@XMM[0..7, 8..15]); -$code.=<<___; - dec $rounds - jl .Lenc_done -___ - &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); -$code.=<<___; - movdqa 0x30($const), @XMM[8] # .LSR - jnz .Lenc_loop - movdqa 0x40($const), @XMM[8] # .LSRM0 - jmp .Lenc_loop -.align 16 -.Lenc_done: -___ - # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb - &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); -$code.=<<___; - movdqa ($key), @XMM[8] # last round key - pxor @XMM[8], @XMM[4] - pxor @XMM[8], @XMM[6] - pxor @XMM[8], @XMM[3] - pxor @XMM[8], @XMM[7] - pxor @XMM[8], @XMM[2] - pxor @XMM[8], @XMM[5] - pxor @XMM[8], @XMM[0] - pxor @XMM[8], @XMM[1] - ret -.cfi_endproc -.size _bsaes_encrypt8,.-_bsaes_encrypt8 - -.type _bsaes_decrypt8,\@abi-omnipotent -.align 64 -_bsaes_decrypt8: -.cfi_startproc - lea .LBS0(%rip), $const # constants table - - movdqa ($key), @XMM[9] # round 0 key - lea 0x10($key), $key - movdqa -0x30($const), @XMM[8] # .LM0ISR - pxor @XMM[9], @XMM[0] # xor with round0 key - pxor @XMM[9], @XMM[1] - pxor @XMM[9], @XMM[2] - pxor @XMM[9], @XMM[3] - pshufb @XMM[8], @XMM[0] - pshufb @XMM[8], @XMM[1] - pxor @XMM[9], @XMM[4] - pxor @XMM[9], @XMM[5] - pshufb @XMM[8], @XMM[2] - pshufb @XMM[8], @XMM[3] - pxor @XMM[9], @XMM[6] - pxor @XMM[9], @XMM[7] - pshufb @XMM[8], @XMM[4] - pshufb @XMM[8], @XMM[5] - pshufb @XMM[8], @XMM[6] - pshufb @XMM[8], @XMM[7] -___ - &bitslice (@XMM[0..7, 8..11]); -$code.=<<___; - dec $rounds - jmp .Ldec_sbox -.align 16 -.Ldec_loop: -___ - &ShiftRows (@XMM[0..7, 8]); -$code.=".Ldec_sbox:\n"; - &InvSbox (@XMM[0..7, 8..15]); -$code.=<<___; - dec $rounds - jl .Ldec_done -___ - &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); -$code.=<<___; - movdqa -0x10($const), @XMM[8] # .LISR - jnz .Ldec_loop - movdqa -0x20($const), @XMM[8] # .LISRM0 - jmp .Ldec_loop -.align 16 -.Ldec_done: -___ - &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); -$code.=<<___; - movdqa ($key), @XMM[8] # last round key - pxor @XMM[8], @XMM[6] - pxor @XMM[8], @XMM[4] - pxor @XMM[8], @XMM[2] - pxor @XMM[8], @XMM[7] - pxor @XMM[8], @XMM[3] - pxor @XMM[8], @XMM[5] - pxor @XMM[8], @XMM[0] - pxor @XMM[8], @XMM[1] - ret -.cfi_endproc -.size _bsaes_decrypt8,.-_bsaes_decrypt8 -___ -} -{ -my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); - -sub bitslice_key { -my @x=reverse(@_[0..7]); -my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; - - &swapmove (@x[0,1],1,$bs0,$t2,$t3); -$code.=<<___; - #&swapmove(@x[2,3],1,$t0,$t2,$t3); - movdqa @x[0], @x[2] - movdqa @x[1], @x[3] -___ - #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); - - &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); -$code.=<<___; - #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); - movdqa @x[0], @x[4] - movdqa @x[2], @x[6] - movdqa @x[1], @x[5] - movdqa @x[3], @x[7] -___ - &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); - &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); -} - -$code.=<<___; -.type _bsaes_key_convert,\@abi-omnipotent -.align 16 -_bsaes_key_convert: -.cfi_startproc - lea .Lmasks(%rip), $const - movdqu ($inp), %xmm7 # load round 0 key - lea 0x10($inp), $inp - movdqa 0x00($const), %xmm0 # 0x01... - movdqa 0x10($const), %xmm1 # 0x02... - movdqa 0x20($const), %xmm2 # 0x04... - movdqa 0x30($const), %xmm3 # 0x08... - movdqa 0x40($const), %xmm4 # .LM0 - pcmpeqd %xmm5, %xmm5 # .LNOT - - movdqu ($inp), %xmm6 # load round 1 key - movdqa %xmm7, ($out) # save round 0 key - lea 0x10($out), $out - dec $rounds - jmp .Lkey_loop -.align 16 -.Lkey_loop: - pshufb %xmm4, %xmm6 # .LM0 - - movdqa %xmm0, %xmm8 - movdqa %xmm1, %xmm9 - - pand %xmm6, %xmm8 - pand %xmm6, %xmm9 - movdqa %xmm2, %xmm10 - pcmpeqb %xmm0, %xmm8 - psllq \$4, %xmm0 # 0x10... - movdqa %xmm3, %xmm11 - pcmpeqb %xmm1, %xmm9 - psllq \$4, %xmm1 # 0x20... - - pand %xmm6, %xmm10 - pand %xmm6, %xmm11 - movdqa %xmm0, %xmm12 - pcmpeqb %xmm2, %xmm10 - psllq \$4, %xmm2 # 0x40... - movdqa %xmm1, %xmm13 - pcmpeqb %xmm3, %xmm11 - psllq \$4, %xmm3 # 0x80... - - movdqa %xmm2, %xmm14 - movdqa %xmm3, %xmm15 - pxor %xmm5, %xmm8 # "pnot" - pxor %xmm5, %xmm9 - - pand %xmm6, %xmm12 - pand %xmm6, %xmm13 - movdqa %xmm8, 0x00($out) # write bit-sliced round key - pcmpeqb %xmm0, %xmm12 - psrlq \$4, %xmm0 # 0x01... - movdqa %xmm9, 0x10($out) - pcmpeqb %xmm1, %xmm13 - psrlq \$4, %xmm1 # 0x02... - lea 0x10($inp), $inp - - pand %xmm6, %xmm14 - pand %xmm6, %xmm15 - movdqa %xmm10, 0x20($out) - pcmpeqb %xmm2, %xmm14 - psrlq \$4, %xmm2 # 0x04... - movdqa %xmm11, 0x30($out) - pcmpeqb %xmm3, %xmm15 - psrlq \$4, %xmm3 # 0x08... - movdqu ($inp), %xmm6 # load next round key - - pxor %xmm5, %xmm13 # "pnot" - pxor %xmm5, %xmm14 - movdqa %xmm12, 0x40($out) - movdqa %xmm13, 0x50($out) - movdqa %xmm14, 0x60($out) - movdqa %xmm15, 0x70($out) - lea 0x80($out),$out - dec $rounds - jnz .Lkey_loop - - movdqa 0x50($const), %xmm7 # .L63 - #movdqa %xmm6, ($out) # don't save last round key - ret -.cfi_endproc -.size _bsaes_key_convert,.-_bsaes_key_convert -___ -} - -if (0 && !$win64) { # following four functions are unsupported interface - # used for benchmarking... -$code.=<<___; -.globl bsaes_enc_key_convert -.type bsaes_enc_key_convert,\@function,2 -.align 16 -bsaes_enc_key_convert: - mov 240($inp),%r10d # pass rounds - mov $inp,%rcx # pass key - mov $out,%rax # pass key schedule - call _bsaes_key_convert - pxor %xmm6,%xmm7 # fix up last round key - movdqa %xmm7,(%rax) # save last round key - ret -.size bsaes_enc_key_convert,.-bsaes_enc_key_convert - -.globl bsaes_encrypt_128 -.type bsaes_encrypt_128,\@function,4 -.align 16 -bsaes_encrypt_128: -.Lenc128_loop: - movdqu 0x00($inp), @XMM[0] # load input - movdqu 0x10($inp), @XMM[1] - movdqu 0x20($inp), @XMM[2] - movdqu 0x30($inp), @XMM[3] - movdqu 0x40($inp), @XMM[4] - movdqu 0x50($inp), @XMM[5] - movdqu 0x60($inp), @XMM[6] - movdqu 0x70($inp), @XMM[7] - mov $key, %rax # pass the $key - lea 0x80($inp), $inp - mov \$10,%r10d - - call _bsaes_encrypt8 - - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[4], 0x20($out) - movdqu @XMM[6], 0x30($out) - movdqu @XMM[3], 0x40($out) - movdqu @XMM[7], 0x50($out) - movdqu @XMM[2], 0x60($out) - movdqu @XMM[5], 0x70($out) - lea 0x80($out), $out - sub \$0x80,$len - ja .Lenc128_loop - ret -.size bsaes_encrypt_128,.-bsaes_encrypt_128 - -.globl bsaes_dec_key_convert -.type bsaes_dec_key_convert,\@function,2 -.align 16 -bsaes_dec_key_convert: - mov 240($inp),%r10d # pass rounds - mov $inp,%rcx # pass key - mov $out,%rax # pass key schedule - call _bsaes_key_convert - pxor ($out),%xmm7 # fix up round 0 key - movdqa %xmm6,(%rax) # save last round key - movdqa %xmm7,($out) - ret -.size bsaes_dec_key_convert,.-bsaes_dec_key_convert - -.globl bsaes_decrypt_128 -.type bsaes_decrypt_128,\@function,4 -.align 16 -bsaes_decrypt_128: -.Ldec128_loop: - movdqu 0x00($inp), @XMM[0] # load input - movdqu 0x10($inp), @XMM[1] - movdqu 0x20($inp), @XMM[2] - movdqu 0x30($inp), @XMM[3] - movdqu 0x40($inp), @XMM[4] - movdqu 0x50($inp), @XMM[5] - movdqu 0x60($inp), @XMM[6] - movdqu 0x70($inp), @XMM[7] - mov $key, %rax # pass the $key - lea 0x80($inp), $inp - mov \$10,%r10d - - call _bsaes_decrypt8 - - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[6], 0x20($out) - movdqu @XMM[4], 0x30($out) - movdqu @XMM[2], 0x40($out) - movdqu @XMM[7], 0x50($out) - movdqu @XMM[3], 0x60($out) - movdqu @XMM[5], 0x70($out) - lea 0x80($out), $out - sub \$0x80,$len - ja .Ldec128_loop - ret -.size bsaes_decrypt_128,.-bsaes_decrypt_128 -___ -} -{ -###################################################################### -# -# OpenSSL interface -# -my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d") - : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); -my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); - -if ($ecb) { -$code.=<<___; -.globl bsaes_ecb_encrypt_blocks -.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent -.align 16 -bsaes_ecb_encrypt_blocks: -.cfi_startproc - mov %rsp, %rax -.Lecb_enc_prologue: - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - lea -0x48(%rsp),%rsp -.cfi_adjust_cfa_offset 0x48 -___ -$code.=<<___ if ($win64); - lea -0xa0(%rsp), %rsp - movaps %xmm6, 0x40(%rsp) - movaps %xmm7, 0x50(%rsp) - movaps %xmm8, 0x60(%rsp) - movaps %xmm9, 0x70(%rsp) - movaps %xmm10, 0x80(%rsp) - movaps %xmm11, 0x90(%rsp) - movaps %xmm12, 0xa0(%rsp) - movaps %xmm13, 0xb0(%rsp) - movaps %xmm14, 0xc0(%rsp) - movaps %xmm15, 0xd0(%rsp) -.Lecb_enc_body: -___ -$code.=<<___; - mov %rsp,%rbp # backup %rsp -.cfi_def_cfa_register %rbp - mov 240($arg4),%eax # rounds - mov $arg1,$inp # backup arguments - mov $arg2,$out - mov $arg3,$len - mov $arg4,$key - cmp \$8,$arg3 - jb .Lecb_enc_short - - mov %eax,%ebx # backup rounds - shl \$7,%rax # 128 bytes per inner round key - sub \$`128-32`,%rax # size of bit-sliced key schedule - sub %rax,%rsp - mov %rsp,%rax # pass key schedule - mov $key,%rcx # pass key - mov %ebx,%r10d # pass rounds - call _bsaes_key_convert - pxor %xmm6,%xmm7 # fix up last round key - movdqa %xmm7,(%rax) # save last round key - - sub \$8,$len -.Lecb_enc_loop: - movdqu 0x00($inp), @XMM[0] # load input - movdqu 0x10($inp), @XMM[1] - movdqu 0x20($inp), @XMM[2] - movdqu 0x30($inp), @XMM[3] - movdqu 0x40($inp), @XMM[4] - movdqu 0x50($inp), @XMM[5] - mov %rsp, %rax # pass key schedule - movdqu 0x60($inp), @XMM[6] - mov %ebx,%r10d # pass rounds - movdqu 0x70($inp), @XMM[7] - lea 0x80($inp), $inp - - call _bsaes_encrypt8 - - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[4], 0x20($out) - movdqu @XMM[6], 0x30($out) - movdqu @XMM[3], 0x40($out) - movdqu @XMM[7], 0x50($out) - movdqu @XMM[2], 0x60($out) - movdqu @XMM[5], 0x70($out) - lea 0x80($out), $out - sub \$8,$len - jnc .Lecb_enc_loop - - add \$8,$len - jz .Lecb_enc_done - - movdqu 0x00($inp), @XMM[0] # load input - mov %rsp, %rax # pass key schedule - mov %ebx,%r10d # pass rounds - cmp \$2,$len - jb .Lecb_enc_one - movdqu 0x10($inp), @XMM[1] - je .Lecb_enc_two - movdqu 0x20($inp), @XMM[2] - cmp \$4,$len - jb .Lecb_enc_three - movdqu 0x30($inp), @XMM[3] - je .Lecb_enc_four - movdqu 0x40($inp), @XMM[4] - cmp \$6,$len - jb .Lecb_enc_five - movdqu 0x50($inp), @XMM[5] - je .Lecb_enc_six - movdqu 0x60($inp), @XMM[6] - call _bsaes_encrypt8 - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[4], 0x20($out) - movdqu @XMM[6], 0x30($out) - movdqu @XMM[3], 0x40($out) - movdqu @XMM[7], 0x50($out) - movdqu @XMM[2], 0x60($out) - jmp .Lecb_enc_done -.align 16 -.Lecb_enc_six: - call _bsaes_encrypt8 - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[4], 0x20($out) - movdqu @XMM[6], 0x30($out) - movdqu @XMM[3], 0x40($out) - movdqu @XMM[7], 0x50($out) - jmp .Lecb_enc_done -.align 16 -.Lecb_enc_five: - call _bsaes_encrypt8 - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[4], 0x20($out) - movdqu @XMM[6], 0x30($out) - movdqu @XMM[3], 0x40($out) - jmp .Lecb_enc_done -.align 16 -.Lecb_enc_four: - call _bsaes_encrypt8 - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[4], 0x20($out) - movdqu @XMM[6], 0x30($out) - jmp .Lecb_enc_done -.align 16 -.Lecb_enc_three: - call _bsaes_encrypt8 - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[4], 0x20($out) - jmp .Lecb_enc_done -.align 16 -.Lecb_enc_two: - call _bsaes_encrypt8 - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - jmp .Lecb_enc_done -.align 16 -.Lecb_enc_one: - call _bsaes_encrypt8 - movdqu @XMM[0], 0x00($out) # write output - jmp .Lecb_enc_done -.align 16 -.Lecb_enc_short: - lea ($inp), $arg1 - lea ($out), $arg2 - lea ($key), $arg3 - call aes_nohw_encrypt - lea 16($inp), $inp - lea 16($out), $out - dec $len - jnz .Lecb_enc_short - -.Lecb_enc_done: - lea (%rsp),%rax - pxor %xmm0, %xmm0 -.Lecb_enc_bzero: # wipe key schedule [if any] - movdqa %xmm0, 0x00(%rax) - movdqa %xmm0, 0x10(%rax) - lea 0x20(%rax), %rax - cmp %rax, %rbp - jb .Lecb_enc_bzero - - lea 0x78(%rbp),%rax -.cfi_def_cfa %rax,8 -___ -$code.=<<___ if ($win64); - movaps 0x40(%rbp), %xmm6 - movaps 0x50(%rbp), %xmm7 - movaps 0x60(%rbp), %xmm8 - movaps 0x70(%rbp), %xmm9 - movaps 0x80(%rbp), %xmm10 - movaps 0x90(%rbp), %xmm11 - movaps 0xa0(%rbp), %xmm12 - movaps 0xb0(%rbp), %xmm13 - movaps 0xc0(%rbp), %xmm14 - movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rax), %rax -.Lecb_enc_tail: -___ -$code.=<<___; - mov -48(%rax), %r15 -.cfi_restore %r15 - mov -40(%rax), %r14 -.cfi_restore %r14 - mov -32(%rax), %r13 -.cfi_restore %r13 - mov -24(%rax), %r12 -.cfi_restore %r12 - mov -16(%rax), %rbx -.cfi_restore %rbx - mov -8(%rax), %rbp -.cfi_restore %rbp - lea (%rax), %rsp # restore %rsp -.cfi_def_cfa_register %rsp -.Lecb_enc_epilogue: - ret -.cfi_endproc -.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks - -.globl bsaes_ecb_decrypt_blocks -.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent -.align 16 -bsaes_ecb_decrypt_blocks: -.cfi_startproc - mov %rsp, %rax -.Lecb_dec_prologue: - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - lea -0x48(%rsp),%rsp -.cfi_adjust_cfa_offset 0x48 -___ -$code.=<<___ if ($win64); - lea -0xa0(%rsp), %rsp - movaps %xmm6, 0x40(%rsp) - movaps %xmm7, 0x50(%rsp) - movaps %xmm8, 0x60(%rsp) - movaps %xmm9, 0x70(%rsp) - movaps %xmm10, 0x80(%rsp) - movaps %xmm11, 0x90(%rsp) - movaps %xmm12, 0xa0(%rsp) - movaps %xmm13, 0xb0(%rsp) - movaps %xmm14, 0xc0(%rsp) - movaps %xmm15, 0xd0(%rsp) -.Lecb_dec_body: -___ -$code.=<<___; - mov %rsp,%rbp # backup %rsp -.cfi_def_cfa_register %rbp - mov 240($arg4),%eax # rounds - mov $arg1,$inp # backup arguments - mov $arg2,$out - mov $arg3,$len - mov $arg4,$key - cmp \$8,$arg3 - jb .Lecb_dec_short - - mov %eax,%ebx # backup rounds - shl \$7,%rax # 128 bytes per inner round key - sub \$`128-32`,%rax # size of bit-sliced key schedule - sub %rax,%rsp - mov %rsp,%rax # pass key schedule - mov $key,%rcx # pass key - mov %ebx,%r10d # pass rounds - call _bsaes_key_convert - pxor (%rsp),%xmm7 # fix up 0 round key - movdqa %xmm6,(%rax) # save last round key - movdqa %xmm7,(%rsp) - - sub \$8,$len -.Lecb_dec_loop: - movdqu 0x00($inp), @XMM[0] # load input - movdqu 0x10($inp), @XMM[1] - movdqu 0x20($inp), @XMM[2] - movdqu 0x30($inp), @XMM[3] - movdqu 0x40($inp), @XMM[4] - movdqu 0x50($inp), @XMM[5] - mov %rsp, %rax # pass key schedule - movdqu 0x60($inp), @XMM[6] - mov %ebx,%r10d # pass rounds - movdqu 0x70($inp), @XMM[7] - lea 0x80($inp), $inp - - call _bsaes_decrypt8 - - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[6], 0x20($out) - movdqu @XMM[4], 0x30($out) - movdqu @XMM[2], 0x40($out) - movdqu @XMM[7], 0x50($out) - movdqu @XMM[3], 0x60($out) - movdqu @XMM[5], 0x70($out) - lea 0x80($out), $out - sub \$8,$len - jnc .Lecb_dec_loop - - add \$8,$len - jz .Lecb_dec_done - - movdqu 0x00($inp), @XMM[0] # load input - mov %rsp, %rax # pass key schedule - mov %ebx,%r10d # pass rounds - cmp \$2,$len - jb .Lecb_dec_one - movdqu 0x10($inp), @XMM[1] - je .Lecb_dec_two - movdqu 0x20($inp), @XMM[2] - cmp \$4,$len - jb .Lecb_dec_three - movdqu 0x30($inp), @XMM[3] - je .Lecb_dec_four - movdqu 0x40($inp), @XMM[4] - cmp \$6,$len - jb .Lecb_dec_five - movdqu 0x50($inp), @XMM[5] - je .Lecb_dec_six - movdqu 0x60($inp), @XMM[6] - call _bsaes_decrypt8 - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[6], 0x20($out) - movdqu @XMM[4], 0x30($out) - movdqu @XMM[2], 0x40($out) - movdqu @XMM[7], 0x50($out) - movdqu @XMM[3], 0x60($out) - jmp .Lecb_dec_done -.align 16 -.Lecb_dec_six: - call _bsaes_decrypt8 - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[6], 0x20($out) - movdqu @XMM[4], 0x30($out) - movdqu @XMM[2], 0x40($out) - movdqu @XMM[7], 0x50($out) - jmp .Lecb_dec_done -.align 16 -.Lecb_dec_five: - call _bsaes_decrypt8 - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[6], 0x20($out) - movdqu @XMM[4], 0x30($out) - movdqu @XMM[2], 0x40($out) - jmp .Lecb_dec_done -.align 16 -.Lecb_dec_four: - call _bsaes_decrypt8 - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[6], 0x20($out) - movdqu @XMM[4], 0x30($out) - jmp .Lecb_dec_done -.align 16 -.Lecb_dec_three: - call _bsaes_decrypt8 - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[6], 0x20($out) - jmp .Lecb_dec_done -.align 16 -.Lecb_dec_two: - call _bsaes_decrypt8 - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - jmp .Lecb_dec_done -.align 16 -.Lecb_dec_one: - call _bsaes_decrypt8 - movdqu @XMM[0], 0x00($out) # write output - jmp .Lecb_dec_done -.align 16 -.Lecb_dec_short: - lea ($inp), $arg1 - lea ($out), $arg2 - lea ($key), $arg3 - call aes_nohw_decrypt - lea 16($inp), $inp - lea 16($out), $out - dec $len - jnz .Lecb_dec_short - -.Lecb_dec_done: - lea (%rsp),%rax - pxor %xmm0, %xmm0 -.Lecb_dec_bzero: # wipe key schedule [if any] - movdqa %xmm0, 0x00(%rax) - movdqa %xmm0, 0x10(%rax) - lea 0x20(%rax), %rax - cmp %rax, %rbp - jb .Lecb_dec_bzero - - lea 0x78(%rbp),%rax -.cfi_def_cfa %rax,8 -___ -$code.=<<___ if ($win64); - movaps 0x40(%rbp), %xmm6 - movaps 0x50(%rbp), %xmm7 - movaps 0x60(%rbp), %xmm8 - movaps 0x70(%rbp), %xmm9 - movaps 0x80(%rbp), %xmm10 - movaps 0x90(%rbp), %xmm11 - movaps 0xa0(%rbp), %xmm12 - movaps 0xb0(%rbp), %xmm13 - movaps 0xc0(%rbp), %xmm14 - movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rax), %rax -.Lecb_dec_tail: -___ -$code.=<<___; - mov -48(%rax), %r15 -.cfi_restore %r15 - mov -40(%rax), %r14 -.cfi_restore %r14 - mov -32(%rax), %r13 -.cfi_restore %r13 - mov -24(%rax), %r12 -.cfi_restore %r12 - mov -16(%rax), %rbx -.cfi_restore %rbx - mov -8(%rax), %rbp -.cfi_restore %rbp - lea (%rax), %rsp # restore %rsp -.cfi_def_cfa_register %rsp -.Lecb_dec_epilogue: - ret -.cfi_endproc -.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks -___ -} -$code.=<<___; -.globl bsaes_cbc_encrypt -.type bsaes_cbc_encrypt,\@abi-omnipotent -.align 16 -bsaes_cbc_encrypt: -.cfi_startproc - # In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for - # short inputs or if enc is one. We patch this out, using bsaes for all - # input sizes. The caller is required to ensure enc is zero. - mov %rsp, %rax -.Lcbc_dec_prologue: - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - lea -0x48(%rsp), %rsp -.cfi_adjust_cfa_offset 0x48 -___ -$code.=<<___ if ($win64); - mov 0xa0(%rsp),$arg5 # pull ivp - lea -0xa0(%rsp), %rsp - movaps %xmm6, 0x40(%rsp) - movaps %xmm7, 0x50(%rsp) - movaps %xmm8, 0x60(%rsp) - movaps %xmm9, 0x70(%rsp) - movaps %xmm10, 0x80(%rsp) - movaps %xmm11, 0x90(%rsp) - movaps %xmm12, 0xa0(%rsp) - movaps %xmm13, 0xb0(%rsp) - movaps %xmm14, 0xc0(%rsp) - movaps %xmm15, 0xd0(%rsp) -.Lcbc_dec_body: -___ -$code.=<<___; - mov %rsp, %rbp # backup %rsp -.cfi_def_cfa_register %rbp - mov 240($arg4), %eax # rounds - mov $arg1, $inp # backup arguments - mov $arg2, $out - mov $arg3, $len - mov $arg4, $key - mov $arg5, %rbx - shr \$4, $len # bytes to blocks - - mov %eax, %edx # rounds - shl \$7, %rax # 128 bytes per inner round key - sub \$`128-32`, %rax # size of bit-sliced key schedule - sub %rax, %rsp - - mov %rsp, %rax # pass key schedule - mov $key, %rcx # pass key - mov %edx, %r10d # pass rounds - call _bsaes_key_convert - pxor (%rsp),%xmm7 # fix up 0 round key - movdqa %xmm6,(%rax) # save last round key - movdqa %xmm7,(%rsp) - - movdqu (%rbx), @XMM[15] # load IV - sub \$8,$len - jc .Lcbc_dec_loop_done - -.Lcbc_dec_loop: - movdqu 0x00($inp), @XMM[0] # load input - movdqu 0x10($inp), @XMM[1] - movdqu 0x20($inp), @XMM[2] - movdqu 0x30($inp), @XMM[3] - movdqu 0x40($inp), @XMM[4] - movdqu 0x50($inp), @XMM[5] - mov %rsp, %rax # pass key schedule - movdqu 0x60($inp), @XMM[6] - mov %edx,%r10d # pass rounds - movdqu 0x70($inp), @XMM[7] - movdqa @XMM[15], 0x20(%rbp) # put aside IV - - call _bsaes_decrypt8 - - pxor 0x20(%rbp), @XMM[0] # ^= IV - movdqu 0x00($inp), @XMM[8] # re-load input - movdqu 0x10($inp), @XMM[9] - pxor @XMM[8], @XMM[1] - movdqu 0x20($inp), @XMM[10] - pxor @XMM[9], @XMM[6] - movdqu 0x30($inp), @XMM[11] - pxor @XMM[10], @XMM[4] - movdqu 0x40($inp), @XMM[12] - pxor @XMM[11], @XMM[2] - movdqu 0x50($inp), @XMM[13] - pxor @XMM[12], @XMM[7] - movdqu 0x60($inp), @XMM[14] - pxor @XMM[13], @XMM[3] - movdqu 0x70($inp), @XMM[15] # IV - pxor @XMM[14], @XMM[5] - movdqu @XMM[0], 0x00($out) # write output - lea 0x80($inp), $inp - movdqu @XMM[1], 0x10($out) - movdqu @XMM[6], 0x20($out) - movdqu @XMM[4], 0x30($out) - movdqu @XMM[2], 0x40($out) - movdqu @XMM[7], 0x50($out) - movdqu @XMM[3], 0x60($out) - movdqu @XMM[5], 0x70($out) - lea 0x80($out), $out - sub \$8,$len - jnc .Lcbc_dec_loop - -.Lcbc_dec_loop_done: - add \$8,$len - jz .Lcbc_dec_done - - movdqu 0x00($inp), @XMM[0] # load input - mov %rsp, %rax # pass key schedule - mov %edx, %r10d # pass rounds - cmp \$2,$len - jb .Lcbc_dec_one - movdqu 0x10($inp), @XMM[1] - je .Lcbc_dec_two - movdqu 0x20($inp), @XMM[2] - cmp \$4,$len - jb .Lcbc_dec_three - movdqu 0x30($inp), @XMM[3] - je .Lcbc_dec_four - movdqu 0x40($inp), @XMM[4] - cmp \$6,$len - jb .Lcbc_dec_five - movdqu 0x50($inp), @XMM[5] - je .Lcbc_dec_six - movdqu 0x60($inp), @XMM[6] - movdqa @XMM[15], 0x20(%rbp) # put aside IV - call _bsaes_decrypt8 - pxor 0x20(%rbp), @XMM[0] # ^= IV - movdqu 0x00($inp), @XMM[8] # re-load input - movdqu 0x10($inp), @XMM[9] - pxor @XMM[8], @XMM[1] - movdqu 0x20($inp), @XMM[10] - pxor @XMM[9], @XMM[6] - movdqu 0x30($inp), @XMM[11] - pxor @XMM[10], @XMM[4] - movdqu 0x40($inp), @XMM[12] - pxor @XMM[11], @XMM[2] - movdqu 0x50($inp), @XMM[13] - pxor @XMM[12], @XMM[7] - movdqu 0x60($inp), @XMM[15] # IV - pxor @XMM[13], @XMM[3] - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[6], 0x20($out) - movdqu @XMM[4], 0x30($out) - movdqu @XMM[2], 0x40($out) - movdqu @XMM[7], 0x50($out) - movdqu @XMM[3], 0x60($out) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_six: - movdqa @XMM[15], 0x20(%rbp) # put aside IV - call _bsaes_decrypt8 - pxor 0x20(%rbp), @XMM[0] # ^= IV - movdqu 0x00($inp), @XMM[8] # re-load input - movdqu 0x10($inp), @XMM[9] - pxor @XMM[8], @XMM[1] - movdqu 0x20($inp), @XMM[10] - pxor @XMM[9], @XMM[6] - movdqu 0x30($inp), @XMM[11] - pxor @XMM[10], @XMM[4] - movdqu 0x40($inp), @XMM[12] - pxor @XMM[11], @XMM[2] - movdqu 0x50($inp), @XMM[15] # IV - pxor @XMM[12], @XMM[7] - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[6], 0x20($out) - movdqu @XMM[4], 0x30($out) - movdqu @XMM[2], 0x40($out) - movdqu @XMM[7], 0x50($out) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_five: - movdqa @XMM[15], 0x20(%rbp) # put aside IV - call _bsaes_decrypt8 - pxor 0x20(%rbp), @XMM[0] # ^= IV - movdqu 0x00($inp), @XMM[8] # re-load input - movdqu 0x10($inp), @XMM[9] - pxor @XMM[8], @XMM[1] - movdqu 0x20($inp), @XMM[10] - pxor @XMM[9], @XMM[6] - movdqu 0x30($inp), @XMM[11] - pxor @XMM[10], @XMM[4] - movdqu 0x40($inp), @XMM[15] # IV - pxor @XMM[11], @XMM[2] - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[6], 0x20($out) - movdqu @XMM[4], 0x30($out) - movdqu @XMM[2], 0x40($out) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_four: - movdqa @XMM[15], 0x20(%rbp) # put aside IV - call _bsaes_decrypt8 - pxor 0x20(%rbp), @XMM[0] # ^= IV - movdqu 0x00($inp), @XMM[8] # re-load input - movdqu 0x10($inp), @XMM[9] - pxor @XMM[8], @XMM[1] - movdqu 0x20($inp), @XMM[10] - pxor @XMM[9], @XMM[6] - movdqu 0x30($inp), @XMM[15] # IV - pxor @XMM[10], @XMM[4] - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[6], 0x20($out) - movdqu @XMM[4], 0x30($out) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_three: - movdqa @XMM[15], 0x20(%rbp) # put aside IV - call _bsaes_decrypt8 - pxor 0x20(%rbp), @XMM[0] # ^= IV - movdqu 0x00($inp), @XMM[8] # re-load input - movdqu 0x10($inp), @XMM[9] - pxor @XMM[8], @XMM[1] - movdqu 0x20($inp), @XMM[15] # IV - pxor @XMM[9], @XMM[6] - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - movdqu @XMM[6], 0x20($out) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_two: - movdqa @XMM[15], 0x20(%rbp) # put aside IV - call _bsaes_decrypt8 - pxor 0x20(%rbp), @XMM[0] # ^= IV - movdqu 0x00($inp), @XMM[8] # re-load input - movdqu 0x10($inp), @XMM[15] # IV - pxor @XMM[8], @XMM[1] - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_one: - movdqa @XMM[15], 0x20(%rbp) # put aside IV - call _bsaes_decrypt8 - pxor 0x20(%rbp), @XMM[0] # ^= IV - movdqu 0x00($inp), @XMM[15] # IV - movdqu @XMM[0], 0x00($out) # write output - jmp .Lcbc_dec_done - -.Lcbc_dec_done: - movdqu @XMM[15], (%rbx) # return IV - lea (%rsp), %rax - pxor %xmm0, %xmm0 -.Lcbc_dec_bzero: # wipe key schedule [if any] - movdqa %xmm0, 0x00(%rax) - movdqa %xmm0, 0x10(%rax) - lea 0x20(%rax), %rax - cmp %rax, %rbp - ja .Lcbc_dec_bzero - - lea 0x78(%rbp),%rax -.cfi_def_cfa %rax,8 -___ -$code.=<<___ if ($win64); - movaps 0x40(%rbp), %xmm6 - movaps 0x50(%rbp), %xmm7 - movaps 0x60(%rbp), %xmm8 - movaps 0x70(%rbp), %xmm9 - movaps 0x80(%rbp), %xmm10 - movaps 0x90(%rbp), %xmm11 - movaps 0xa0(%rbp), %xmm12 - movaps 0xb0(%rbp), %xmm13 - movaps 0xc0(%rbp), %xmm14 - movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rax), %rax -.Lcbc_dec_tail: -___ -$code.=<<___; - mov -48(%rax), %r15 -.cfi_restore %r15 - mov -40(%rax), %r14 -.cfi_restore %r14 - mov -32(%rax), %r13 -.cfi_restore %r13 - mov -24(%rax), %r12 -.cfi_restore %r12 - mov -16(%rax), %rbx -.cfi_restore %rbx - mov -8(%rax), %rbp -.cfi_restore %rbp - lea (%rax), %rsp # restore %rsp -.cfi_def_cfa_register %rsp -.Lcbc_dec_epilogue: - ret -.cfi_endproc -.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt - -.globl bsaes_ctr32_encrypt_blocks -.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent -.align 16 -bsaes_ctr32_encrypt_blocks: -.cfi_startproc -#ifndef NDEBUG -#ifndef BORINGSSL_FIPS -.extern BORINGSSL_function_hit - movb \$1, BORINGSSL_function_hit+6(%rip) -#endif -#endif - mov %rsp, %rax -.Lctr_enc_prologue: - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - lea -0x48(%rsp), %rsp -.cfi_adjust_cfa_offset 0x48 -___ -$code.=<<___ if ($win64); - mov 0xa0(%rsp),$arg5 # pull ivp - lea -0xa0(%rsp), %rsp - movaps %xmm6, 0x40(%rsp) - movaps %xmm7, 0x50(%rsp) - movaps %xmm8, 0x60(%rsp) - movaps %xmm9, 0x70(%rsp) - movaps %xmm10, 0x80(%rsp) - movaps %xmm11, 0x90(%rsp) - movaps %xmm12, 0xa0(%rsp) - movaps %xmm13, 0xb0(%rsp) - movaps %xmm14, 0xc0(%rsp) - movaps %xmm15, 0xd0(%rsp) -.Lctr_enc_body: -___ -$code.=<<___; - mov %rsp, %rbp # backup %rsp -.cfi_def_cfa_register %rbp - movdqu ($arg5), %xmm0 # load counter - mov 240($arg4), %eax # rounds - mov $arg1, $inp # backup arguments - mov $arg2, $out - mov $arg3, $len - mov $arg4, $key - movdqa %xmm0, 0x20(%rbp) # copy counter - # In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this - # out to retain a constant-time implementation. - - mov %eax, %ebx # rounds - shl \$7, %rax # 128 bytes per inner round key - sub \$`128-32`, %rax # size of bit-sliced key schedule - sub %rax, %rsp - - mov %rsp, %rax # pass key schedule - mov $key, %rcx # pass key - mov %ebx, %r10d # pass rounds - call _bsaes_key_convert - pxor %xmm6,%xmm7 # fix up last round key - movdqa %xmm7,(%rax) # save last round key - - movdqa (%rsp), @XMM[9] # load round0 key - lea .LADD1(%rip), %r11 - movdqa 0x20(%rbp), @XMM[0] # counter copy - movdqa -0x20(%r11), @XMM[8] # .LSWPUP - pshufb @XMM[8], @XMM[9] # byte swap upper part - pshufb @XMM[8], @XMM[0] - movdqa @XMM[9], (%rsp) # save adjusted round0 key - jmp .Lctr_enc_loop -.align 16 -.Lctr_enc_loop: - movdqa @XMM[0], 0x20(%rbp) # save counter - movdqa @XMM[0], @XMM[1] # prepare 8 counter values - movdqa @XMM[0], @XMM[2] - paddd 0x00(%r11), @XMM[1] # .LADD1 - movdqa @XMM[0], @XMM[3] - paddd 0x10(%r11), @XMM[2] # .LADD2 - movdqa @XMM[0], @XMM[4] - paddd 0x20(%r11), @XMM[3] # .LADD3 - movdqa @XMM[0], @XMM[5] - paddd 0x30(%r11), @XMM[4] # .LADD4 - movdqa @XMM[0], @XMM[6] - paddd 0x40(%r11), @XMM[5] # .LADD5 - movdqa @XMM[0], @XMM[7] - paddd 0x50(%r11), @XMM[6] # .LADD6 - paddd 0x60(%r11), @XMM[7] # .LADD7 - - # Borrow prologue from _bsaes_encrypt8 to use the opportunity - # to flip byte order in 32-bit counter - movdqa (%rsp), @XMM[9] # round 0 key - lea 0x10(%rsp), %rax # pass key schedule - movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR - pxor @XMM[9], @XMM[0] # xor with round0 key - pxor @XMM[9], @XMM[1] - pxor @XMM[9], @XMM[2] - pxor @XMM[9], @XMM[3] - pshufb @XMM[8], @XMM[0] - pshufb @XMM[8], @XMM[1] - pxor @XMM[9], @XMM[4] - pxor @XMM[9], @XMM[5] - pshufb @XMM[8], @XMM[2] - pshufb @XMM[8], @XMM[3] - pxor @XMM[9], @XMM[6] - pxor @XMM[9], @XMM[7] - pshufb @XMM[8], @XMM[4] - pshufb @XMM[8], @XMM[5] - pshufb @XMM[8], @XMM[6] - pshufb @XMM[8], @XMM[7] - lea .LBS0(%rip), %r11 # constants table - mov %ebx,%r10d # pass rounds - - call _bsaes_encrypt8_bitslice - - sub \$8,$len - jc .Lctr_enc_loop_done - - movdqu 0x00($inp), @XMM[8] # load input - movdqu 0x10($inp), @XMM[9] - movdqu 0x20($inp), @XMM[10] - movdqu 0x30($inp), @XMM[11] - movdqu 0x40($inp), @XMM[12] - movdqu 0x50($inp), @XMM[13] - movdqu 0x60($inp), @XMM[14] - movdqu 0x70($inp), @XMM[15] - lea 0x80($inp),$inp - pxor @XMM[0], @XMM[8] - movdqa 0x20(%rbp), @XMM[0] # load counter - pxor @XMM[9], @XMM[1] - movdqu @XMM[8], 0x00($out) # write output - pxor @XMM[10], @XMM[4] - movdqu @XMM[1], 0x10($out) - pxor @XMM[11], @XMM[6] - movdqu @XMM[4], 0x20($out) - pxor @XMM[12], @XMM[3] - movdqu @XMM[6], 0x30($out) - pxor @XMM[13], @XMM[7] - movdqu @XMM[3], 0x40($out) - pxor @XMM[14], @XMM[2] - movdqu @XMM[7], 0x50($out) - pxor @XMM[15], @XMM[5] - movdqu @XMM[2], 0x60($out) - lea .LADD1(%rip), %r11 - movdqu @XMM[5], 0x70($out) - lea 0x80($out), $out - paddd 0x70(%r11), @XMM[0] # .LADD8 - jnz .Lctr_enc_loop - - jmp .Lctr_enc_done -.align 16 -.Lctr_enc_loop_done: - add \$8, $len - movdqu 0x00($inp), @XMM[8] # load input - pxor @XMM[8], @XMM[0] - movdqu @XMM[0], 0x00($out) # write output - cmp \$2,$len - jb .Lctr_enc_done - movdqu 0x10($inp), @XMM[9] - pxor @XMM[9], @XMM[1] - movdqu @XMM[1], 0x10($out) - je .Lctr_enc_done - movdqu 0x20($inp), @XMM[10] - pxor @XMM[10], @XMM[4] - movdqu @XMM[4], 0x20($out) - cmp \$4,$len - jb .Lctr_enc_done - movdqu 0x30($inp), @XMM[11] - pxor @XMM[11], @XMM[6] - movdqu @XMM[6], 0x30($out) - je .Lctr_enc_done - movdqu 0x40($inp), @XMM[12] - pxor @XMM[12], @XMM[3] - movdqu @XMM[3], 0x40($out) - cmp \$6,$len - jb .Lctr_enc_done - movdqu 0x50($inp), @XMM[13] - pxor @XMM[13], @XMM[7] - movdqu @XMM[7], 0x50($out) - je .Lctr_enc_done - movdqu 0x60($inp), @XMM[14] - pxor @XMM[14], @XMM[2] - movdqu @XMM[2], 0x60($out) - - # OpenSSL contains aes_nohw_* fallback code here. We patch this - # out to retain a constant-time implementation. -.Lctr_enc_done: - lea (%rsp), %rax - pxor %xmm0, %xmm0 -.Lctr_enc_bzero: # wipe key schedule [if any] - movdqa %xmm0, 0x00(%rax) - movdqa %xmm0, 0x10(%rax) - lea 0x20(%rax), %rax - cmp %rax, %rbp - ja .Lctr_enc_bzero - - lea 0x78(%rbp),%rax -.cfi_def_cfa %rax,8 -___ -$code.=<<___ if ($win64); - movaps 0x40(%rbp), %xmm6 - movaps 0x50(%rbp), %xmm7 - movaps 0x60(%rbp), %xmm8 - movaps 0x70(%rbp), %xmm9 - movaps 0x80(%rbp), %xmm10 - movaps 0x90(%rbp), %xmm11 - movaps 0xa0(%rbp), %xmm12 - movaps 0xb0(%rbp), %xmm13 - movaps 0xc0(%rbp), %xmm14 - movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rax), %rax -.Lctr_enc_tail: -___ -$code.=<<___; - mov -48(%rax), %r15 -.cfi_restore %r15 - mov -40(%rax), %r14 -.cfi_restore %r14 - mov -32(%rax), %r13 -.cfi_restore %r13 - mov -24(%rax), %r12 -.cfi_restore %r12 - mov -16(%rax), %rbx -.cfi_restore %rbx - mov -8(%rax), %rbp -.cfi_restore %rbp - lea (%rax), %rsp # restore %rsp -.cfi_def_cfa_register %rsp -.Lctr_enc_epilogue: - ret -.cfi_endproc -.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks -___ -###################################################################### -# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, -# const AES_KEY *key1, const AES_KEY *key2, -# const unsigned char iv[16]); -# -# We patch out the XTS implementation in BoringSSL. -if ($xts) { -my ($twmask,$twres,$twtmp)=@XMM[13..15]; -$arg6=~s/d$//; - -$code.=<<___; -.globl bsaes_xts_encrypt -.type bsaes_xts_encrypt,\@abi-omnipotent -.align 16 -bsaes_xts_encrypt: -.cfi_startproc - mov %rsp, %rax -.Lxts_enc_prologue: - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - lea -0x48(%rsp), %rsp -.cfi_adjust_cfa_offset 0x48 -___ -$code.=<<___ if ($win64); - mov 0xa0(%rsp),$arg5 # pull key2 - mov 0xa8(%rsp),$arg6 # pull ivp - lea -0xa0(%rsp), %rsp - movaps %xmm6, 0x40(%rsp) - movaps %xmm7, 0x50(%rsp) - movaps %xmm8, 0x60(%rsp) - movaps %xmm9, 0x70(%rsp) - movaps %xmm10, 0x80(%rsp) - movaps %xmm11, 0x90(%rsp) - movaps %xmm12, 0xa0(%rsp) - movaps %xmm13, 0xb0(%rsp) - movaps %xmm14, 0xc0(%rsp) - movaps %xmm15, 0xd0(%rsp) -.Lxts_enc_body: -___ -$code.=<<___; - mov %rsp, %rbp # backup %rsp -.cfi_def_cfa_register %rbp - mov $arg1, $inp # backup arguments - mov $arg2, $out - mov $arg3, $len - mov $arg4, $key - - lea ($arg6), $arg1 - lea 0x20(%rbp), $arg2 - lea ($arg5), $arg3 - call aes_nohw_encrypt # generate initial tweak - - mov 240($key), %eax # rounds - mov $len, %rbx # backup $len - - mov %eax, %edx # rounds - shl \$7, %rax # 128 bytes per inner round key - sub \$`128-32`, %rax # size of bit-sliced key schedule - sub %rax, %rsp - - mov %rsp, %rax # pass key schedule - mov $key, %rcx # pass key - mov %edx, %r10d # pass rounds - call _bsaes_key_convert - pxor %xmm6, %xmm7 # fix up last round key - movdqa %xmm7, (%rax) # save last round key - - and \$-16, $len - sub \$0x80, %rsp # place for tweak[8] - movdqa 0x20(%rbp), @XMM[7] # initial tweak - - pxor $twtmp, $twtmp - movdqa .Lxts_magic(%rip), $twmask - pcmpgtd @XMM[7], $twtmp # broadcast upper bits - - sub \$0x80, $len - jc .Lxts_enc_short - jmp .Lxts_enc_loop - -.align 16 -.Lxts_enc_loop: -___ - for ($i=0;$i<7;$i++) { - $code.=<<___; - pshufd \$0x13, $twtmp, $twres - pxor $twtmp, $twtmp - movdqa @XMM[7], @XMM[$i] - movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] - paddq @XMM[7], @XMM[7] # psllq 1,$tweak - pand $twmask, $twres # isolate carry and residue - pcmpgtd @XMM[7], $twtmp # broadcast upper bits - pxor $twres, @XMM[7] -___ - $code.=<<___ if ($i>=1); - movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] -___ - $code.=<<___ if ($i>=2); - pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] -___ - } -$code.=<<___; - movdqu 0x60($inp), @XMM[8+6] - pxor @XMM[8+5], @XMM[5] - movdqu 0x70($inp), @XMM[8+7] - lea 0x80($inp), $inp - movdqa @XMM[7], 0x70(%rsp) - pxor @XMM[8+6], @XMM[6] - lea 0x80(%rsp), %rax # pass key schedule - pxor @XMM[8+7], @XMM[7] - mov %edx, %r10d # pass rounds - - call _bsaes_encrypt8 - - pxor 0x00(%rsp), @XMM[0] # ^= tweak[] - pxor 0x10(%rsp), @XMM[1] - movdqu @XMM[0], 0x00($out) # write output - pxor 0x20(%rsp), @XMM[4] - movdqu @XMM[1], 0x10($out) - pxor 0x30(%rsp), @XMM[6] - movdqu @XMM[4], 0x20($out) - pxor 0x40(%rsp), @XMM[3] - movdqu @XMM[6], 0x30($out) - pxor 0x50(%rsp), @XMM[7] - movdqu @XMM[3], 0x40($out) - pxor 0x60(%rsp), @XMM[2] - movdqu @XMM[7], 0x50($out) - pxor 0x70(%rsp), @XMM[5] - movdqu @XMM[2], 0x60($out) - movdqu @XMM[5], 0x70($out) - lea 0x80($out), $out - - movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak - pxor $twtmp, $twtmp - movdqa .Lxts_magic(%rip), $twmask - pcmpgtd @XMM[7], $twtmp - pshufd \$0x13, $twtmp, $twres - pxor $twtmp, $twtmp - paddq @XMM[7], @XMM[7] # psllq 1,$tweak - pand $twmask, $twres # isolate carry and residue - pcmpgtd @XMM[7], $twtmp # broadcast upper bits - pxor $twres, @XMM[7] - - sub \$0x80,$len - jnc .Lxts_enc_loop - -.Lxts_enc_short: - add \$0x80, $len - jz .Lxts_enc_done -___ - for ($i=0;$i<7;$i++) { - $code.=<<___; - pshufd \$0x13, $twtmp, $twres - pxor $twtmp, $twtmp - movdqa @XMM[7], @XMM[$i] - movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] - paddq @XMM[7], @XMM[7] # psllq 1,$tweak - pand $twmask, $twres # isolate carry and residue - pcmpgtd @XMM[7], $twtmp # broadcast upper bits - pxor $twres, @XMM[7] -___ - $code.=<<___ if ($i>=1); - movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] - cmp \$`0x10*$i`,$len - je .Lxts_enc_$i -___ - $code.=<<___ if ($i>=2); - pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] -___ - } -$code.=<<___; - movdqu 0x60($inp), @XMM[8+6] - pxor @XMM[8+5], @XMM[5] - movdqa @XMM[7], 0x70(%rsp) - lea 0x70($inp), $inp - pxor @XMM[8+6], @XMM[6] - lea 0x80(%rsp), %rax # pass key schedule - mov %edx, %r10d # pass rounds - - call _bsaes_encrypt8 - - pxor 0x00(%rsp), @XMM[0] # ^= tweak[] - pxor 0x10(%rsp), @XMM[1] - movdqu @XMM[0], 0x00($out) # write output - pxor 0x20(%rsp), @XMM[4] - movdqu @XMM[1], 0x10($out) - pxor 0x30(%rsp), @XMM[6] - movdqu @XMM[4], 0x20($out) - pxor 0x40(%rsp), @XMM[3] - movdqu @XMM[6], 0x30($out) - pxor 0x50(%rsp), @XMM[7] - movdqu @XMM[3], 0x40($out) - pxor 0x60(%rsp), @XMM[2] - movdqu @XMM[7], 0x50($out) - movdqu @XMM[2], 0x60($out) - lea 0x70($out), $out - - movdqa 0x70(%rsp), @XMM[7] # next iteration tweak - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_6: - pxor @XMM[8+4], @XMM[4] - lea 0x60($inp), $inp - pxor @XMM[8+5], @XMM[5] - lea 0x80(%rsp), %rax # pass key schedule - mov %edx, %r10d # pass rounds - - call _bsaes_encrypt8 - - pxor 0x00(%rsp), @XMM[0] # ^= tweak[] - pxor 0x10(%rsp), @XMM[1] - movdqu @XMM[0], 0x00($out) # write output - pxor 0x20(%rsp), @XMM[4] - movdqu @XMM[1], 0x10($out) - pxor 0x30(%rsp), @XMM[6] - movdqu @XMM[4], 0x20($out) - pxor 0x40(%rsp), @XMM[3] - movdqu @XMM[6], 0x30($out) - pxor 0x50(%rsp), @XMM[7] - movdqu @XMM[3], 0x40($out) - movdqu @XMM[7], 0x50($out) - lea 0x60($out), $out - - movdqa 0x60(%rsp), @XMM[7] # next iteration tweak - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_5: - pxor @XMM[8+3], @XMM[3] - lea 0x50($inp), $inp - pxor @XMM[8+4], @XMM[4] - lea 0x80(%rsp), %rax # pass key schedule - mov %edx, %r10d # pass rounds - - call _bsaes_encrypt8 - - pxor 0x00(%rsp), @XMM[0] # ^= tweak[] - pxor 0x10(%rsp), @XMM[1] - movdqu @XMM[0], 0x00($out) # write output - pxor 0x20(%rsp), @XMM[4] - movdqu @XMM[1], 0x10($out) - pxor 0x30(%rsp), @XMM[6] - movdqu @XMM[4], 0x20($out) - pxor 0x40(%rsp), @XMM[3] - movdqu @XMM[6], 0x30($out) - movdqu @XMM[3], 0x40($out) - lea 0x50($out), $out - - movdqa 0x50(%rsp), @XMM[7] # next iteration tweak - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_4: - pxor @XMM[8+2], @XMM[2] - lea 0x40($inp), $inp - pxor @XMM[8+3], @XMM[3] - lea 0x80(%rsp), %rax # pass key schedule - mov %edx, %r10d # pass rounds - - call _bsaes_encrypt8 - - pxor 0x00(%rsp), @XMM[0] # ^= tweak[] - pxor 0x10(%rsp), @XMM[1] - movdqu @XMM[0], 0x00($out) # write output - pxor 0x20(%rsp), @XMM[4] - movdqu @XMM[1], 0x10($out) - pxor 0x30(%rsp), @XMM[6] - movdqu @XMM[4], 0x20($out) - movdqu @XMM[6], 0x30($out) - lea 0x40($out), $out - - movdqa 0x40(%rsp), @XMM[7] # next iteration tweak - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_3: - pxor @XMM[8+1], @XMM[1] - lea 0x30($inp), $inp - pxor @XMM[8+2], @XMM[2] - lea 0x80(%rsp), %rax # pass key schedule - mov %edx, %r10d # pass rounds - - call _bsaes_encrypt8 - - pxor 0x00(%rsp), @XMM[0] # ^= tweak[] - pxor 0x10(%rsp), @XMM[1] - movdqu @XMM[0], 0x00($out) # write output - pxor 0x20(%rsp), @XMM[4] - movdqu @XMM[1], 0x10($out) - movdqu @XMM[4], 0x20($out) - lea 0x30($out), $out - - movdqa 0x30(%rsp), @XMM[7] # next iteration tweak - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_2: - pxor @XMM[8+0], @XMM[0] - lea 0x20($inp), $inp - pxor @XMM[8+1], @XMM[1] - lea 0x80(%rsp), %rax # pass key schedule - mov %edx, %r10d # pass rounds - - call _bsaes_encrypt8 - - pxor 0x00(%rsp), @XMM[0] # ^= tweak[] - pxor 0x10(%rsp), @XMM[1] - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - lea 0x20($out), $out - - movdqa 0x20(%rsp), @XMM[7] # next iteration tweak - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_1: - pxor @XMM[0], @XMM[8] - lea 0x10($inp), $inp - movdqa @XMM[8], 0x20(%rbp) - lea 0x20(%rbp), $arg1 - lea 0x20(%rbp), $arg2 - lea ($key), $arg3 - call aes_nohw_encrypt # doesn't touch %xmm - pxor 0x20(%rbp), @XMM[0] # ^= tweak[] - #pxor @XMM[8], @XMM[0] - #lea 0x80(%rsp), %rax # pass key schedule - #mov %edx, %r10d # pass rounds - #call _bsaes_encrypt8 - #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] - movdqu @XMM[0], 0x00($out) # write output - lea 0x10($out), $out - - movdqa 0x10(%rsp), @XMM[7] # next iteration tweak - -.Lxts_enc_done: - and \$15, %ebx - jz .Lxts_enc_ret - mov $out, %rdx - -.Lxts_enc_steal: - movzb ($inp), %eax - movzb -16(%rdx), %ecx - lea 1($inp), $inp - mov %al, -16(%rdx) - mov %cl, 0(%rdx) - lea 1(%rdx), %rdx - sub \$1,%ebx - jnz .Lxts_enc_steal - - movdqu -16($out), @XMM[0] - lea 0x20(%rbp), $arg1 - pxor @XMM[7], @XMM[0] - lea 0x20(%rbp), $arg2 - movdqa @XMM[0], 0x20(%rbp) - lea ($key), $arg3 - call aes_nohw_encrypt # doesn't touch %xmm - pxor 0x20(%rbp), @XMM[7] - movdqu @XMM[7], -16($out) - -.Lxts_enc_ret: - lea (%rsp), %rax - pxor %xmm0, %xmm0 -.Lxts_enc_bzero: # wipe key schedule [if any] - movdqa %xmm0, 0x00(%rax) - movdqa %xmm0, 0x10(%rax) - lea 0x20(%rax), %rax - cmp %rax, %rbp - ja .Lxts_enc_bzero - - lea 0x78(%rbp),%rax -.cfi_def_cfa %rax,8 -___ -$code.=<<___ if ($win64); - movaps 0x40(%rbp), %xmm6 - movaps 0x50(%rbp), %xmm7 - movaps 0x60(%rbp), %xmm8 - movaps 0x70(%rbp), %xmm9 - movaps 0x80(%rbp), %xmm10 - movaps 0x90(%rbp), %xmm11 - movaps 0xa0(%rbp), %xmm12 - movaps 0xb0(%rbp), %xmm13 - movaps 0xc0(%rbp), %xmm14 - movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rax), %rax -.Lxts_enc_tail: -___ -$code.=<<___; - mov -48(%rax), %r15 -.cfi_restore %r15 - mov -40(%rax), %r14 -.cfi_restore %r14 - mov -32(%rax), %r13 -.cfi_restore %r13 - mov -24(%rax), %r12 -.cfi_restore %r12 - mov -16(%rax), %rbx -.cfi_restore %rbx - mov -8(%rax), %rbp -.cfi_restore %rbp - lea (%rax), %rsp # restore %rsp -.cfi_def_cfa_register %rsp -.Lxts_enc_epilogue: - ret -.cfi_endproc -.size bsaes_xts_encrypt,.-bsaes_xts_encrypt - -.globl bsaes_xts_decrypt -.type bsaes_xts_decrypt,\@abi-omnipotent -.align 16 -bsaes_xts_decrypt: -.cfi_startproc - mov %rsp, %rax -.Lxts_dec_prologue: - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - lea -0x48(%rsp), %rsp -.cfi_adjust_cfa_offset 0x48 -___ -$code.=<<___ if ($win64); - mov 0xa0(%rsp),$arg5 # pull key2 - mov 0xa8(%rsp),$arg6 # pull ivp - lea -0xa0(%rsp), %rsp - movaps %xmm6, 0x40(%rsp) - movaps %xmm7, 0x50(%rsp) - movaps %xmm8, 0x60(%rsp) - movaps %xmm9, 0x70(%rsp) - movaps %xmm10, 0x80(%rsp) - movaps %xmm11, 0x90(%rsp) - movaps %xmm12, 0xa0(%rsp) - movaps %xmm13, 0xb0(%rsp) - movaps %xmm14, 0xc0(%rsp) - movaps %xmm15, 0xd0(%rsp) -.Lxts_dec_body: -___ -$code.=<<___; - mov %rsp, %rbp # backup %rsp - mov $arg1, $inp # backup arguments - mov $arg2, $out - mov $arg3, $len - mov $arg4, $key - - lea ($arg6), $arg1 - lea 0x20(%rbp), $arg2 - lea ($arg5), $arg3 - call aes_nohw_encrypt # generate initial tweak - - mov 240($key), %eax # rounds - mov $len, %rbx # backup $len - - mov %eax, %edx # rounds - shl \$7, %rax # 128 bytes per inner round key - sub \$`128-32`, %rax # size of bit-sliced key schedule - sub %rax, %rsp - - mov %rsp, %rax # pass key schedule - mov $key, %rcx # pass key - mov %edx, %r10d # pass rounds - call _bsaes_key_convert - pxor (%rsp), %xmm7 # fix up round 0 key - movdqa %xmm6, (%rax) # save last round key - movdqa %xmm7, (%rsp) - - xor %eax, %eax # if ($len%16) len-=16; - and \$-16, $len - test \$15, %ebx - setnz %al - shl \$4, %rax - sub %rax, $len - - sub \$0x80, %rsp # place for tweak[8] - movdqa 0x20(%rbp), @XMM[7] # initial tweak - - pxor $twtmp, $twtmp - movdqa .Lxts_magic(%rip), $twmask - pcmpgtd @XMM[7], $twtmp # broadcast upper bits - - sub \$0x80, $len - jc .Lxts_dec_short - jmp .Lxts_dec_loop - -.align 16 -.Lxts_dec_loop: -___ - for ($i=0;$i<7;$i++) { - $code.=<<___; - pshufd \$0x13, $twtmp, $twres - pxor $twtmp, $twtmp - movdqa @XMM[7], @XMM[$i] - movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] - paddq @XMM[7], @XMM[7] # psllq 1,$tweak - pand $twmask, $twres # isolate carry and residue - pcmpgtd @XMM[7], $twtmp # broadcast upper bits - pxor $twres, @XMM[7] -___ - $code.=<<___ if ($i>=1); - movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] -___ - $code.=<<___ if ($i>=2); - pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] -___ - } -$code.=<<___; - movdqu 0x60($inp), @XMM[8+6] - pxor @XMM[8+5], @XMM[5] - movdqu 0x70($inp), @XMM[8+7] - lea 0x80($inp), $inp - movdqa @XMM[7], 0x70(%rsp) - pxor @XMM[8+6], @XMM[6] - lea 0x80(%rsp), %rax # pass key schedule - pxor @XMM[8+7], @XMM[7] - mov %edx, %r10d # pass rounds - - call _bsaes_decrypt8 - - pxor 0x00(%rsp), @XMM[0] # ^= tweak[] - pxor 0x10(%rsp), @XMM[1] - movdqu @XMM[0], 0x00($out) # write output - pxor 0x20(%rsp), @XMM[6] - movdqu @XMM[1], 0x10($out) - pxor 0x30(%rsp), @XMM[4] - movdqu @XMM[6], 0x20($out) - pxor 0x40(%rsp), @XMM[2] - movdqu @XMM[4], 0x30($out) - pxor 0x50(%rsp), @XMM[7] - movdqu @XMM[2], 0x40($out) - pxor 0x60(%rsp), @XMM[3] - movdqu @XMM[7], 0x50($out) - pxor 0x70(%rsp), @XMM[5] - movdqu @XMM[3], 0x60($out) - movdqu @XMM[5], 0x70($out) - lea 0x80($out), $out - - movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak - pxor $twtmp, $twtmp - movdqa .Lxts_magic(%rip), $twmask - pcmpgtd @XMM[7], $twtmp - pshufd \$0x13, $twtmp, $twres - pxor $twtmp, $twtmp - paddq @XMM[7], @XMM[7] # psllq 1,$tweak - pand $twmask, $twres # isolate carry and residue - pcmpgtd @XMM[7], $twtmp # broadcast upper bits - pxor $twres, @XMM[7] - - sub \$0x80,$len - jnc .Lxts_dec_loop - -.Lxts_dec_short: - add \$0x80, $len - jz .Lxts_dec_done -___ - for ($i=0;$i<7;$i++) { - $code.=<<___; - pshufd \$0x13, $twtmp, $twres - pxor $twtmp, $twtmp - movdqa @XMM[7], @XMM[$i] - movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] - paddq @XMM[7], @XMM[7] # psllq 1,$tweak - pand $twmask, $twres # isolate carry and residue - pcmpgtd @XMM[7], $twtmp # broadcast upper bits - pxor $twres, @XMM[7] -___ - $code.=<<___ if ($i>=1); - movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] - cmp \$`0x10*$i`,$len - je .Lxts_dec_$i -___ - $code.=<<___ if ($i>=2); - pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] -___ - } -$code.=<<___; - movdqu 0x60($inp), @XMM[8+6] - pxor @XMM[8+5], @XMM[5] - movdqa @XMM[7], 0x70(%rsp) - lea 0x70($inp), $inp - pxor @XMM[8+6], @XMM[6] - lea 0x80(%rsp), %rax # pass key schedule - mov %edx, %r10d # pass rounds - - call _bsaes_decrypt8 - - pxor 0x00(%rsp), @XMM[0] # ^= tweak[] - pxor 0x10(%rsp), @XMM[1] - movdqu @XMM[0], 0x00($out) # write output - pxor 0x20(%rsp), @XMM[6] - movdqu @XMM[1], 0x10($out) - pxor 0x30(%rsp), @XMM[4] - movdqu @XMM[6], 0x20($out) - pxor 0x40(%rsp), @XMM[2] - movdqu @XMM[4], 0x30($out) - pxor 0x50(%rsp), @XMM[7] - movdqu @XMM[2], 0x40($out) - pxor 0x60(%rsp), @XMM[3] - movdqu @XMM[7], 0x50($out) - movdqu @XMM[3], 0x60($out) - lea 0x70($out), $out - - movdqa 0x70(%rsp), @XMM[7] # next iteration tweak - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_6: - pxor @XMM[8+4], @XMM[4] - lea 0x60($inp), $inp - pxor @XMM[8+5], @XMM[5] - lea 0x80(%rsp), %rax # pass key schedule - mov %edx, %r10d # pass rounds - - call _bsaes_decrypt8 - - pxor 0x00(%rsp), @XMM[0] # ^= tweak[] - pxor 0x10(%rsp), @XMM[1] - movdqu @XMM[0], 0x00($out) # write output - pxor 0x20(%rsp), @XMM[6] - movdqu @XMM[1], 0x10($out) - pxor 0x30(%rsp), @XMM[4] - movdqu @XMM[6], 0x20($out) - pxor 0x40(%rsp), @XMM[2] - movdqu @XMM[4], 0x30($out) - pxor 0x50(%rsp), @XMM[7] - movdqu @XMM[2], 0x40($out) - movdqu @XMM[7], 0x50($out) - lea 0x60($out), $out - - movdqa 0x60(%rsp), @XMM[7] # next iteration tweak - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_5: - pxor @XMM[8+3], @XMM[3] - lea 0x50($inp), $inp - pxor @XMM[8+4], @XMM[4] - lea 0x80(%rsp), %rax # pass key schedule - mov %edx, %r10d # pass rounds - - call _bsaes_decrypt8 - - pxor 0x00(%rsp), @XMM[0] # ^= tweak[] - pxor 0x10(%rsp), @XMM[1] - movdqu @XMM[0], 0x00($out) # write output - pxor 0x20(%rsp), @XMM[6] - movdqu @XMM[1], 0x10($out) - pxor 0x30(%rsp), @XMM[4] - movdqu @XMM[6], 0x20($out) - pxor 0x40(%rsp), @XMM[2] - movdqu @XMM[4], 0x30($out) - movdqu @XMM[2], 0x40($out) - lea 0x50($out), $out - - movdqa 0x50(%rsp), @XMM[7] # next iteration tweak - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_4: - pxor @XMM[8+2], @XMM[2] - lea 0x40($inp), $inp - pxor @XMM[8+3], @XMM[3] - lea 0x80(%rsp), %rax # pass key schedule - mov %edx, %r10d # pass rounds - - call _bsaes_decrypt8 - - pxor 0x00(%rsp), @XMM[0] # ^= tweak[] - pxor 0x10(%rsp), @XMM[1] - movdqu @XMM[0], 0x00($out) # write output - pxor 0x20(%rsp), @XMM[6] - movdqu @XMM[1], 0x10($out) - pxor 0x30(%rsp), @XMM[4] - movdqu @XMM[6], 0x20($out) - movdqu @XMM[4], 0x30($out) - lea 0x40($out), $out - - movdqa 0x40(%rsp), @XMM[7] # next iteration tweak - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_3: - pxor @XMM[8+1], @XMM[1] - lea 0x30($inp), $inp - pxor @XMM[8+2], @XMM[2] - lea 0x80(%rsp), %rax # pass key schedule - mov %edx, %r10d # pass rounds - - call _bsaes_decrypt8 - - pxor 0x00(%rsp), @XMM[0] # ^= tweak[] - pxor 0x10(%rsp), @XMM[1] - movdqu @XMM[0], 0x00($out) # write output - pxor 0x20(%rsp), @XMM[6] - movdqu @XMM[1], 0x10($out) - movdqu @XMM[6], 0x20($out) - lea 0x30($out), $out - - movdqa 0x30(%rsp), @XMM[7] # next iteration tweak - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_2: - pxor @XMM[8+0], @XMM[0] - lea 0x20($inp), $inp - pxor @XMM[8+1], @XMM[1] - lea 0x80(%rsp), %rax # pass key schedule - mov %edx, %r10d # pass rounds - - call _bsaes_decrypt8 - - pxor 0x00(%rsp), @XMM[0] # ^= tweak[] - pxor 0x10(%rsp), @XMM[1] - movdqu @XMM[0], 0x00($out) # write output - movdqu @XMM[1], 0x10($out) - lea 0x20($out), $out - - movdqa 0x20(%rsp), @XMM[7] # next iteration tweak - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_1: - pxor @XMM[0], @XMM[8] - lea 0x10($inp), $inp - movdqa @XMM[8], 0x20(%rbp) - lea 0x20(%rbp), $arg1 - lea 0x20(%rbp), $arg2 - lea ($key), $arg3 - call aes_nohw_decrypt # doesn't touch %xmm - pxor 0x20(%rbp), @XMM[0] # ^= tweak[] - #pxor @XMM[8], @XMM[0] - #lea 0x80(%rsp), %rax # pass key schedule - #mov %edx, %r10d # pass rounds - #call _bsaes_decrypt8 - #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] - movdqu @XMM[0], 0x00($out) # write output - lea 0x10($out), $out - - movdqa 0x10(%rsp), @XMM[7] # next iteration tweak - -.Lxts_dec_done: - and \$15, %ebx - jz .Lxts_dec_ret - - pxor $twtmp, $twtmp - movdqa .Lxts_magic(%rip), $twmask - pcmpgtd @XMM[7], $twtmp - pshufd \$0x13, $twtmp, $twres - movdqa @XMM[7], @XMM[6] - paddq @XMM[7], @XMM[7] # psllq 1,$tweak - pand $twmask, $twres # isolate carry and residue - movdqu ($inp), @XMM[0] - pxor $twres, @XMM[7] - - lea 0x20(%rbp), $arg1 - pxor @XMM[7], @XMM[0] - lea 0x20(%rbp), $arg2 - movdqa @XMM[0], 0x20(%rbp) - lea ($key), $arg3 - call aes_nohw_decrypt # doesn't touch %xmm - pxor 0x20(%rbp), @XMM[7] - mov $out, %rdx - movdqu @XMM[7], ($out) - -.Lxts_dec_steal: - movzb 16($inp), %eax - movzb (%rdx), %ecx - lea 1($inp), $inp - mov %al, (%rdx) - mov %cl, 16(%rdx) - lea 1(%rdx), %rdx - sub \$1,%ebx - jnz .Lxts_dec_steal - - movdqu ($out), @XMM[0] - lea 0x20(%rbp), $arg1 - pxor @XMM[6], @XMM[0] - lea 0x20(%rbp), $arg2 - movdqa @XMM[0], 0x20(%rbp) - lea ($key), $arg3 - call aes_nohw_decrypt # doesn't touch %xmm - pxor 0x20(%rbp), @XMM[6] - movdqu @XMM[6], ($out) - -.Lxts_dec_ret: - lea (%rsp), %rax - pxor %xmm0, %xmm0 -.Lxts_dec_bzero: # wipe key schedule [if any] - movdqa %xmm0, 0x00(%rax) - movdqa %xmm0, 0x10(%rax) - lea 0x20(%rax), %rax - cmp %rax, %rbp - ja .Lxts_dec_bzero - - lea 0x78(%rbp),%rax -.cfi_def_cfa %rax,8 -___ -$code.=<<___ if ($win64); - movaps 0x40(%rbp), %xmm6 - movaps 0x50(%rbp), %xmm7 - movaps 0x60(%rbp), %xmm8 - movaps 0x70(%rbp), %xmm9 - movaps 0x80(%rbp), %xmm10 - movaps 0x90(%rbp), %xmm11 - movaps 0xa0(%rbp), %xmm12 - movaps 0xb0(%rbp), %xmm13 - movaps 0xc0(%rbp), %xmm14 - movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rax), %rax -.Lxts_dec_tail: -___ -$code.=<<___; - mov -48(%rax), %r15 -.cfi_restore %r15 - mov -40(%rax), %r14 -.cfi_restore %r14 - mov -32(%rax), %r13 -.cfi_restore %r13 - mov -24(%rax), %r12 -.cfi_restore %r12 - mov -16(%rax), %rbx -.cfi_restore %rbx - mov -8(%rax), %rbp -.cfi_restore %rbp - lea (%rax), %rsp # restore %rsp -.cfi_def_cfa_register %rsp -.Lxts_dec_epilogue: - ret -.cfi_endproc -.size bsaes_xts_decrypt,.-bsaes_xts_decrypt -___ -} -} # $xts -$code.=<<___; -.type _bsaes_const,\@object -.align 64 -_bsaes_const: -.LM0ISR: # InvShiftRows constants - .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 -.LISRM0: - .quad 0x01040b0e0205080f, 0x0306090c00070a0d -.LISR: - .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 -.LBS0: # bit-slice constants - .quad 0x5555555555555555, 0x5555555555555555 -.LBS1: - .quad 0x3333333333333333, 0x3333333333333333 -.LBS2: - .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f -.LSR: # shiftrows constants - .quad 0x0504070600030201, 0x0f0e0d0c0a09080b -.LSRM0: - .quad 0x0304090e00050a0f, 0x01060b0c0207080d -.LM0SR: - .quad 0x0a0e02060f03070b, 0x0004080c05090d01 -.LSWPUP: # byte-swap upper dword - .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 -.LSWPUPM0SR: - .quad 0x0a0d02060c03070b, 0x0004080f05090e01 -.LADD1: # counter increment constants - .quad 0x0000000000000000, 0x0000000100000000 -.LADD2: - .quad 0x0000000000000000, 0x0000000200000000 -.LADD3: - .quad 0x0000000000000000, 0x0000000300000000 -.LADD4: - .quad 0x0000000000000000, 0x0000000400000000 -.LADD5: - .quad 0x0000000000000000, 0x0000000500000000 -.LADD6: - .quad 0x0000000000000000, 0x0000000600000000 -.LADD7: - .quad 0x0000000000000000, 0x0000000700000000 -.LADD8: - .quad 0x0000000000000000, 0x0000000800000000 -.Lxts_magic: - .long 0x87,0,1,0 -.Lmasks: - .quad 0x0101010101010101, 0x0101010101010101 - .quad 0x0202020202020202, 0x0202020202020202 - .quad 0x0404040404040404, 0x0404040404040404 - .quad 0x0808080808080808, 0x0808080808080808 -.LM0: - .quad 0x02060a0e03070b0f, 0x0004080c0105090d -.L63: - .quad 0x6363636363636363, 0x6363636363636363 -.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov" -.align 64 -.size _bsaes_const,.-_bsaes_const -___ - -# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, -# CONTEXT *context,DISPATCHER_CONTEXT *disp) -if ($win64) { -$rec="%rcx"; -$frame="%rdx"; -$context="%r8"; -$disp="%r9"; - -$code.=<<___; -.extern __imp_RtlVirtualUnwind -.type se_handler,\@abi-omnipotent -.align 16 -se_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<=prologue label - jbe .Lin_prologue - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=epilogue label - jae .Lin_prologue - - mov 8(%r11),%r10d # HandlerData[2] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=tail label - jae .Lin_tail - - mov 160($context),%rax # pull context->Rbp - - lea 0x40(%rax),%rsi # %xmm save area - lea 512($context),%rdi # &context.Xmm6 - mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) - .long 0xa548f3fc # cld; rep movsq - lea 0xa0+0x78(%rax),%rax # adjust stack pointer - -.Lin_tail: - mov -48(%rax),%rbp - mov -40(%rax),%rbx - mov -32(%rax),%r12 - mov -24(%rax),%r13 - mov -16(%rax),%r14 - mov -8(%rax),%r15 - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore context->R12 - mov %r13,224($context) # restore context->R13 - mov %r14,232($context) # restore context->R14 - mov %r15,240($context) # restore context->R15 - -.Lin_prologue: - mov %rax,152($context) # restore context->Rsp - - mov 40($disp),%rdi # disp->ContextRecord - mov $context,%rsi # context - mov \$`1232/8`,%ecx # sizeof(CONTEXT) - .long 0xa548f3fc # cld; rep movsq - - mov $disp,%rsi - xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER - mov 8(%rsi),%rdx # arg2, disp->ImageBase - mov 0(%rsi),%r8 # arg3, disp->ControlPc - mov 16(%rsi),%r9 # arg4, disp->FunctionEntry - mov 40(%rsi),%r10 # disp->ContextRecord - lea 56(%rsi),%r11 # &disp->HandlerData - lea 24(%rsi),%r12 # &disp->EstablisherFrame - mov %r10,32(%rsp) # arg5 - mov %r11,40(%rsp) # arg6 - mov %r12,48(%rsp) # arg7 - mov %rcx,56(%rsp) # arg8, (NULL) - call *__imp_RtlVirtualUnwind(%rip) - - mov \$1,%eax # ExceptionContinueSearch - add \$64,%rsp - popfq - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - pop %rdi - pop %rsi - ret -.size se_handler,.-se_handler - -.section .pdata -.align 4 -___ -$code.=<<___ if ($ecb); - .rva .Lecb_enc_prologue - .rva .Lecb_enc_epilogue - .rva .Lecb_enc_info - - .rva .Lecb_dec_prologue - .rva .Lecb_dec_epilogue - .rva .Lecb_dec_info -___ -$code.=<<___; - .rva .Lcbc_dec_prologue - .rva .Lcbc_dec_epilogue - .rva .Lcbc_dec_info - - .rva .Lctr_enc_prologue - .rva .Lctr_enc_epilogue - .rva .Lctr_enc_info -___ -$code.=<<___ if ($xts); - .rva .Lxts_enc_prologue - .rva .Lxts_enc_epilogue - .rva .Lxts_enc_info - - .rva .Lxts_dec_prologue - .rva .Lxts_dec_epilogue - .rva .Lxts_dec_info -___ -$code.=<<___; - -.section .xdata -.align 8 -___ -$code.=<<___ if ($ecb); -.Lecb_enc_info: - .byte 9,0,0,0 - .rva se_handler - .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] - .rva .Lecb_enc_tail - .long 0 -.Lecb_dec_info: - .byte 9,0,0,0 - .rva se_handler - .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] - .rva .Lecb_dec_tail - .long 0 -___ -$code.=<<___; -.Lcbc_dec_info: - .byte 9,0,0,0 - .rva se_handler - .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] - .rva .Lcbc_dec_tail - .long 0 -.Lctr_enc_info: - .byte 9,0,0,0 - .rva se_handler - .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] - .rva .Lctr_enc_tail - .long 0 -___ -$code.=<<___ if ($xts); -.Lxts_enc_info: - .byte 9,0,0,0 - .rva se_handler - .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] - .rva .Lxts_enc_tail - .long 0 -.Lxts_dec_info: - .byte 9,0,0,0 - .rva se_handler - .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] - .rva .Lxts_dec_tail - .long 0 -___ -} - -$code =~ s/\`([^\`]*)\`/eval($1)/gem; - -print $code; - -close STDOUT; diff --git a/src/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl b/src/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl index 47d9972f..9429344b 100644 --- a/src/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl +++ b/src/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl @@ -176,6 +176,181 @@ _vpaes_encrypt_core: .size _vpaes_encrypt_core,.-_vpaes_encrypt_core ## +## _aes_encrypt_core_2x +## +## AES-encrypt %xmm0 and %xmm6 in parallel. +## +## Inputs: +## %xmm0 and %xmm6 = input +## %xmm12-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 and %xmm6 +## Clobbers %xmm1-%xmm5, %xmm7-%xmm11, %r9, %r10, %r11, %rax +## Preserves %xmm14 and %xmm15 +## +## This function stitches two parallel instances of _vpaes_encrypt_core. x86_64 +## provides 16 XMM registers. _vpaes_encrypt_core computes over six registers +## (%xmm0-%xmm5) and additionally uses seven registers with preloaded constants +## from _vpaes_preheat (%xmm9-%xmm15). This does not quite fit two instances, +## so we spill some of %xmm9 through %xmm15 back to memory. We keep %xmm9 and +## %xmm10 in registers as these values are used several times in a row. The +## remainder are read once per round and are spilled to memory. This leaves two +## registers preserved for the caller. +## +## Thus, of the two _vpaes_encrypt_core instances, the first uses (%xmm0-%xmm5) +## as before. The second uses %xmm6-%xmm8,%xmm11-%xmm13. (Add 6 to %xmm2 and +## below. Add 8 to %xmm3 and up.) Instructions in the second instance are +## indented by one space. +## +## +.type _vpaes_encrypt_core_2x,\@abi-omnipotent +.align 16 +_vpaes_encrypt_core_2x: +.cfi_startproc + mov %rdx, %r9 + mov \$16, %r11 + mov 240(%rdx),%eax + movdqa %xmm9, %xmm1 + movdqa %xmm9, %xmm7 + movdqa .Lk_ipt(%rip), %xmm2 # iptlo + movdqa %xmm2, %xmm8 + pandn %xmm0, %xmm1 + pandn %xmm6, %xmm7 + movdqu (%r9), %xmm5 # round0 key + # Also use %xmm5 in the second instance. + psrld \$4, %xmm1 + psrld \$4, %xmm7 + pand %xmm9, %xmm0 + pand %xmm9, %xmm6 + pshufb %xmm0, %xmm2 + pshufb %xmm6, %xmm8 + movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi + movdqa %xmm0, %xmm6 + pshufb %xmm1, %xmm0 + pshufb %xmm7, %xmm6 + pxor %xmm5, %xmm2 + pxor %xmm5, %xmm8 + add \$16, %r9 + pxor %xmm2, %xmm0 + pxor %xmm8, %xmm6 + lea .Lk_mc_backward(%rip),%r10 + jmp .Lenc2x_entry + +.align 16 +.Lenc2x_loop: + # middle of middle round + movdqa .Lk_sb1(%rip), %xmm4 # 4 : sb1u + movdqa .Lk_sb1+16(%rip),%xmm0 # 0 : sb1t + movdqa %xmm4, %xmm12 + movdqa %xmm0, %xmm6 + pshufb %xmm2, %xmm4 # 4 = sb1u + pshufb %xmm8, %xmm12 + pshufb %xmm3, %xmm0 # 0 = sb1t + pshufb %xmm11, %xmm6 + pxor %xmm5, %xmm4 # 4 = sb1u + k + pxor %xmm5, %xmm12 + movdqa .Lk_sb2(%rip), %xmm5 # 4 : sb2u + movdqa %xmm5, %xmm13 + pxor %xmm4, %xmm0 # 0 = A + pxor %xmm12, %xmm6 + movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + # Also use %xmm1 in the second instance. + pshufb %xmm2, %xmm5 # 4 = sb2u + pshufb %xmm8, %xmm13 + movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + # Also use %xmm4 in the second instance. + movdqa .Lk_sb2+16(%rip), %xmm2 # 2 : sb2t + movdqa %xmm2, %xmm8 + pshufb %xmm3, %xmm2 # 2 = sb2t + pshufb %xmm11, %xmm8 + movdqa %xmm0, %xmm3 # 3 = A + movdqa %xmm6, %xmm11 + pxor %xmm5, %xmm2 # 2 = 2A + pxor %xmm13, %xmm8 + pshufb %xmm1, %xmm0 # 0 = B + pshufb %xmm1, %xmm6 + add \$16, %r9 # next key + pxor %xmm2, %xmm0 # 0 = 2A+B + pxor %xmm8, %xmm6 + pshufb %xmm4, %xmm3 # 3 = D + pshufb %xmm4, %xmm11 + add \$16, %r11 # next mc + pxor %xmm0, %xmm3 # 3 = 2A+B+D + pxor %xmm6, %xmm11 + pshufb %xmm1, %xmm0 # 0 = 2B+C + pshufb %xmm1, %xmm6 + and \$0x30, %r11 # ... mod 4 + sub \$1,%rax # nr-- + pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D + pxor %xmm11, %xmm6 + +.Lenc2x_entry: + # top of round + movdqa %xmm9, %xmm1 # 1 : i + movdqa %xmm9, %xmm7 + movdqa .Lk_inv+16(%rip), %xmm5 # 2 : a/k + movdqa %xmm5, %xmm13 + pandn %xmm0, %xmm1 # 1 = i<<4 + pandn %xmm6, %xmm7 + psrld \$4, %xmm1 # 1 = i + psrld \$4, %xmm7 + pand %xmm9, %xmm0 # 0 = k + pand %xmm9, %xmm6 + pshufb %xmm0, %xmm5 # 2 = a/k + pshufb %xmm6, %xmm13 + movdqa %xmm10, %xmm3 # 3 : 1/i + movdqa %xmm10, %xmm11 + pxor %xmm1, %xmm0 # 0 = j + pxor %xmm7, %xmm6 + pshufb %xmm1, %xmm3 # 3 = 1/i + pshufb %xmm7, %xmm11 + movdqa %xmm10, %xmm4 # 4 : 1/j + movdqa %xmm10, %xmm12 + pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k + pxor %xmm13, %xmm11 + pshufb %xmm0, %xmm4 # 4 = 1/j + pshufb %xmm6, %xmm12 + movdqa %xmm10, %xmm2 # 2 : 1/iak + movdqa %xmm10, %xmm8 + pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k + pxor %xmm13, %xmm12 + pshufb %xmm3, %xmm2 # 2 = 1/iak + pshufb %xmm11, %xmm8 + movdqa %xmm10, %xmm3 # 3 : 1/jak + movdqa %xmm10, %xmm11 + pxor %xmm0, %xmm2 # 2 = io + pxor %xmm6, %xmm8 + pshufb %xmm4, %xmm3 # 3 = 1/jak + pshufb %xmm12, %xmm11 + movdqu (%r9), %xmm5 + # Also use %xmm5 in the second instance. + pxor %xmm1, %xmm3 # 3 = jo + pxor %xmm7, %xmm11 + jnz .Lenc2x_loop + + # middle of last round + movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + movdqa %xmm4, %xmm12 + movdqa %xmm0, %xmm6 + pshufb %xmm2, %xmm4 # 4 = sbou + pshufb %xmm8, %xmm12 + pxor %xmm5, %xmm4 # 4 = sb1u + k + pxor %xmm5, %xmm12 + pshufb %xmm3, %xmm0 # 0 = sb1t + pshufb %xmm11, %xmm6 + movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + # Also use %xmm1 in the second instance. + pxor %xmm4, %xmm0 # 0 = A + pxor %xmm12, %xmm6 + pshufb %xmm1, %xmm0 + pshufb %xmm1, %xmm6 + ret +.cfi_endproc +.size _vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x + +## ## Decryption core ## ## Same API as encryption core. @@ -984,6 +1159,111 @@ $code.=<<___; .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt ___ } +{ +my ($inp,$out,$blocks,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx","%r8"); +# void vpaes_ctr32_encrypt_blocks(const uint8_t *inp, uint8_t *out, +# size_t blocks, const AES_KEY *key, +# const uint8_t ivp[16]); +$code.=<<___; +.globl ${PREFIX}_ctr32_encrypt_blocks +.type ${PREFIX}_ctr32_encrypt_blocks,\@function,5 +.align 16 +${PREFIX}_ctr32_encrypt_blocks: +.cfi_startproc + # _vpaes_encrypt_core and _vpaes_encrypt_core_2x expect the key in %rdx. + xchg $key, $blocks +___ +($blocks,$key)=($key,$blocks); +$code.=<<___; + test $blocks, $blocks + jz .Lctr32_abort +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Lctr32_body: +___ +$code.=<<___; + movdqu ($ivp), %xmm0 # Load IV. + movdqa .Lctr_add_one(%rip), %xmm8 + sub $inp, $out # This allows only incrementing $inp. + call _vpaes_preheat + movdqa %xmm0, %xmm6 + pshufb .Lrev_ctr(%rip), %xmm6 + + test \$1, $blocks + jz .Lctr32_prep_loop + + # Handle one block so the remaining block count is even for + # _vpaes_encrypt_core_2x. + movdqu ($inp), %xmm7 # Load input. + call _vpaes_encrypt_core + pxor %xmm7, %xmm0 + paddd %xmm8, %xmm6 + movdqu %xmm0, ($out,$inp) + sub \$1, $blocks + lea 16($inp), $inp + jz .Lctr32_done + +.Lctr32_prep_loop: + # _vpaes_encrypt_core_2x leaves only %xmm14 and %xmm15 as spare + # registers. We maintain two byte-swapped counters in them. + movdqa %xmm6, %xmm14 + movdqa %xmm6, %xmm15 + paddd %xmm8, %xmm15 + +.Lctr32_loop: + movdqa .Lrev_ctr(%rip), %xmm1 # Set up counters. + movdqa %xmm14, %xmm0 + movdqa %xmm15, %xmm6 + pshufb %xmm1, %xmm0 + pshufb %xmm1, %xmm6 + call _vpaes_encrypt_core_2x + movdqu ($inp), %xmm1 # Load input. + movdqu 16($inp), %xmm2 + movdqa .Lctr_add_two(%rip), %xmm3 + pxor %xmm1, %xmm0 # XOR input. + pxor %xmm2, %xmm6 + paddd %xmm3, %xmm14 # Increment counters. + paddd %xmm3, %xmm15 + movdqu %xmm0, ($out,$inp) # Write output. + movdqu %xmm6, 16($out,$inp) + sub \$2, $blocks # Advance loop. + lea 32($inp), $inp + jnz .Lctr32_loop + +.Lctr32_done: +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Lctr32_epilogue: +___ +$code.=<<___; +.Lctr32_abort: + ret +.cfi_endproc +.size ${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks +___ +} $code.=<<___; ## ## _aes_preheat @@ -1107,6 +1387,17 @@ _vpaes_consts: .Lk_dsbo: # decryption sbox final output .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C + +# .Lrev_ctr is a permutation which byte-swaps the counter portion of the IV. +.Lrev_ctr: + .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 +# .Lctr_add_* may be added to a byte-swapped xmm register to increment the +# counter. The register must be byte-swapped again to form the actual input. +.Lctr_add_one: + .quad 0x0000000000000000, 0x0000000100000000 +.Lctr_add_two: + .quad 0x0000000000000000, 0x0000000200000000 + .asciz "Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)" .align 64 .size _vpaes_consts,.-_vpaes_consts @@ -1222,6 +1513,10 @@ se_handler: .rva .LSEH_end_${PREFIX}_cbc_encrypt .rva .LSEH_info_${PREFIX}_cbc_encrypt + .rva .LSEH_begin_${PREFIX}_ctr32_encrypt_blocks + .rva .LSEH_end_${PREFIX}_ctr32_encrypt_blocks + .rva .LSEH_info_${PREFIX}_ctr32_encrypt_blocks + .section .xdata .align 8 .LSEH_info_${PREFIX}_set_encrypt_key: @@ -1244,6 +1539,10 @@ se_handler: .byte 9,0,0,0 .rva se_handler .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[] +.LSEH_info_${PREFIX}_ctr32_encrypt_blocks: + .byte 9,0,0,0 + .rva se_handler + .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] ___ } diff --git a/src/crypto/fipsmodule/aes/internal.h b/src/crypto/fipsmodule/aes/internal.h index 63070bc6..0cebb04c 100644 --- a/src/crypto/fipsmodule/aes/internal.h +++ b/src/crypto/fipsmodule/aes/internal.h @@ -35,15 +35,13 @@ OPENSSL_INLINE int hwaes_capable(void) { } #define VPAES +#if defined(OPENSSL_X86_64) +#define VPAES_CTR32 +#endif OPENSSL_INLINE int vpaes_capable(void) { return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0; } -#if defined(OPENSSL_X86_64) -#define BSAES -OPENSSL_INLINE int bsaes_capable(void) { return vpaes_capable(); } -#endif // X86_64 - #elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) #define HWAES diff --git a/src/crypto/fipsmodule/bn/ctx.c b/src/crypto/fipsmodule/bn/ctx.c index af50de93..1926e80b 100644 --- a/src/crypto/fipsmodule/bn/ctx.c +++ b/src/crypto/fipsmodule/bn/ctx.c @@ -54,6 +54,7 @@ #include <openssl/bn.h> +#include <assert.h> #include <string.h> #include <openssl/err.h> @@ -62,63 +63,46 @@ #include "../../internal.h" -// How many bignums are in each "pool item"; -#define BN_CTX_POOL_SIZE 16 // The stack frame info is resizing, set a first-time expansion size; #define BN_CTX_START_FRAMES 32 -// A bundle of bignums that can be linked with other bundles -typedef struct bignum_pool_item { - // The bignum values - BIGNUM vals[BN_CTX_POOL_SIZE]; - // Linked-list admin - struct bignum_pool_item *prev, *next; -} BN_POOL_ITEM; - - -typedef struct bignum_pool { - // Linked-list admin - BN_POOL_ITEM *head, *current, *tail; - // Stack depth and allocation size - unsigned used, size; -} BN_POOL; - -static void BN_POOL_init(BN_POOL *); -static void BN_POOL_finish(BN_POOL *); -static BIGNUM *BN_POOL_get(BN_POOL *); -static void BN_POOL_release(BN_POOL *, unsigned int); - // BN_STACK -// A wrapper to manage the "stack frames" -typedef struct bignum_ctx_stack { - // Array of indexes into the bignum stack - unsigned int *indexes; +// A |BN_STACK| is a stack of |size_t| values. +typedef struct { + // Array of indexes into |ctx->bignums|. + size_t *indexes; // Number of stack frames, and the size of the allocated array - unsigned int depth, size; + size_t depth, size; } BN_STACK; -static void BN_STACK_init(BN_STACK *); -static void BN_STACK_finish(BN_STACK *); -static int BN_STACK_push(BN_STACK *, unsigned int); -static unsigned int BN_STACK_pop(BN_STACK *); +static void BN_STACK_init(BN_STACK *); +static void BN_STACK_cleanup(BN_STACK *); +static int BN_STACK_push(BN_STACK *, size_t idx); +static size_t BN_STACK_pop(BN_STACK *); // BN_CTX +DEFINE_STACK_OF(BIGNUM) + // The opaque BN_CTX type struct bignum_ctx { - // The bignum bundles - BN_POOL pool; - // The "stack frames", if you will + // bignums is the stack of |BIGNUM|s managed by this |BN_CTX|. + STACK_OF(BIGNUM) *bignums; + // stack is the stack of |BN_CTX_start| frames. It is the value of |used| at + // the time |BN_CTX_start| was called. BN_STACK stack; - // The number of bignums currently assigned - unsigned int used; - // Depth of stack overflow - int err_stack; - // Block "gets" until an "end" (compatibility behaviour) - int too_many; + // used is the number of |BIGNUM|s from |bignums| that have been used. + size_t used; + // error is one if any operation on this |BN_CTX| failed. All subsequent + // operations will fail. + char error; + // defer_error is one if an operation on this |BN_CTX| has failed, but no + // error has been pushed to the queue yet. This is used to defer errors from + // |BN_CTX_start| to |BN_CTX_get|. + char defer_error; }; BN_CTX *BN_CTX_new(void) { @@ -129,11 +113,11 @@ BN_CTX *BN_CTX_new(void) { } // Initialise the structure - BN_POOL_init(&ret->pool); + ret->bignums = NULL; BN_STACK_init(&ret->stack); ret->used = 0; - ret->err_stack = 0; - ret->too_many = 0; + ret->error = 0; + ret->defer_error = 0; return ret; } @@ -142,57 +126,69 @@ void BN_CTX_free(BN_CTX *ctx) { return; } - BN_STACK_finish(&ctx->stack); - BN_POOL_finish(&ctx->pool); + sk_BIGNUM_pop_free(ctx->bignums, BN_free); + BN_STACK_cleanup(&ctx->stack); OPENSSL_free(ctx); } void BN_CTX_start(BN_CTX *ctx) { - // If we're already overflowing ... - if (ctx->err_stack || ctx->too_many) { - ctx->err_stack++; - } else if (!BN_STACK_push(&ctx->stack, ctx->used)) { - // (Try to) get a new frame pointer - OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_TEMPORARY_VARIABLES); - ctx->err_stack++; + if (ctx->error) { + // Once an operation has failed, |ctx->stack| no longer matches the number + // of |BN_CTX_end| calls to come. Do nothing. + return; + } + + if (!BN_STACK_push(&ctx->stack, ctx->used)) { + ctx->error = 1; + // |BN_CTX_start| cannot fail, so defer the error to |BN_CTX_get|. + ctx->defer_error = 1; } } BIGNUM *BN_CTX_get(BN_CTX *ctx) { - BIGNUM *ret; - if (ctx->err_stack || ctx->too_many) { + // Once any operation has failed, they all do. + if (ctx->error) { + if (ctx->defer_error) { + OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_TEMPORARY_VARIABLES); + ctx->defer_error = 0; + } return NULL; } - ret = BN_POOL_get(&ctx->pool); - if (ret == NULL) { - // Setting too_many prevents repeated "get" attempts from - // cluttering the error stack. - ctx->too_many = 1; - OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_TEMPORARY_VARIABLES); - return NULL; + if (ctx->bignums == NULL) { + ctx->bignums = sk_BIGNUM_new_null(); + if (ctx->bignums == NULL) { + OPENSSL_PUT_ERROR(BN, ERR_R_MALLOC_FAILURE); + ctx->error = 1; + return NULL; + } } - // OK, make sure the returned bignum is "zero" + if (ctx->used == sk_BIGNUM_num(ctx->bignums)) { + BIGNUM *bn = BN_new(); + if (bn == NULL || !sk_BIGNUM_push(ctx->bignums, bn)) { + OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_TEMPORARY_VARIABLES); + BN_free(bn); + ctx->error = 1; + return NULL; + } + } + + BIGNUM *ret = sk_BIGNUM_value(ctx->bignums, ctx->used); BN_zero(ret); + // This is bounded by |sk_BIGNUM_num|, so it cannot overflow. ctx->used++; return ret; } void BN_CTX_end(BN_CTX *ctx) { - if (ctx->err_stack) { - ctx->err_stack--; - } else { - unsigned int fp = BN_STACK_pop(&ctx->stack); - // Does this stack frame have anything to release? - if (fp < ctx->used) { - BN_POOL_release(&ctx->pool, ctx->used - fp); - } - - ctx->used = fp; - // Unjam "too_many" in case "get" had failed - ctx->too_many = 0; + if (ctx->error) { + // Once an operation has failed, |ctx->stack| no longer matches the number + // of |BN_CTX_end| calls to come. Do nothing. + return; } + + ctx->used = BN_STACK_pop(&ctx->stack); } @@ -203,101 +199,34 @@ static void BN_STACK_init(BN_STACK *st) { st->depth = st->size = 0; } -static void BN_STACK_finish(BN_STACK *st) { +static void BN_STACK_cleanup(BN_STACK *st) { OPENSSL_free(st->indexes); } -static int BN_STACK_push(BN_STACK *st, unsigned int idx) { +static int BN_STACK_push(BN_STACK *st, size_t idx) { if (st->depth == st->size) { - // Need to expand - unsigned int newsize = - (st->size ? (st->size * 3 / 2) : BN_CTX_START_FRAMES); - unsigned int *newitems = OPENSSL_malloc(newsize * sizeof(unsigned int)); - if (!newitems) { + // This function intentionally does not push to the error queue on error. + // Error-reporting is deferred to |BN_CTX_get|. + size_t new_size = st->size != 0 ? st->size * 3 / 2 : BN_CTX_START_FRAMES; + if (new_size <= st->size || new_size > ((size_t)-1) / sizeof(size_t)) { return 0; } - if (st->depth) { - OPENSSL_memcpy(newitems, st->indexes, st->depth * sizeof(unsigned int)); + size_t *new_indexes = + OPENSSL_realloc(st->indexes, new_size * sizeof(size_t)); + if (new_indexes == NULL) { + return 0; } - OPENSSL_free(st->indexes); - st->indexes = newitems; - st->size = newsize; + st->indexes = new_indexes; + st->size = new_size; } - st->indexes[(st->depth)++] = idx; + st->indexes[st->depth] = idx; + st->depth++; return 1; } -static unsigned int BN_STACK_pop(BN_STACK *st) { - return st->indexes[--(st->depth)]; -} - - -static void BN_POOL_init(BN_POOL *p) { - p->head = p->current = p->tail = NULL; - p->used = p->size = 0; -} - -static void BN_POOL_finish(BN_POOL *p) { - while (p->head) { - for (size_t i = 0; i < BN_CTX_POOL_SIZE; i++) { - BN_clear_free(&p->head->vals[i]); - } - - p->current = p->head->next; - OPENSSL_free(p->head); - p->head = p->current; - } -} - -static BIGNUM *BN_POOL_get(BN_POOL *p) { - if (p->used == p->size) { - BN_POOL_ITEM *item = OPENSSL_malloc(sizeof(BN_POOL_ITEM)); - if (!item) { - return NULL; - } - - // Initialise the structure - for (size_t i = 0; i < BN_CTX_POOL_SIZE; i++) { - BN_init(&item->vals[i]); - } - - item->prev = p->tail; - item->next = NULL; - // Link it in - if (!p->head) { - p->head = p->current = p->tail = item; - } else { - p->tail->next = item; - p->tail = item; - p->current = item; - } - - p->size += BN_CTX_POOL_SIZE; - p->used++; - // Return the first bignum from the new pool - return item->vals; - } - - if (!p->used) { - p->current = p->head; - } else if ((p->used % BN_CTX_POOL_SIZE) == 0) { - p->current = p->current->next; - } - - return p->current->vals + ((p->used++) % BN_CTX_POOL_SIZE); -} - -static void BN_POOL_release(BN_POOL *p, unsigned int num) { - unsigned int offset = (p->used - 1) % BN_CTX_POOL_SIZE; - p->used -= num; - - while (num--) { - if (!offset) { - offset = BN_CTX_POOL_SIZE - 1; - p->current = p->current->prev; - } else { - offset--; - } - } +static size_t BN_STACK_pop(BN_STACK *st) { + assert(st->depth > 0); + st->depth--; + return st->indexes[st->depth]; } diff --git a/src/crypto/fipsmodule/bn/exponentiation.c b/src/crypto/fipsmodule/bn/exponentiation.c index 9e408113..8d4a5c8b 100644 --- a/src/crypto/fipsmodule/bn/exponentiation.c +++ b/src/crypto/fipsmodule/bn/exponentiation.c @@ -614,10 +614,9 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, BN_MONT_CTX *new_mont = NULL; BN_CTX_start(ctx); - BIGNUM *d = BN_CTX_get(ctx); BIGNUM *r = BN_CTX_get(ctx); val[0] = BN_CTX_get(ctx); - if (!d || !r || !val[0]) { + if (r == NULL || val[0] == NULL) { goto err; } @@ -639,7 +638,9 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, goto err; } if (window > 1) { - if (!BN_mod_mul_montgomery(d, val[0], val[0], mont, ctx)) { + BIGNUM *d = BN_CTX_get(ctx); + if (d == NULL || + !BN_mod_mul_montgomery(d, val[0], val[0], mont, ctx)) { goto err; } for (int i = 1; i < 1 << (window - 1); i++) { diff --git a/src/crypto/fipsmodule/cipher/e_aes.c b/src/crypto/fipsmodule/cipher/e_aes.c index a1859d74..dc94166c 100644 --- a/src/crypto/fipsmodule/cipher/e_aes.c +++ b/src/crypto/fipsmodule/cipher/e_aes.c @@ -230,7 +230,7 @@ static int aes_ofb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, ctr128_f aes_ctr_set_key(AES_KEY *aes_key, GCM128_KEY *gcm_key, block128_f *out_block, const uint8_t *key, - size_t key_bytes, int large_inputs) { + size_t key_bytes) { if (hwaes_capable()) { aes_hw_set_encrypt_key(key, key_bytes * 8, aes_key); if (gcm_key != NULL) { @@ -242,9 +242,7 @@ ctr128_f aes_ctr_set_key(AES_KEY *aes_key, GCM128_KEY *gcm_key, return aes_hw_ctr32_encrypt_blocks; } - const int bsaes_ok = bsaes_capable(); - const int vpaes_ok = vpaes_capable(); - if (bsaes_ok && (large_inputs || !vpaes_ok)) { + if (bsaes_capable()) { aes_nohw_set_encrypt_key(key, key_bytes * 8, aes_key); if (gcm_key != NULL) { CRYPTO_gcm128_init_key(gcm_key, aes_key, aes_nohw_encrypt, 0); @@ -255,7 +253,7 @@ ctr128_f aes_ctr_set_key(AES_KEY *aes_key, GCM128_KEY *gcm_key, return bsaes_ctr32_encrypt_blocks; } - if (vpaes_ok) { + if (vpaes_capable()) { vpaes_set_encrypt_key(key, key_bytes * 8, aes_key); if (out_block) { *out_block = vpaes_encrypt; @@ -317,7 +315,7 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, if (key) { OPENSSL_memset(&gctx->gcm, 0, sizeof(gctx->gcm)); gctx->ctr = aes_ctr_set_key(&gctx->ks.ks, &gctx->gcm.gcm_key, NULL, key, - ctx->key_len, 1 /* large inputs */); + ctx->key_len); // If we have an iv can set it directly, otherwise use saved IV. if (iv == NULL && gctx->iv_set) { iv = gctx->iv; @@ -860,8 +858,8 @@ static int aead_aes_gcm_init_impl(struct aead_aes_gcm_ctx *gcm_ctx, return 0; } - gcm_ctx->ctr = aes_ctr_set_key(&gcm_ctx->ks.ks, &gcm_ctx->gcm_key, NULL, key, - key_len, 1 /* large inputs */); + gcm_ctx->ctr = + aes_ctr_set_key(&gcm_ctx->ks.ks, &gcm_ctx->gcm_key, NULL, key, key_len); *out_tag_len = tag_len; return 1; } diff --git a/src/crypto/fipsmodule/cipher/internal.h b/src/crypto/fipsmodule/cipher/internal.h index b9e61ec8..68efe33d 100644 --- a/src/crypto/fipsmodule/cipher/internal.h +++ b/src/crypto/fipsmodule/cipher/internal.h @@ -117,11 +117,9 @@ struct evp_aead_st { // set to a function that encrypts single blocks. If not NULL, |*gcm_key| is // initialised to do GHASH with the given key. It returns a function for // optimised CTR-mode, or NULL if CTR-mode should be built using |*out_block|. -// |large_input| is a hint to select AES implementations. If it is one, the -// caller expects this key to be used with large inputs. ctr128_f aes_ctr_set_key(AES_KEY *aes_key, GCM128_KEY *gcm_key, block128_f *out_block, const uint8_t *key, - size_t key_bytes, int large_input); + size_t key_bytes); #if defined(__cplusplus) } // extern C diff --git a/src/crypto/fipsmodule/rand/ctrdrbg.c b/src/crypto/fipsmodule/rand/ctrdrbg.c index 418f56b6..b2fda1da 100644 --- a/src/crypto/fipsmodule/rand/ctrdrbg.c +++ b/src/crypto/fipsmodule/rand/ctrdrbg.c @@ -57,12 +57,7 @@ int CTR_DRBG_init(CTR_DRBG_STATE *drbg, seed_material[i] ^= kInitMask[i]; } - // |RAND_bytes| is rarely called with large enough inputs for bsaes to be - // faster than vpaes. bsaes also currently has side channel trade offs - // (https://crbug.com/boringssl/256), which we should especially avoid in the - // PRNG. (Note the size hint is a no-op on machines with AES instructions.) - drbg->ctr = aes_ctr_set_key(&drbg->ks, NULL, &drbg->block, seed_material, 32, - 0 /* small inputs */); + drbg->ctr = aes_ctr_set_key(&drbg->ks, NULL, &drbg->block, seed_material, 32); OPENSSL_memcpy(drbg->counter.bytes, seed_material + 32, 16); drbg->reseed_counter = 1; @@ -98,8 +93,7 @@ static int ctr_drbg_update(CTR_DRBG_STATE *drbg, const uint8_t *data, temp[i] ^= data[i]; } - drbg->ctr = aes_ctr_set_key(&drbg->ks, NULL, &drbg->block, temp, 32, - 0 /* small inputs */); + drbg->ctr = aes_ctr_set_key(&drbg->ks, NULL, &drbg->block, temp, 32); OPENSSL_memcpy(drbg->counter.bytes, temp + 32, 16); return 1; diff --git a/src/crypto/impl_dispatch_test.cc b/src/crypto/impl_dispatch_test.cc index f1192a7e..54ee704c 100644 --- a/src/crypto/impl_dispatch_test.cc +++ b/src/crypto/impl_dispatch_test.cc @@ -89,7 +89,6 @@ constexpr size_t kFlag_aesni_gcm_encrypt = 2; constexpr size_t kFlag_aes_hw_set_encrypt_key = 3; constexpr size_t kFlag_vpaes_encrypt = 4; constexpr size_t kFlag_vpaes_set_encrypt_key = 5; -constexpr size_t kFlag_bsaes_ctr32_encrypt_blocks = 6; TEST_F(ImplDispatchTest, AEAD_AES_GCM) { AssertFunctionsHit( @@ -98,9 +97,8 @@ TEST_F(ImplDispatchTest, AEAD_AES_GCM) { {kFlag_aes_hw_encrypt, aesni_}, {kFlag_aes_hw_set_encrypt_key, aesni_}, {kFlag_aesni_gcm_encrypt, is_x86_64_ && aesni_ && avx_movbe_}, - {kFlag_vpaes_encrypt, !is_x86_64_ && ssse3_ && !aesni_}, - {kFlag_vpaes_set_encrypt_key, !is_x86_64_ && ssse3_ && !aesni_}, - {kFlag_bsaes_ctr32_encrypt_blocks, is_x86_64_ && ssse3_ && !aesni_}, + {kFlag_vpaes_encrypt, ssse3_ && !aesni_}, + {kFlag_vpaes_set_encrypt_key, ssse3_ && !aesni_}, }, [] { const uint8_t kZeros[16] = {0}; @@ -123,7 +121,6 @@ TEST_F(ImplDispatchTest, AES_set_encrypt_key) { { {kFlag_aes_hw_set_encrypt_key, aesni_}, {kFlag_vpaes_set_encrypt_key, ssse3_ && !aesni_}, - // BSAES will not be used for the |AES_*| functions. }, [] { AES_KEY key; @@ -141,7 +138,6 @@ TEST_F(ImplDispatchTest, AES_single_block) { { {kFlag_aes_hw_encrypt, aesni_}, {kFlag_vpaes_encrypt, ssse3_ && !aesni_}, - // BSAES will not be used for the |AES_*| functions. }, [&key] { uint8_t in[AES_BLOCK_SIZE] = {0}; diff --git a/src/include/openssl/asn1.h b/src/include/openssl/asn1.h index 8b61eaa3..6ae831b8 100644 --- a/src/include/openssl/asn1.h +++ b/src/include/openssl/asn1.h @@ -666,7 +666,6 @@ OPENSSL_EXPORT int d2i_ASN1_BOOLEAN(int *a,const unsigned char **pp,long lengt DECLARE_ASN1_FUNCTIONS(ASN1_INTEGER) OPENSSL_EXPORT int i2c_ASN1_INTEGER(ASN1_INTEGER *a,unsigned char **pp); OPENSSL_EXPORT ASN1_INTEGER *c2i_ASN1_INTEGER(ASN1_INTEGER **a,const unsigned char **pp, long length); -OPENSSL_EXPORT ASN1_INTEGER *d2i_ASN1_UINTEGER(ASN1_INTEGER **a,const unsigned char **pp, long length); OPENSSL_EXPORT ASN1_INTEGER * ASN1_INTEGER_dup(const ASN1_INTEGER *x); OPENSSL_EXPORT int ASN1_INTEGER_cmp(const ASN1_INTEGER *x, const ASN1_INTEGER *y); diff --git a/src/tool/speed.cc b/src/tool/speed.cc index 14379cd4..a0fc905d 100644 --- a/src/tool/speed.cc +++ b/src/tool/speed.cc @@ -99,7 +99,7 @@ static uint64_t time_now() { #endif static uint64_t g_timeout_seconds = 1; -static std::vector<size_t> g_chunk_lengths = {16, 256, 1350, 8192}; +static std::vector<size_t> g_chunk_lengths = {16, 256, 1350, 8192, 16384}; static bool TimeFunction(TimeResults *results, std::function<bool()> func) { // total_us is the total amount of time that we'll aim to measure a function @@ -846,7 +846,7 @@ static const struct argument kArguments[] = { "-chunks", kOptionalArgument, "A comma-separated list of input sizes to run tests at (default is " - "16,256,1350,8192)", + "16,256,1350,8192,16384)", }, { "", diff --git a/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm b/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm deleted file mode 100644 index 5fa4053e..00000000 --- a/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm +++ /dev/null @@ -1,1777 +0,0 @@ -; This file is generated from a similarly-named Perl script in the BoringSSL -; source tree. Do not edit by hand. - -default rel -%define XMMWORD -%define YMMWORD -%define ZMMWORD - -%ifdef BORINGSSL_PREFIX -%include "boringssl_prefix_symbols_nasm.inc" -%endif -section .text code align=64 - - - -ALIGN 64 -_bsaes_encrypt8: - - lea r11,[$L$BS0] - - movdqa xmm8,XMMWORD[rax] - lea rax,[16+rax] - movdqa xmm7,XMMWORD[80+r11] - pxor xmm15,xmm8 - pxor xmm0,xmm8 - pxor xmm1,xmm8 - pxor xmm2,xmm8 -DB 102,68,15,56,0,255 -DB 102,15,56,0,199 - pxor xmm3,xmm8 - pxor xmm4,xmm8 -DB 102,15,56,0,207 -DB 102,15,56,0,215 - pxor xmm5,xmm8 - pxor xmm6,xmm8 -DB 102,15,56,0,223 -DB 102,15,56,0,231 -DB 102,15,56,0,239 -DB 102,15,56,0,247 -_bsaes_encrypt8_bitslice: - movdqa xmm7,XMMWORD[r11] - movdqa xmm8,XMMWORD[16+r11] - movdqa xmm9,xmm5 - psrlq xmm5,1 - movdqa xmm10,xmm3 - psrlq xmm3,1 - pxor xmm5,xmm6 - pxor xmm3,xmm4 - pand xmm5,xmm7 - pand xmm3,xmm7 - pxor xmm6,xmm5 - psllq xmm5,1 - pxor xmm4,xmm3 - psllq xmm3,1 - pxor xmm5,xmm9 - pxor xmm3,xmm10 - movdqa xmm9,xmm1 - psrlq xmm1,1 - movdqa xmm10,xmm15 - psrlq xmm15,1 - pxor xmm1,xmm2 - pxor xmm15,xmm0 - pand xmm1,xmm7 - pand xmm15,xmm7 - pxor xmm2,xmm1 - psllq xmm1,1 - pxor xmm0,xmm15 - psllq xmm15,1 - pxor xmm1,xmm9 - pxor xmm15,xmm10 - movdqa xmm7,XMMWORD[32+r11] - movdqa xmm9,xmm4 - psrlq xmm4,2 - movdqa xmm10,xmm3 - psrlq xmm3,2 - pxor xmm4,xmm6 - pxor xmm3,xmm5 - pand xmm4,xmm8 - pand xmm3,xmm8 - pxor xmm6,xmm4 - psllq xmm4,2 - pxor xmm5,xmm3 - psllq xmm3,2 - pxor xmm4,xmm9 - pxor xmm3,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,2 - movdqa xmm10,xmm15 - psrlq xmm15,2 - pxor xmm0,xmm2 - pxor xmm15,xmm1 - pand xmm0,xmm8 - pand xmm15,xmm8 - pxor xmm2,xmm0 - psllq xmm0,2 - pxor xmm1,xmm15 - psllq xmm15,2 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm9,xmm2 - psrlq xmm2,4 - movdqa xmm10,xmm1 - psrlq xmm1,4 - pxor xmm2,xmm6 - pxor xmm1,xmm5 - pand xmm2,xmm7 - pand xmm1,xmm7 - pxor xmm6,xmm2 - psllq xmm2,4 - pxor xmm5,xmm1 - psllq xmm1,4 - pxor xmm2,xmm9 - pxor xmm1,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,4 - movdqa xmm10,xmm15 - psrlq xmm15,4 - pxor xmm0,xmm4 - pxor xmm15,xmm3 - pand xmm0,xmm7 - pand xmm15,xmm7 - pxor xmm4,xmm0 - psllq xmm0,4 - pxor xmm3,xmm15 - psllq xmm15,4 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - dec r10d - jmp NEAR $L$enc_sbox -ALIGN 16 -$L$enc_loop: - pxor xmm15,XMMWORD[rax] - pxor xmm0,XMMWORD[16+rax] - pxor xmm1,XMMWORD[32+rax] - pxor xmm2,XMMWORD[48+rax] -DB 102,68,15,56,0,255 -DB 102,15,56,0,199 - pxor xmm3,XMMWORD[64+rax] - pxor xmm4,XMMWORD[80+rax] -DB 102,15,56,0,207 -DB 102,15,56,0,215 - pxor xmm5,XMMWORD[96+rax] - pxor xmm6,XMMWORD[112+rax] -DB 102,15,56,0,223 -DB 102,15,56,0,231 -DB 102,15,56,0,239 -DB 102,15,56,0,247 - lea rax,[128+rax] -$L$enc_sbox: - pxor xmm4,xmm5 - pxor xmm1,xmm0 - pxor xmm2,xmm15 - pxor xmm5,xmm1 - pxor xmm4,xmm15 - - pxor xmm5,xmm2 - pxor xmm2,xmm6 - pxor xmm6,xmm4 - pxor xmm2,xmm3 - pxor xmm3,xmm4 - pxor xmm2,xmm0 - - pxor xmm1,xmm6 - pxor xmm0,xmm4 - movdqa xmm10,xmm6 - movdqa xmm9,xmm0 - movdqa xmm8,xmm4 - movdqa xmm12,xmm1 - movdqa xmm11,xmm5 - - pxor xmm10,xmm3 - pxor xmm9,xmm1 - pxor xmm8,xmm2 - movdqa xmm13,xmm10 - pxor xmm12,xmm3 - movdqa xmm7,xmm9 - pxor xmm11,xmm15 - movdqa xmm14,xmm10 - - por xmm9,xmm8 - por xmm10,xmm11 - pxor xmm14,xmm7 - pand xmm13,xmm11 - pxor xmm11,xmm8 - pand xmm7,xmm8 - pand xmm14,xmm11 - movdqa xmm11,xmm2 - pxor xmm11,xmm15 - pand xmm12,xmm11 - pxor xmm10,xmm12 - pxor xmm9,xmm12 - movdqa xmm12,xmm6 - movdqa xmm11,xmm4 - pxor xmm12,xmm0 - pxor xmm11,xmm5 - movdqa xmm8,xmm12 - pand xmm12,xmm11 - por xmm8,xmm11 - pxor xmm7,xmm12 - pxor xmm10,xmm14 - pxor xmm9,xmm13 - pxor xmm8,xmm14 - movdqa xmm11,xmm1 - pxor xmm7,xmm13 - movdqa xmm12,xmm3 - pxor xmm8,xmm13 - movdqa xmm13,xmm0 - pand xmm11,xmm2 - movdqa xmm14,xmm6 - pand xmm12,xmm15 - pand xmm13,xmm4 - por xmm14,xmm5 - pxor xmm10,xmm11 - pxor xmm9,xmm12 - pxor xmm8,xmm13 - pxor xmm7,xmm14 - - - - - - movdqa xmm11,xmm10 - pand xmm10,xmm8 - pxor xmm11,xmm9 - - movdqa xmm13,xmm7 - movdqa xmm14,xmm11 - pxor xmm13,xmm10 - pand xmm14,xmm13 - - movdqa xmm12,xmm8 - pxor xmm14,xmm9 - pxor xmm12,xmm7 - - pxor xmm10,xmm9 - - pand xmm12,xmm10 - - movdqa xmm9,xmm13 - pxor xmm12,xmm7 - - pxor xmm9,xmm12 - pxor xmm8,xmm12 - - pand xmm9,xmm7 - - pxor xmm13,xmm9 - pxor xmm8,xmm9 - - pand xmm13,xmm14 - - pxor xmm13,xmm11 - movdqa xmm11,xmm5 - movdqa xmm7,xmm4 - movdqa xmm9,xmm14 - pxor xmm9,xmm13 - pand xmm9,xmm5 - pxor xmm5,xmm4 - pand xmm4,xmm14 - pand xmm5,xmm13 - pxor xmm5,xmm4 - pxor xmm4,xmm9 - pxor xmm11,xmm15 - pxor xmm7,xmm2 - pxor xmm14,xmm12 - pxor xmm13,xmm8 - movdqa xmm10,xmm14 - movdqa xmm9,xmm12 - pxor xmm10,xmm13 - pxor xmm9,xmm8 - pand xmm10,xmm11 - pand xmm9,xmm15 - pxor xmm11,xmm7 - pxor xmm15,xmm2 - pand xmm7,xmm14 - pand xmm2,xmm12 - pand xmm11,xmm13 - pand xmm15,xmm8 - pxor xmm7,xmm11 - pxor xmm15,xmm2 - pxor xmm11,xmm10 - pxor xmm2,xmm9 - pxor xmm5,xmm11 - pxor xmm15,xmm11 - pxor xmm4,xmm7 - pxor xmm2,xmm7 - - movdqa xmm11,xmm6 - movdqa xmm7,xmm0 - pxor xmm11,xmm3 - pxor xmm7,xmm1 - movdqa xmm10,xmm14 - movdqa xmm9,xmm12 - pxor xmm10,xmm13 - pxor xmm9,xmm8 - pand xmm10,xmm11 - pand xmm9,xmm3 - pxor xmm11,xmm7 - pxor xmm3,xmm1 - pand xmm7,xmm14 - pand xmm1,xmm12 - pand xmm11,xmm13 - pand xmm3,xmm8 - pxor xmm7,xmm11 - pxor xmm3,xmm1 - pxor xmm11,xmm10 - pxor xmm1,xmm9 - pxor xmm14,xmm12 - pxor xmm13,xmm8 - movdqa xmm10,xmm14 - pxor xmm10,xmm13 - pand xmm10,xmm6 - pxor xmm6,xmm0 - pand xmm0,xmm14 - pand xmm6,xmm13 - pxor xmm6,xmm0 - pxor xmm0,xmm10 - pxor xmm6,xmm11 - pxor xmm3,xmm11 - pxor xmm0,xmm7 - pxor xmm1,xmm7 - pxor xmm6,xmm15 - pxor xmm0,xmm5 - pxor xmm3,xmm6 - pxor xmm5,xmm15 - pxor xmm15,xmm0 - - pxor xmm0,xmm4 - pxor xmm4,xmm1 - pxor xmm1,xmm2 - pxor xmm2,xmm4 - pxor xmm3,xmm4 - - pxor xmm5,xmm2 - dec r10d - jl NEAR $L$enc_done - pshufd xmm7,xmm15,0x93 - pshufd xmm8,xmm0,0x93 - pxor xmm15,xmm7 - pshufd xmm9,xmm3,0x93 - pxor xmm0,xmm8 - pshufd xmm10,xmm5,0x93 - pxor xmm3,xmm9 - pshufd xmm11,xmm2,0x93 - pxor xmm5,xmm10 - pshufd xmm12,xmm6,0x93 - pxor xmm2,xmm11 - pshufd xmm13,xmm1,0x93 - pxor xmm6,xmm12 - pshufd xmm14,xmm4,0x93 - pxor xmm1,xmm13 - pxor xmm4,xmm14 - - pxor xmm8,xmm15 - pxor xmm7,xmm4 - pxor xmm8,xmm4 - pshufd xmm15,xmm15,0x4E - pxor xmm9,xmm0 - pshufd xmm0,xmm0,0x4E - pxor xmm12,xmm2 - pxor xmm15,xmm7 - pxor xmm13,xmm6 - pxor xmm0,xmm8 - pxor xmm11,xmm5 - pshufd xmm7,xmm2,0x4E - pxor xmm14,xmm1 - pshufd xmm8,xmm6,0x4E - pxor xmm10,xmm3 - pshufd xmm2,xmm5,0x4E - pxor xmm10,xmm4 - pshufd xmm6,xmm4,0x4E - pxor xmm11,xmm4 - pshufd xmm5,xmm1,0x4E - pxor xmm7,xmm11 - pshufd xmm1,xmm3,0x4E - pxor xmm8,xmm12 - pxor xmm2,xmm10 - pxor xmm6,xmm14 - pxor xmm5,xmm13 - movdqa xmm3,xmm7 - pxor xmm1,xmm9 - movdqa xmm4,xmm8 - movdqa xmm7,XMMWORD[48+r11] - jnz NEAR $L$enc_loop - movdqa xmm7,XMMWORD[64+r11] - jmp NEAR $L$enc_loop -ALIGN 16 -$L$enc_done: - movdqa xmm7,XMMWORD[r11] - movdqa xmm8,XMMWORD[16+r11] - movdqa xmm9,xmm1 - psrlq xmm1,1 - movdqa xmm10,xmm2 - psrlq xmm2,1 - pxor xmm1,xmm4 - pxor xmm2,xmm6 - pand xmm1,xmm7 - pand xmm2,xmm7 - pxor xmm4,xmm1 - psllq xmm1,1 - pxor xmm6,xmm2 - psllq xmm2,1 - pxor xmm1,xmm9 - pxor xmm2,xmm10 - movdqa xmm9,xmm3 - psrlq xmm3,1 - movdqa xmm10,xmm15 - psrlq xmm15,1 - pxor xmm3,xmm5 - pxor xmm15,xmm0 - pand xmm3,xmm7 - pand xmm15,xmm7 - pxor xmm5,xmm3 - psllq xmm3,1 - pxor xmm0,xmm15 - psllq xmm15,1 - pxor xmm3,xmm9 - pxor xmm15,xmm10 - movdqa xmm7,XMMWORD[32+r11] - movdqa xmm9,xmm6 - psrlq xmm6,2 - movdqa xmm10,xmm2 - psrlq xmm2,2 - pxor xmm6,xmm4 - pxor xmm2,xmm1 - pand xmm6,xmm8 - pand xmm2,xmm8 - pxor xmm4,xmm6 - psllq xmm6,2 - pxor xmm1,xmm2 - psllq xmm2,2 - pxor xmm6,xmm9 - pxor xmm2,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,2 - movdqa xmm10,xmm15 - psrlq xmm15,2 - pxor xmm0,xmm5 - pxor xmm15,xmm3 - pand xmm0,xmm8 - pand xmm15,xmm8 - pxor xmm5,xmm0 - psllq xmm0,2 - pxor xmm3,xmm15 - psllq xmm15,2 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm9,xmm5 - psrlq xmm5,4 - movdqa xmm10,xmm3 - psrlq xmm3,4 - pxor xmm5,xmm4 - pxor xmm3,xmm1 - pand xmm5,xmm7 - pand xmm3,xmm7 - pxor xmm4,xmm5 - psllq xmm5,4 - pxor xmm1,xmm3 - psllq xmm3,4 - pxor xmm5,xmm9 - pxor xmm3,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,4 - movdqa xmm10,xmm15 - psrlq xmm15,4 - pxor xmm0,xmm6 - pxor xmm15,xmm2 - pand xmm0,xmm7 - pand xmm15,xmm7 - pxor xmm6,xmm0 - psllq xmm0,4 - pxor xmm2,xmm15 - psllq xmm15,4 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm7,XMMWORD[rax] - pxor xmm3,xmm7 - pxor xmm5,xmm7 - pxor xmm2,xmm7 - pxor xmm6,xmm7 - pxor xmm1,xmm7 - pxor xmm4,xmm7 - pxor xmm15,xmm7 - pxor xmm0,xmm7 - DB 0F3h,0C3h ;repret - - - - -ALIGN 64 -_bsaes_decrypt8: - - lea r11,[$L$BS0] - - movdqa xmm8,XMMWORD[rax] - lea rax,[16+rax] - movdqa xmm7,XMMWORD[((-48))+r11] - pxor xmm15,xmm8 - pxor xmm0,xmm8 - pxor xmm1,xmm8 - pxor xmm2,xmm8 -DB 102,68,15,56,0,255 -DB 102,15,56,0,199 - pxor xmm3,xmm8 - pxor xmm4,xmm8 -DB 102,15,56,0,207 -DB 102,15,56,0,215 - pxor xmm5,xmm8 - pxor xmm6,xmm8 -DB 102,15,56,0,223 -DB 102,15,56,0,231 -DB 102,15,56,0,239 -DB 102,15,56,0,247 - movdqa xmm7,XMMWORD[r11] - movdqa xmm8,XMMWORD[16+r11] - movdqa xmm9,xmm5 - psrlq xmm5,1 - movdqa xmm10,xmm3 - psrlq xmm3,1 - pxor xmm5,xmm6 - pxor xmm3,xmm4 - pand xmm5,xmm7 - pand xmm3,xmm7 - pxor xmm6,xmm5 - psllq xmm5,1 - pxor xmm4,xmm3 - psllq xmm3,1 - pxor xmm5,xmm9 - pxor xmm3,xmm10 - movdqa xmm9,xmm1 - psrlq xmm1,1 - movdqa xmm10,xmm15 - psrlq xmm15,1 - pxor xmm1,xmm2 - pxor xmm15,xmm0 - pand xmm1,xmm7 - pand xmm15,xmm7 - pxor xmm2,xmm1 - psllq xmm1,1 - pxor xmm0,xmm15 - psllq xmm15,1 - pxor xmm1,xmm9 - pxor xmm15,xmm10 - movdqa xmm7,XMMWORD[32+r11] - movdqa xmm9,xmm4 - psrlq xmm4,2 - movdqa xmm10,xmm3 - psrlq xmm3,2 - pxor xmm4,xmm6 - pxor xmm3,xmm5 - pand xmm4,xmm8 - pand xmm3,xmm8 - pxor xmm6,xmm4 - psllq xmm4,2 - pxor xmm5,xmm3 - psllq xmm3,2 - pxor xmm4,xmm9 - pxor xmm3,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,2 - movdqa xmm10,xmm15 - psrlq xmm15,2 - pxor xmm0,xmm2 - pxor xmm15,xmm1 - pand xmm0,xmm8 - pand xmm15,xmm8 - pxor xmm2,xmm0 - psllq xmm0,2 - pxor xmm1,xmm15 - psllq xmm15,2 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm9,xmm2 - psrlq xmm2,4 - movdqa xmm10,xmm1 - psrlq xmm1,4 - pxor xmm2,xmm6 - pxor xmm1,xmm5 - pand xmm2,xmm7 - pand xmm1,xmm7 - pxor xmm6,xmm2 - psllq xmm2,4 - pxor xmm5,xmm1 - psllq xmm1,4 - pxor xmm2,xmm9 - pxor xmm1,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,4 - movdqa xmm10,xmm15 - psrlq xmm15,4 - pxor xmm0,xmm4 - pxor xmm15,xmm3 - pand xmm0,xmm7 - pand xmm15,xmm7 - pxor xmm4,xmm0 - psllq xmm0,4 - pxor xmm3,xmm15 - psllq xmm15,4 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - dec r10d - jmp NEAR $L$dec_sbox -ALIGN 16 -$L$dec_loop: - pxor xmm15,XMMWORD[rax] - pxor xmm0,XMMWORD[16+rax] - pxor xmm1,XMMWORD[32+rax] - pxor xmm2,XMMWORD[48+rax] -DB 102,68,15,56,0,255 -DB 102,15,56,0,199 - pxor xmm3,XMMWORD[64+rax] - pxor xmm4,XMMWORD[80+rax] -DB 102,15,56,0,207 -DB 102,15,56,0,215 - pxor xmm5,XMMWORD[96+rax] - pxor xmm6,XMMWORD[112+rax] -DB 102,15,56,0,223 -DB 102,15,56,0,231 -DB 102,15,56,0,239 -DB 102,15,56,0,247 - lea rax,[128+rax] -$L$dec_sbox: - pxor xmm2,xmm3 - - pxor xmm3,xmm6 - pxor xmm1,xmm6 - pxor xmm5,xmm3 - pxor xmm6,xmm5 - pxor xmm0,xmm6 - - pxor xmm15,xmm0 - pxor xmm1,xmm4 - pxor xmm2,xmm15 - pxor xmm4,xmm15 - pxor xmm0,xmm2 - movdqa xmm10,xmm2 - movdqa xmm9,xmm6 - movdqa xmm8,xmm0 - movdqa xmm12,xmm3 - movdqa xmm11,xmm4 - - pxor xmm10,xmm15 - pxor xmm9,xmm3 - pxor xmm8,xmm5 - movdqa xmm13,xmm10 - pxor xmm12,xmm15 - movdqa xmm7,xmm9 - pxor xmm11,xmm1 - movdqa xmm14,xmm10 - - por xmm9,xmm8 - por xmm10,xmm11 - pxor xmm14,xmm7 - pand xmm13,xmm11 - pxor xmm11,xmm8 - pand xmm7,xmm8 - pand xmm14,xmm11 - movdqa xmm11,xmm5 - pxor xmm11,xmm1 - pand xmm12,xmm11 - pxor xmm10,xmm12 - pxor xmm9,xmm12 - movdqa xmm12,xmm2 - movdqa xmm11,xmm0 - pxor xmm12,xmm6 - pxor xmm11,xmm4 - movdqa xmm8,xmm12 - pand xmm12,xmm11 - por xmm8,xmm11 - pxor xmm7,xmm12 - pxor xmm10,xmm14 - pxor xmm9,xmm13 - pxor xmm8,xmm14 - movdqa xmm11,xmm3 - pxor xmm7,xmm13 - movdqa xmm12,xmm15 - pxor xmm8,xmm13 - movdqa xmm13,xmm6 - pand xmm11,xmm5 - movdqa xmm14,xmm2 - pand xmm12,xmm1 - pand xmm13,xmm0 - por xmm14,xmm4 - pxor xmm10,xmm11 - pxor xmm9,xmm12 - pxor xmm8,xmm13 - pxor xmm7,xmm14 - - - - - - movdqa xmm11,xmm10 - pand xmm10,xmm8 - pxor xmm11,xmm9 - - movdqa xmm13,xmm7 - movdqa xmm14,xmm11 - pxor xmm13,xmm10 - pand xmm14,xmm13 - - movdqa xmm12,xmm8 - pxor xmm14,xmm9 - pxor xmm12,xmm7 - - pxor xmm10,xmm9 - - pand xmm12,xmm10 - - movdqa xmm9,xmm13 - pxor xmm12,xmm7 - - pxor xmm9,xmm12 - pxor xmm8,xmm12 - - pand xmm9,xmm7 - - pxor xmm13,xmm9 - pxor xmm8,xmm9 - - pand xmm13,xmm14 - - pxor xmm13,xmm11 - movdqa xmm11,xmm4 - movdqa xmm7,xmm0 - movdqa xmm9,xmm14 - pxor xmm9,xmm13 - pand xmm9,xmm4 - pxor xmm4,xmm0 - pand xmm0,xmm14 - pand xmm4,xmm13 - pxor xmm4,xmm0 - pxor xmm0,xmm9 - pxor xmm11,xmm1 - pxor xmm7,xmm5 - pxor xmm14,xmm12 - pxor xmm13,xmm8 - movdqa xmm10,xmm14 - movdqa xmm9,xmm12 - pxor xmm10,xmm13 - pxor xmm9,xmm8 - pand xmm10,xmm11 - pand xmm9,xmm1 - pxor xmm11,xmm7 - pxor xmm1,xmm5 - pand xmm7,xmm14 - pand xmm5,xmm12 - pand xmm11,xmm13 - pand xmm1,xmm8 - pxor xmm7,xmm11 - pxor xmm1,xmm5 - pxor xmm11,xmm10 - pxor xmm5,xmm9 - pxor xmm4,xmm11 - pxor xmm1,xmm11 - pxor xmm0,xmm7 - pxor xmm5,xmm7 - - movdqa xmm11,xmm2 - movdqa xmm7,xmm6 - pxor xmm11,xmm15 - pxor xmm7,xmm3 - movdqa xmm10,xmm14 - movdqa xmm9,xmm12 - pxor xmm10,xmm13 - pxor xmm9,xmm8 - pand xmm10,xmm11 - pand xmm9,xmm15 - pxor xmm11,xmm7 - pxor xmm15,xmm3 - pand xmm7,xmm14 - pand xmm3,xmm12 - pand xmm11,xmm13 - pand xmm15,xmm8 - pxor xmm7,xmm11 - pxor xmm15,xmm3 - pxor xmm11,xmm10 - pxor xmm3,xmm9 - pxor xmm14,xmm12 - pxor xmm13,xmm8 - movdqa xmm10,xmm14 - pxor xmm10,xmm13 - pand xmm10,xmm2 - pxor xmm2,xmm6 - pand xmm6,xmm14 - pand xmm2,xmm13 - pxor xmm2,xmm6 - pxor xmm6,xmm10 - pxor xmm2,xmm11 - pxor xmm15,xmm11 - pxor xmm6,xmm7 - pxor xmm3,xmm7 - pxor xmm0,xmm6 - pxor xmm5,xmm4 - - pxor xmm3,xmm0 - pxor xmm1,xmm6 - pxor xmm4,xmm6 - pxor xmm3,xmm1 - pxor xmm6,xmm15 - pxor xmm3,xmm4 - pxor xmm2,xmm5 - pxor xmm5,xmm0 - pxor xmm2,xmm3 - - pxor xmm3,xmm15 - pxor xmm6,xmm2 - dec r10d - jl NEAR $L$dec_done - - pshufd xmm7,xmm15,0x4E - pshufd xmm13,xmm2,0x4E - pxor xmm7,xmm15 - pshufd xmm14,xmm4,0x4E - pxor xmm13,xmm2 - pshufd xmm8,xmm0,0x4E - pxor xmm14,xmm4 - pshufd xmm9,xmm5,0x4E - pxor xmm8,xmm0 - pshufd xmm10,xmm3,0x4E - pxor xmm9,xmm5 - pxor xmm15,xmm13 - pxor xmm0,xmm13 - pshufd xmm11,xmm1,0x4E - pxor xmm10,xmm3 - pxor xmm5,xmm7 - pxor xmm3,xmm8 - pshufd xmm12,xmm6,0x4E - pxor xmm11,xmm1 - pxor xmm0,xmm14 - pxor xmm1,xmm9 - pxor xmm12,xmm6 - - pxor xmm5,xmm14 - pxor xmm3,xmm13 - pxor xmm1,xmm13 - pxor xmm6,xmm10 - pxor xmm2,xmm11 - pxor xmm1,xmm14 - pxor xmm6,xmm14 - pxor xmm4,xmm12 - pshufd xmm7,xmm15,0x93 - pshufd xmm8,xmm0,0x93 - pxor xmm15,xmm7 - pshufd xmm9,xmm5,0x93 - pxor xmm0,xmm8 - pshufd xmm10,xmm3,0x93 - pxor xmm5,xmm9 - pshufd xmm11,xmm1,0x93 - pxor xmm3,xmm10 - pshufd xmm12,xmm6,0x93 - pxor xmm1,xmm11 - pshufd xmm13,xmm2,0x93 - pxor xmm6,xmm12 - pshufd xmm14,xmm4,0x93 - pxor xmm2,xmm13 - pxor xmm4,xmm14 - - pxor xmm8,xmm15 - pxor xmm7,xmm4 - pxor xmm8,xmm4 - pshufd xmm15,xmm15,0x4E - pxor xmm9,xmm0 - pshufd xmm0,xmm0,0x4E - pxor xmm12,xmm1 - pxor xmm15,xmm7 - pxor xmm13,xmm6 - pxor xmm0,xmm8 - pxor xmm11,xmm3 - pshufd xmm7,xmm1,0x4E - pxor xmm14,xmm2 - pshufd xmm8,xmm6,0x4E - pxor xmm10,xmm5 - pshufd xmm1,xmm3,0x4E - pxor xmm10,xmm4 - pshufd xmm6,xmm4,0x4E - pxor xmm11,xmm4 - pshufd xmm3,xmm2,0x4E - pxor xmm7,xmm11 - pshufd xmm2,xmm5,0x4E - pxor xmm8,xmm12 - pxor xmm10,xmm1 - pxor xmm6,xmm14 - pxor xmm13,xmm3 - movdqa xmm3,xmm7 - pxor xmm2,xmm9 - movdqa xmm5,xmm13 - movdqa xmm4,xmm8 - movdqa xmm1,xmm2 - movdqa xmm2,xmm10 - movdqa xmm7,XMMWORD[((-16))+r11] - jnz NEAR $L$dec_loop - movdqa xmm7,XMMWORD[((-32))+r11] - jmp NEAR $L$dec_loop -ALIGN 16 -$L$dec_done: - movdqa xmm7,XMMWORD[r11] - movdqa xmm8,XMMWORD[16+r11] - movdqa xmm9,xmm2 - psrlq xmm2,1 - movdqa xmm10,xmm1 - psrlq xmm1,1 - pxor xmm2,xmm4 - pxor xmm1,xmm6 - pand xmm2,xmm7 - pand xmm1,xmm7 - pxor xmm4,xmm2 - psllq xmm2,1 - pxor xmm6,xmm1 - psllq xmm1,1 - pxor xmm2,xmm9 - pxor xmm1,xmm10 - movdqa xmm9,xmm5 - psrlq xmm5,1 - movdqa xmm10,xmm15 - psrlq xmm15,1 - pxor xmm5,xmm3 - pxor xmm15,xmm0 - pand xmm5,xmm7 - pand xmm15,xmm7 - pxor xmm3,xmm5 - psllq xmm5,1 - pxor xmm0,xmm15 - psllq xmm15,1 - pxor xmm5,xmm9 - pxor xmm15,xmm10 - movdqa xmm7,XMMWORD[32+r11] - movdqa xmm9,xmm6 - psrlq xmm6,2 - movdqa xmm10,xmm1 - psrlq xmm1,2 - pxor xmm6,xmm4 - pxor xmm1,xmm2 - pand xmm6,xmm8 - pand xmm1,xmm8 - pxor xmm4,xmm6 - psllq xmm6,2 - pxor xmm2,xmm1 - psllq xmm1,2 - pxor xmm6,xmm9 - pxor xmm1,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,2 - movdqa xmm10,xmm15 - psrlq xmm15,2 - pxor xmm0,xmm3 - pxor xmm15,xmm5 - pand xmm0,xmm8 - pand xmm15,xmm8 - pxor xmm3,xmm0 - psllq xmm0,2 - pxor xmm5,xmm15 - psllq xmm15,2 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm9,xmm3 - psrlq xmm3,4 - movdqa xmm10,xmm5 - psrlq xmm5,4 - pxor xmm3,xmm4 - pxor xmm5,xmm2 - pand xmm3,xmm7 - pand xmm5,xmm7 - pxor xmm4,xmm3 - psllq xmm3,4 - pxor xmm2,xmm5 - psllq xmm5,4 - pxor xmm3,xmm9 - pxor xmm5,xmm10 - movdqa xmm9,xmm0 - psrlq xmm0,4 - movdqa xmm10,xmm15 - psrlq xmm15,4 - pxor xmm0,xmm6 - pxor xmm15,xmm1 - pand xmm0,xmm7 - pand xmm15,xmm7 - pxor xmm6,xmm0 - psllq xmm0,4 - pxor xmm1,xmm15 - psllq xmm15,4 - pxor xmm0,xmm9 - pxor xmm15,xmm10 - movdqa xmm7,XMMWORD[rax] - pxor xmm5,xmm7 - pxor xmm3,xmm7 - pxor xmm1,xmm7 - pxor xmm6,xmm7 - pxor xmm2,xmm7 - pxor xmm4,xmm7 - pxor xmm15,xmm7 - pxor xmm0,xmm7 - DB 0F3h,0C3h ;repret - - - -ALIGN 16 -_bsaes_key_convert: - - lea r11,[$L$masks] - movdqu xmm7,XMMWORD[rcx] - lea rcx,[16+rcx] - movdqa xmm0,XMMWORD[r11] - movdqa xmm1,XMMWORD[16+r11] - movdqa xmm2,XMMWORD[32+r11] - movdqa xmm3,XMMWORD[48+r11] - movdqa xmm4,XMMWORD[64+r11] - pcmpeqd xmm5,xmm5 - - movdqu xmm6,XMMWORD[rcx] - movdqa XMMWORD[rax],xmm7 - lea rax,[16+rax] - dec r10d - jmp NEAR $L$key_loop -ALIGN 16 -$L$key_loop: -DB 102,15,56,0,244 - - movdqa xmm8,xmm0 - movdqa xmm9,xmm1 - - pand xmm8,xmm6 - pand xmm9,xmm6 - movdqa xmm10,xmm2 - pcmpeqb xmm8,xmm0 - psllq xmm0,4 - movdqa xmm11,xmm3 - pcmpeqb xmm9,xmm1 - psllq xmm1,4 - - pand xmm10,xmm6 - pand xmm11,xmm6 - movdqa xmm12,xmm0 - pcmpeqb xmm10,xmm2 - psllq xmm2,4 - movdqa xmm13,xmm1 - pcmpeqb xmm11,xmm3 - psllq xmm3,4 - - movdqa xmm14,xmm2 - movdqa xmm15,xmm3 - pxor xmm8,xmm5 - pxor xmm9,xmm5 - - pand xmm12,xmm6 - pand xmm13,xmm6 - movdqa XMMWORD[rax],xmm8 - pcmpeqb xmm12,xmm0 - psrlq xmm0,4 - movdqa XMMWORD[16+rax],xmm9 - pcmpeqb xmm13,xmm1 - psrlq xmm1,4 - lea rcx,[16+rcx] - - pand xmm14,xmm6 - pand xmm15,xmm6 - movdqa XMMWORD[32+rax],xmm10 - pcmpeqb xmm14,xmm2 - psrlq xmm2,4 - movdqa XMMWORD[48+rax],xmm11 - pcmpeqb xmm15,xmm3 - psrlq xmm3,4 - movdqu xmm6,XMMWORD[rcx] - - pxor xmm13,xmm5 - pxor xmm14,xmm5 - movdqa XMMWORD[64+rax],xmm12 - movdqa XMMWORD[80+rax],xmm13 - movdqa XMMWORD[96+rax],xmm14 - movdqa XMMWORD[112+rax],xmm15 - lea rax,[128+rax] - dec r10d - jnz NEAR $L$key_loop - - movdqa xmm7,XMMWORD[80+r11] - - DB 0F3h,0C3h ;repret - - -global bsaes_cbc_encrypt - -ALIGN 16 -bsaes_cbc_encrypt: - - - - - mov rax,rsp -$L$cbc_dec_prologue: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - lea rsp,[((-72))+rsp] - - mov r10,QWORD[160+rsp] - lea rsp,[((-160))+rsp] - movaps XMMWORD[64+rsp],xmm6 - movaps XMMWORD[80+rsp],xmm7 - movaps XMMWORD[96+rsp],xmm8 - movaps XMMWORD[112+rsp],xmm9 - movaps XMMWORD[128+rsp],xmm10 - movaps XMMWORD[144+rsp],xmm11 - movaps XMMWORD[160+rsp],xmm12 - movaps XMMWORD[176+rsp],xmm13 - movaps XMMWORD[192+rsp],xmm14 - movaps XMMWORD[208+rsp],xmm15 -$L$cbc_dec_body: - mov rbp,rsp - - mov eax,DWORD[240+r9] - mov r12,rcx - mov r13,rdx - mov r14,r8 - mov r15,r9 - mov rbx,r10 - shr r14,4 - - mov edx,eax - shl rax,7 - sub rax,96 - sub rsp,rax - - mov rax,rsp - mov rcx,r15 - mov r10d,edx - call _bsaes_key_convert - pxor xmm7,XMMWORD[rsp] - movdqa XMMWORD[rax],xmm6 - movdqa XMMWORD[rsp],xmm7 - - movdqu xmm14,XMMWORD[rbx] - sub r14,8 - jc NEAR $L$cbc_dec_loop_done - -$L$cbc_dec_loop: - movdqu xmm15,XMMWORD[r12] - movdqu xmm0,XMMWORD[16+r12] - movdqu xmm1,XMMWORD[32+r12] - movdqu xmm2,XMMWORD[48+r12] - movdqu xmm3,XMMWORD[64+r12] - movdqu xmm4,XMMWORD[80+r12] - mov rax,rsp - movdqu xmm5,XMMWORD[96+r12] - mov r10d,edx - movdqu xmm6,XMMWORD[112+r12] - movdqa XMMWORD[32+rbp],xmm14 - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm3,xmm9 - movdqu xmm11,XMMWORD[64+r12] - pxor xmm1,xmm10 - movdqu xmm12,XMMWORD[80+r12] - pxor xmm6,xmm11 - movdqu xmm13,XMMWORD[96+r12] - pxor xmm2,xmm12 - movdqu xmm14,XMMWORD[112+r12] - pxor xmm4,xmm13 - movdqu XMMWORD[r13],xmm15 - lea r12,[128+r12] - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - movdqu XMMWORD[64+r13],xmm1 - movdqu XMMWORD[80+r13],xmm6 - movdqu XMMWORD[96+r13],xmm2 - movdqu XMMWORD[112+r13],xmm4 - lea r13,[128+r13] - sub r14,8 - jnc NEAR $L$cbc_dec_loop - -$L$cbc_dec_loop_done: - add r14,8 - jz NEAR $L$cbc_dec_done - - movdqu xmm15,XMMWORD[r12] - mov rax,rsp - mov r10d,edx - cmp r14,2 - jb NEAR $L$cbc_dec_one - movdqu xmm0,XMMWORD[16+r12] - je NEAR $L$cbc_dec_two - movdqu xmm1,XMMWORD[32+r12] - cmp r14,4 - jb NEAR $L$cbc_dec_three - movdqu xmm2,XMMWORD[48+r12] - je NEAR $L$cbc_dec_four - movdqu xmm3,XMMWORD[64+r12] - cmp r14,6 - jb NEAR $L$cbc_dec_five - movdqu xmm4,XMMWORD[80+r12] - je NEAR $L$cbc_dec_six - movdqu xmm5,XMMWORD[96+r12] - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm3,xmm9 - movdqu xmm11,XMMWORD[64+r12] - pxor xmm1,xmm10 - movdqu xmm12,XMMWORD[80+r12] - pxor xmm6,xmm11 - movdqu xmm14,XMMWORD[96+r12] - pxor xmm2,xmm12 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - movdqu XMMWORD[64+r13],xmm1 - movdqu XMMWORD[80+r13],xmm6 - movdqu XMMWORD[96+r13],xmm2 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_six: - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm3,xmm9 - movdqu xmm11,XMMWORD[64+r12] - pxor xmm1,xmm10 - movdqu xmm14,XMMWORD[80+r12] - pxor xmm6,xmm11 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - movdqu XMMWORD[64+r13],xmm1 - movdqu XMMWORD[80+r13],xmm6 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_five: - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm3,xmm9 - movdqu xmm14,XMMWORD[64+r12] - pxor xmm1,xmm10 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - movdqu XMMWORD[64+r13],xmm1 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_four: - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu xmm14,XMMWORD[48+r12] - pxor xmm3,xmm9 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_three: - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu xmm14,XMMWORD[32+r12] - pxor xmm5,xmm8 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_two: - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm7,XMMWORD[r12] - movdqu xmm14,XMMWORD[16+r12] - pxor xmm0,xmm7 - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - jmp NEAR $L$cbc_dec_done -ALIGN 16 -$L$cbc_dec_one: - movdqa XMMWORD[32+rbp],xmm14 - call _bsaes_decrypt8 - pxor xmm15,XMMWORD[32+rbp] - movdqu xmm14,XMMWORD[r12] - movdqu XMMWORD[r13],xmm15 - jmp NEAR $L$cbc_dec_done - -$L$cbc_dec_done: - movdqu XMMWORD[rbx],xmm14 - lea rax,[rsp] - pxor xmm0,xmm0 -$L$cbc_dec_bzero: - movdqa XMMWORD[rax],xmm0 - movdqa XMMWORD[16+rax],xmm0 - lea rax,[32+rax] - cmp rbp,rax - ja NEAR $L$cbc_dec_bzero - - lea rax,[120+rbp] - - movaps xmm6,XMMWORD[64+rbp] - movaps xmm7,XMMWORD[80+rbp] - movaps xmm8,XMMWORD[96+rbp] - movaps xmm9,XMMWORD[112+rbp] - movaps xmm10,XMMWORD[128+rbp] - movaps xmm11,XMMWORD[144+rbp] - movaps xmm12,XMMWORD[160+rbp] - movaps xmm13,XMMWORD[176+rbp] - movaps xmm14,XMMWORD[192+rbp] - movaps xmm15,XMMWORD[208+rbp] - lea rax,[160+rax] -$L$cbc_dec_tail: - mov r15,QWORD[((-48))+rax] - - mov r14,QWORD[((-40))+rax] - - mov r13,QWORD[((-32))+rax] - - mov r12,QWORD[((-24))+rax] - - mov rbx,QWORD[((-16))+rax] - - mov rbp,QWORD[((-8))+rax] - - lea rsp,[rax] - -$L$cbc_dec_epilogue: - DB 0F3h,0C3h ;repret - - - -global bsaes_ctr32_encrypt_blocks - -ALIGN 16 -bsaes_ctr32_encrypt_blocks: - -%ifndef NDEBUG -%ifndef BORINGSSL_FIPS -EXTERN BORINGSSL_function_hit - mov BYTE[((BORINGSSL_function_hit+6))],1 -%endif -%endif - mov rax,rsp -$L$ctr_enc_prologue: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - lea rsp,[((-72))+rsp] - - mov r10,QWORD[160+rsp] - lea rsp,[((-160))+rsp] - movaps XMMWORD[64+rsp],xmm6 - movaps XMMWORD[80+rsp],xmm7 - movaps XMMWORD[96+rsp],xmm8 - movaps XMMWORD[112+rsp],xmm9 - movaps XMMWORD[128+rsp],xmm10 - movaps XMMWORD[144+rsp],xmm11 - movaps XMMWORD[160+rsp],xmm12 - movaps XMMWORD[176+rsp],xmm13 - movaps XMMWORD[192+rsp],xmm14 - movaps XMMWORD[208+rsp],xmm15 -$L$ctr_enc_body: - mov rbp,rsp - - movdqu xmm0,XMMWORD[r10] - mov eax,DWORD[240+r9] - mov r12,rcx - mov r13,rdx - mov r14,r8 - mov r15,r9 - movdqa XMMWORD[32+rbp],xmm0 - - - - mov ebx,eax - shl rax,7 - sub rax,96 - sub rsp,rax - - mov rax,rsp - mov rcx,r15 - mov r10d,ebx - call _bsaes_key_convert - pxor xmm7,xmm6 - movdqa XMMWORD[rax],xmm7 - - movdqa xmm8,XMMWORD[rsp] - lea r11,[$L$ADD1] - movdqa xmm15,XMMWORD[32+rbp] - movdqa xmm7,XMMWORD[((-32))+r11] -DB 102,68,15,56,0,199 -DB 102,68,15,56,0,255 - movdqa XMMWORD[rsp],xmm8 - jmp NEAR $L$ctr_enc_loop -ALIGN 16 -$L$ctr_enc_loop: - movdqa XMMWORD[32+rbp],xmm15 - movdqa xmm0,xmm15 - movdqa xmm1,xmm15 - paddd xmm0,XMMWORD[r11] - movdqa xmm2,xmm15 - paddd xmm1,XMMWORD[16+r11] - movdqa xmm3,xmm15 - paddd xmm2,XMMWORD[32+r11] - movdqa xmm4,xmm15 - paddd xmm3,XMMWORD[48+r11] - movdqa xmm5,xmm15 - paddd xmm4,XMMWORD[64+r11] - movdqa xmm6,xmm15 - paddd xmm5,XMMWORD[80+r11] - paddd xmm6,XMMWORD[96+r11] - - - - movdqa xmm8,XMMWORD[rsp] - lea rax,[16+rsp] - movdqa xmm7,XMMWORD[((-16))+r11] - pxor xmm15,xmm8 - pxor xmm0,xmm8 - pxor xmm1,xmm8 - pxor xmm2,xmm8 -DB 102,68,15,56,0,255 -DB 102,15,56,0,199 - pxor xmm3,xmm8 - pxor xmm4,xmm8 -DB 102,15,56,0,207 -DB 102,15,56,0,215 - pxor xmm5,xmm8 - pxor xmm6,xmm8 -DB 102,15,56,0,223 -DB 102,15,56,0,231 -DB 102,15,56,0,239 -DB 102,15,56,0,247 - lea r11,[$L$BS0] - mov r10d,ebx - - call _bsaes_encrypt8_bitslice - - sub r14,8 - jc NEAR $L$ctr_enc_loop_done - - movdqu xmm7,XMMWORD[r12] - movdqu xmm8,XMMWORD[16+r12] - movdqu xmm9,XMMWORD[32+r12] - movdqu xmm10,XMMWORD[48+r12] - movdqu xmm11,XMMWORD[64+r12] - movdqu xmm12,XMMWORD[80+r12] - movdqu xmm13,XMMWORD[96+r12] - movdqu xmm14,XMMWORD[112+r12] - lea r12,[128+r12] - pxor xmm7,xmm15 - movdqa xmm15,XMMWORD[32+rbp] - pxor xmm0,xmm8 - movdqu XMMWORD[r13],xmm7 - pxor xmm3,xmm9 - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,xmm10 - movdqu XMMWORD[32+r13],xmm3 - pxor xmm2,xmm11 - movdqu XMMWORD[48+r13],xmm5 - pxor xmm6,xmm12 - movdqu XMMWORD[64+r13],xmm2 - pxor xmm1,xmm13 - movdqu XMMWORD[80+r13],xmm6 - pxor xmm4,xmm14 - movdqu XMMWORD[96+r13],xmm1 - lea r11,[$L$ADD1] - movdqu XMMWORD[112+r13],xmm4 - lea r13,[128+r13] - paddd xmm15,XMMWORD[112+r11] - jnz NEAR $L$ctr_enc_loop - - jmp NEAR $L$ctr_enc_done -ALIGN 16 -$L$ctr_enc_loop_done: - add r14,8 - movdqu xmm7,XMMWORD[r12] - pxor xmm15,xmm7 - movdqu XMMWORD[r13],xmm15 - cmp r14,2 - jb NEAR $L$ctr_enc_done - movdqu xmm8,XMMWORD[16+r12] - pxor xmm0,xmm8 - movdqu XMMWORD[16+r13],xmm0 - je NEAR $L$ctr_enc_done - movdqu xmm9,XMMWORD[32+r12] - pxor xmm3,xmm9 - movdqu XMMWORD[32+r13],xmm3 - cmp r14,4 - jb NEAR $L$ctr_enc_done - movdqu xmm10,XMMWORD[48+r12] - pxor xmm5,xmm10 - movdqu XMMWORD[48+r13],xmm5 - je NEAR $L$ctr_enc_done - movdqu xmm11,XMMWORD[64+r12] - pxor xmm2,xmm11 - movdqu XMMWORD[64+r13],xmm2 - cmp r14,6 - jb NEAR $L$ctr_enc_done - movdqu xmm12,XMMWORD[80+r12] - pxor xmm6,xmm12 - movdqu XMMWORD[80+r13],xmm6 - je NEAR $L$ctr_enc_done - movdqu xmm13,XMMWORD[96+r12] - pxor xmm1,xmm13 - movdqu XMMWORD[96+r13],xmm1 - - - -$L$ctr_enc_done: - lea rax,[rsp] - pxor xmm0,xmm0 -$L$ctr_enc_bzero: - movdqa XMMWORD[rax],xmm0 - movdqa XMMWORD[16+rax],xmm0 - lea rax,[32+rax] - cmp rbp,rax - ja NEAR $L$ctr_enc_bzero - - lea rax,[120+rbp] - - movaps xmm6,XMMWORD[64+rbp] - movaps xmm7,XMMWORD[80+rbp] - movaps xmm8,XMMWORD[96+rbp] - movaps xmm9,XMMWORD[112+rbp] - movaps xmm10,XMMWORD[128+rbp] - movaps xmm11,XMMWORD[144+rbp] - movaps xmm12,XMMWORD[160+rbp] - movaps xmm13,XMMWORD[176+rbp] - movaps xmm14,XMMWORD[192+rbp] - movaps xmm15,XMMWORD[208+rbp] - lea rax,[160+rax] -$L$ctr_enc_tail: - mov r15,QWORD[((-48))+rax] - - mov r14,QWORD[((-40))+rax] - - mov r13,QWORD[((-32))+rax] - - mov r12,QWORD[((-24))+rax] - - mov rbx,QWORD[((-16))+rax] - - mov rbp,QWORD[((-8))+rax] - - lea rsp,[rax] - -$L$ctr_enc_epilogue: - DB 0F3h,0C3h ;repret - - - -ALIGN 64 -_bsaes_const: -$L$M0ISR: - DQ 0x0a0e0206070b0f03,0x0004080c0d010509 -$L$ISRM0: - DQ 0x01040b0e0205080f,0x0306090c00070a0d -$L$ISR: - DQ 0x0504070602010003,0x0f0e0d0c080b0a09 -$L$BS0: - DQ 0x5555555555555555,0x5555555555555555 -$L$BS1: - DQ 0x3333333333333333,0x3333333333333333 -$L$BS2: - DQ 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f -$L$SR: - DQ 0x0504070600030201,0x0f0e0d0c0a09080b -$L$SRM0: - DQ 0x0304090e00050a0f,0x01060b0c0207080d -$L$M0SR: - DQ 0x0a0e02060f03070b,0x0004080c05090d01 -$L$SWPUP: - DQ 0x0706050403020100,0x0c0d0e0f0b0a0908 -$L$SWPUPM0SR: - DQ 0x0a0d02060c03070b,0x0004080f05090e01 -$L$ADD1: - DQ 0x0000000000000000,0x0000000100000000 -$L$ADD2: - DQ 0x0000000000000000,0x0000000200000000 -$L$ADD3: - DQ 0x0000000000000000,0x0000000300000000 -$L$ADD4: - DQ 0x0000000000000000,0x0000000400000000 -$L$ADD5: - DQ 0x0000000000000000,0x0000000500000000 -$L$ADD6: - DQ 0x0000000000000000,0x0000000600000000 -$L$ADD7: - DQ 0x0000000000000000,0x0000000700000000 -$L$ADD8: - DQ 0x0000000000000000,0x0000000800000000 -$L$xts_magic: - DD 0x87,0,1,0 -$L$masks: - DQ 0x0101010101010101,0x0101010101010101 - DQ 0x0202020202020202,0x0202020202020202 - DQ 0x0404040404040404,0x0404040404040404 - DQ 0x0808080808080808,0x0808080808080808 -$L$M0: - DQ 0x02060a0e03070b0f,0x0004080c0105090d -$L$63: - DQ 0x6363636363636363,0x6363636363636363 -DB 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102 -DB 111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44 -DB 32,69,109,105,108,105,97,32,75,195,164,115,112,101,114,44 -DB 32,80,101,116,101,114,32,83,99,104,119,97,98,101,44,32 -DB 65,110,100,121,32,80,111,108,121,97,107,111,118,0 -ALIGN 64 - -EXTERN __imp_RtlVirtualUnwind - -ALIGN 16 -se_handler: - push rsi - push rdi - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[120+r8] - mov rbx,QWORD[248+r8] - - mov rsi,QWORD[8+r9] - mov r11,QWORD[56+r9] - - mov r10d,DWORD[r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jbe NEAR $L$in_prologue - - mov r10d,DWORD[4+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$in_prologue - - mov r10d,DWORD[8+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$in_tail - - mov rax,QWORD[160+r8] - - lea rsi,[64+rax] - lea rdi,[512+r8] - mov ecx,20 - DD 0xa548f3fc - lea rax,[((160+120))+rax] - -$L$in_tail: - mov rbp,QWORD[((-48))+rax] - mov rbx,QWORD[((-40))+rax] - mov r12,QWORD[((-32))+rax] - mov r13,QWORD[((-24))+rax] - mov r14,QWORD[((-16))+rax] - mov r15,QWORD[((-8))+rax] - mov QWORD[144+r8],rbx - mov QWORD[160+r8],rbp - mov QWORD[216+r8],r12 - mov QWORD[224+r8],r13 - mov QWORD[232+r8],r14 - mov QWORD[240+r8],r15 - -$L$in_prologue: - mov QWORD[152+r8],rax - - mov rdi,QWORD[40+r9] - mov rsi,r8 - mov ecx,154 - DD 0xa548f3fc - - mov rsi,r9 - xor rcx,rcx - mov rdx,QWORD[8+rsi] - mov r8,QWORD[rsi] - mov r9,QWORD[16+rsi] - mov r10,QWORD[40+rsi] - lea r11,[56+rsi] - lea r12,[24+rsi] - mov QWORD[32+rsp],r10 - mov QWORD[40+rsp],r11 - mov QWORD[48+rsp],r12 - mov QWORD[56+rsp],rcx - call QWORD[__imp_RtlVirtualUnwind] - - mov eax,1 - add rsp,64 - popfq - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx - pop rdi - pop rsi - DB 0F3h,0C3h ;repret - - -section .pdata rdata align=4 -ALIGN 4 - DD $L$cbc_dec_prologue wrt ..imagebase - DD $L$cbc_dec_epilogue wrt ..imagebase - DD $L$cbc_dec_info wrt ..imagebase - - DD $L$ctr_enc_prologue wrt ..imagebase - DD $L$ctr_enc_epilogue wrt ..imagebase - DD $L$ctr_enc_info wrt ..imagebase - -section .xdata rdata align=8 -ALIGN 8 -$L$cbc_dec_info: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$cbc_dec_body wrt ..imagebase,$L$cbc_dec_epilogue wrt ..imagebase - DD $L$cbc_dec_tail wrt ..imagebase - DD 0 -$L$ctr_enc_info: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$ctr_enc_body wrt ..imagebase,$L$ctr_enc_epilogue wrt ..imagebase - DD $L$ctr_enc_tail wrt ..imagebase - DD 0 diff --git a/win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm b/win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm index 593d166c..10092c8a 100644 --- a/win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm +++ b/win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm @@ -120,6 +120,181 @@ DB 102,15,56,0,193 + + + + + + + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_encrypt_core_2x: + + mov r9,rdx + mov r11,16 + mov eax,DWORD[240+rdx] + movdqa xmm1,xmm9 + movdqa xmm7,xmm9 + movdqa xmm2,XMMWORD[$L$k_ipt] + movdqa xmm8,xmm2 + pandn xmm1,xmm0 + pandn xmm7,xmm6 + movdqu xmm5,XMMWORD[r9] + + psrld xmm1,4 + psrld xmm7,4 + pand xmm0,xmm9 + pand xmm6,xmm9 +DB 102,15,56,0,208 +DB 102,68,15,56,0,198 + movdqa xmm0,XMMWORD[(($L$k_ipt+16))] + movdqa xmm6,xmm0 +DB 102,15,56,0,193 +DB 102,15,56,0,247 + pxor xmm2,xmm5 + pxor xmm8,xmm5 + add r9,16 + pxor xmm0,xmm2 + pxor xmm6,xmm8 + lea r10,[$L$k_mc_backward] + jmp NEAR $L$enc2x_entry + +ALIGN 16 +$L$enc2x_loop: + + movdqa xmm4,XMMWORD[$L$k_sb1] + movdqa xmm0,XMMWORD[(($L$k_sb1+16))] + movdqa xmm12,xmm4 + movdqa xmm6,xmm0 +DB 102,15,56,0,226 +DB 102,69,15,56,0,224 +DB 102,15,56,0,195 +DB 102,65,15,56,0,243 + pxor xmm4,xmm5 + pxor xmm12,xmm5 + movdqa xmm5,XMMWORD[$L$k_sb2] + movdqa xmm13,xmm5 + pxor xmm0,xmm4 + pxor xmm6,xmm12 + movdqa xmm1,XMMWORD[((-64))+r10*1+r11] + +DB 102,15,56,0,234 +DB 102,69,15,56,0,232 + movdqa xmm4,XMMWORD[r10*1+r11] + + movdqa xmm2,XMMWORD[(($L$k_sb2+16))] + movdqa xmm8,xmm2 +DB 102,15,56,0,211 +DB 102,69,15,56,0,195 + movdqa xmm3,xmm0 + movdqa xmm11,xmm6 + pxor xmm2,xmm5 + pxor xmm8,xmm13 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + add r9,16 + pxor xmm0,xmm2 + pxor xmm6,xmm8 +DB 102,15,56,0,220 +DB 102,68,15,56,0,220 + add r11,16 + pxor xmm3,xmm0 + pxor xmm11,xmm6 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + and r11,0x30 + sub rax,1 + pxor xmm0,xmm3 + pxor xmm6,xmm11 + +$L$enc2x_entry: + + movdqa xmm1,xmm9 + movdqa xmm7,xmm9 + movdqa xmm5,XMMWORD[(($L$k_inv+16))] + movdqa xmm13,xmm5 + pandn xmm1,xmm0 + pandn xmm7,xmm6 + psrld xmm1,4 + psrld xmm7,4 + pand xmm0,xmm9 + pand xmm6,xmm9 +DB 102,15,56,0,232 +DB 102,68,15,56,0,238 + movdqa xmm3,xmm10 + movdqa xmm11,xmm10 + pxor xmm0,xmm1 + pxor xmm6,xmm7 +DB 102,15,56,0,217 +DB 102,68,15,56,0,223 + movdqa xmm4,xmm10 + movdqa xmm12,xmm10 + pxor xmm3,xmm5 + pxor xmm11,xmm13 +DB 102,15,56,0,224 +DB 102,68,15,56,0,230 + movdqa xmm2,xmm10 + movdqa xmm8,xmm10 + pxor xmm4,xmm5 + pxor xmm12,xmm13 +DB 102,15,56,0,211 +DB 102,69,15,56,0,195 + movdqa xmm3,xmm10 + movdqa xmm11,xmm10 + pxor xmm2,xmm0 + pxor xmm8,xmm6 +DB 102,15,56,0,220 +DB 102,69,15,56,0,220 + movdqu xmm5,XMMWORD[r9] + + pxor xmm3,xmm1 + pxor xmm11,xmm7 + jnz NEAR $L$enc2x_loop + + + movdqa xmm4,XMMWORD[((-96))+r10] + movdqa xmm0,XMMWORD[((-80))+r10] + movdqa xmm12,xmm4 + movdqa xmm6,xmm0 +DB 102,15,56,0,226 +DB 102,69,15,56,0,224 + pxor xmm4,xmm5 + pxor xmm12,xmm5 +DB 102,15,56,0,195 +DB 102,65,15,56,0,243 + movdqa xmm1,XMMWORD[64+r10*1+r11] + + pxor xmm0,xmm4 + pxor xmm6,xmm12 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + DB 0F3h,0C3h ;repret + + + + + + + + + ALIGN 16 _vpaes_decrypt_core: @@ -929,6 +1104,105 @@ $L$cbc_abort: DB 0F3h,0C3h ;repret $L$SEH_end_vpaes_cbc_encrypt: +global vpaes_ctr32_encrypt_blocks + +ALIGN 16 +vpaes_ctr32_encrypt_blocks: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_vpaes_ctr32_encrypt_blocks: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + + + xchg rdx,rcx + test rcx,rcx + jz NEAR $L$ctr32_abort + lea rsp,[((-184))+rsp] + movaps XMMWORD[16+rsp],xmm6 + movaps XMMWORD[32+rsp],xmm7 + movaps XMMWORD[48+rsp],xmm8 + movaps XMMWORD[64+rsp],xmm9 + movaps XMMWORD[80+rsp],xmm10 + movaps XMMWORD[96+rsp],xmm11 + movaps XMMWORD[112+rsp],xmm12 + movaps XMMWORD[128+rsp],xmm13 + movaps XMMWORD[144+rsp],xmm14 + movaps XMMWORD[160+rsp],xmm15 +$L$ctr32_body: + movdqu xmm0,XMMWORD[r8] + movdqa xmm8,XMMWORD[$L$ctr_add_one] + sub rsi,rdi + call _vpaes_preheat + movdqa xmm6,xmm0 + pshufb xmm6,XMMWORD[$L$rev_ctr] + + test rcx,1 + jz NEAR $L$ctr32_prep_loop + + + + movdqu xmm7,XMMWORD[rdi] + call _vpaes_encrypt_core + pxor xmm0,xmm7 + paddd xmm6,xmm8 + movdqu XMMWORD[rdi*1+rsi],xmm0 + sub rcx,1 + lea rdi,[16+rdi] + jz NEAR $L$ctr32_done + +$L$ctr32_prep_loop: + + + movdqa xmm14,xmm6 + movdqa xmm15,xmm6 + paddd xmm15,xmm8 + +$L$ctr32_loop: + movdqa xmm1,XMMWORD[$L$rev_ctr] + movdqa xmm0,xmm14 + movdqa xmm6,xmm15 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + call _vpaes_encrypt_core_2x + movdqu xmm1,XMMWORD[rdi] + movdqu xmm2,XMMWORD[16+rdi] + movdqa xmm3,XMMWORD[$L$ctr_add_two] + pxor xmm0,xmm1 + pxor xmm6,xmm2 + paddd xmm14,xmm3 + paddd xmm15,xmm3 + movdqu XMMWORD[rdi*1+rsi],xmm0 + movdqu XMMWORD[16+rdi*1+rsi],xmm6 + sub rcx,2 + lea rdi,[32+rdi] + jnz NEAR $L$ctr32_loop + +$L$ctr32_done: + movaps xmm6,XMMWORD[16+rsp] + movaps xmm7,XMMWORD[32+rsp] + movaps xmm8,XMMWORD[48+rsp] + movaps xmm9,XMMWORD[64+rsp] + movaps xmm10,XMMWORD[80+rsp] + movaps xmm11,XMMWORD[96+rsp] + movaps xmm12,XMMWORD[112+rsp] + movaps xmm13,XMMWORD[128+rsp] + movaps xmm14,XMMWORD[144+rsp] + movaps xmm15,XMMWORD[160+rsp] + lea rsp,[184+rsp] +$L$ctr32_epilogue: +$L$ctr32_abort: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_vpaes_ctr32_encrypt_blocks: @@ -1051,6 +1325,17 @@ $L$k_dsbe: $L$k_dsbo: DQ 0x1387EA537EF94000,0xC7AA6DB9D4943E2D DQ 0x12D7560F93441D00,0xCA4B8159D8C58E9C + + +$L$rev_ctr: + DQ 0x0706050403020100,0x0c0d0e0f0b0a0908 + + +$L$ctr_add_one: + DQ 0x0000000000000000,0x0000000100000000 +$L$ctr_add_two: + DQ 0x0000000000000000,0x0000000200000000 + DB 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 DB 111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54 DB 52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97 @@ -1159,6 +1444,10 @@ ALIGN 4 DD $L$SEH_end_vpaes_cbc_encrypt wrt ..imagebase DD $L$SEH_info_vpaes_cbc_encrypt wrt ..imagebase + DD $L$SEH_begin_vpaes_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_end_vpaes_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_info_vpaes_ctr32_encrypt_blocks wrt ..imagebase + section .xdata rdata align=8 ALIGN 8 $L$SEH_info_vpaes_set_encrypt_key: @@ -1181,3 +1470,7 @@ $L$SEH_info_vpaes_cbc_encrypt: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$cbc_body wrt ..imagebase,$L$cbc_epilogue wrt ..imagebase +$L$SEH_info_vpaes_ctr32_encrypt_blocks: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase |