| author | Robert Sloan <varomodt@google.com> | 2019-03-12 14:24:00 -0700 |
|---|---|---|
| committer | Robert Sloan <varomodt@google.com> | 2019-03-12 14:25:56 -0700 |
| commit | 8967815a10ba914661a880832dbd2eb854f2ac11 (patch) | |
| tree | 33149c81966ec37580b1de90968b0c94343148e4 /src/crypto | |
| parent | 4c22c5fad19b2a554bcb056ca25ca4cc2ef6a45c (diff) | |
| download | boringssl-8967815a10ba914661a880832dbd2eb854f2ac11.tar.gz | |
external/boringssl: Sync to 35941f2923155664bd9fa5d897cb336a0ab729a1.
This includes the following changes:
https://boringssl.googlesource.com/boringssl/+log/c3889634a1aa52575c5d26497696238208fbd0f5..35941f2923155664bd9fa5d897cb336a0ab729a1
Test: atest CtsLibcoreTestCases
Change-Id: I7c3352e0ce7c9e236d75c1b0beb580012db2d14d
Diffstat (limited to 'src/crypto')
-rw-r--r-- | src/crypto/cipher_extra/e_aesccm.c | 2
-rw-r--r-- | src/crypto/cipher_extra/e_aesgcmsiv.c | 4
-rw-r--r-- | src/crypto/fipsmodule/CMakeLists.txt | 4
-rw-r--r-- | src/crypto/fipsmodule/aes/aes_test.cc | 3
-rwxr-xr-x | src/crypto/fipsmodule/aes/asm/vpaes-armv8.pl | 1362
-rw-r--r-- | src/crypto/fipsmodule/aes/internal.h | 16
-rw-r--r-- | src/crypto/fipsmodule/cipher/e_aes.c | 14
-rw-r--r-- | src/crypto/fipsmodule/ec/ec_key.c | 2
-rw-r--r-- | src/crypto/fipsmodule/ec/ec_test.cc | 14
-rw-r--r-- | src/crypto/fipsmodule/modes/asm/ghash-armv4.pl | 9
-rw-r--r-- | src/crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl | 288
-rw-r--r-- | src/crypto/fipsmodule/modes/gcm.c | 10
-rw-r--r-- | src/crypto/fipsmodule/modes/gcm_test.cc | 4
-rw-r--r-- | src/crypto/fipsmodule/modes/internal.h | 14
-rw-r--r-- | src/crypto/mem.c | 16
15 files changed, 1736 insertions, 26 deletions
diff --git a/src/crypto/cipher_extra/e_aesccm.c b/src/crypto/cipher_extra/e_aesccm.c index 144a9093..4e6668c0 100644 --- a/src/crypto/cipher_extra/e_aesccm.c +++ b/src/crypto/cipher_extra/e_aesccm.c @@ -67,7 +67,7 @@ static int aead_aes_ccm_init(EVP_AEAD_CTX *ctx, const uint8_t *key, block128_f block; ctr128_f ctr = aes_ctr_set_key(&ccm_ctx->ks.ks, NULL, &block, key, key_len, - 1 /* large inputs */); + 0 /* small inputs */); ctx->tag_len = tag_len; if (!CRYPTO_ccm128_init(&ccm_ctx->ccm, &ccm_ctx->ks.ks, block, ctr, M, L)) { OPENSSL_PUT_ERROR(CIPHER, ERR_R_INTERNAL_ERROR); diff --git a/src/crypto/cipher_extra/e_aesgcmsiv.c b/src/crypto/cipher_extra/e_aesgcmsiv.c index 0e5063cd..fb08a428 100644 --- a/src/crypto/cipher_extra/e_aesgcmsiv.c +++ b/src/crypto/cipher_extra/e_aesgcmsiv.c @@ -595,7 +595,7 @@ static int aead_aes_gcm_siv_init(EVP_AEAD_CTX *ctx, const uint8_t *key, OPENSSL_memset(gcm_siv_ctx, 0, sizeof(struct aead_aes_gcm_siv_ctx)); aes_ctr_set_key(&gcm_siv_ctx->ks.ks, NULL, &gcm_siv_ctx->kgk_block, key, - key_len, 1 /* large inputs */); + key_len, 0 /* small inputs */); gcm_siv_ctx->is_256 = (key_len == 32); ctx->tag_len = tag_len; @@ -720,7 +720,7 @@ static void gcm_siv_keys( OPENSSL_memcpy(out_keys->auth_key, key_material, 16); aes_ctr_set_key(&out_keys->enc_key.ks, NULL, &out_keys->enc_block, key_material + 16, gcm_siv_ctx->is_256 ? 32 : 16, - 1 /* large inputs */); + 0 /* small inputs */); } static int aead_aes_gcm_siv_seal_scatter( diff --git a/src/crypto/fipsmodule/CMakeLists.txt b/src/crypto/fipsmodule/CMakeLists.txt index b459263c..09d210bf 100644 --- a/src/crypto/fipsmodule/CMakeLists.txt +++ b/src/crypto/fipsmodule/CMakeLists.txt @@ -32,6 +32,7 @@ if(${ARCH} STREQUAL "x86") aesni-x86.${ASM_EXT} bn-586.${ASM_EXT} co-586.${ASM_EXT} + ghash-ssse3-x86.${ASM_EXT} ghash-x86.${ASM_EXT} md5-586.${ASM_EXT} sha1-586.${ASM_EXT} @@ -68,6 +69,7 @@ if(${ARCH} STREQUAL "aarch64") sha1-armv8.${ASM_EXT} sha256-armv8.${ASM_EXT} sha512-armv8.${ASM_EXT} + vpaes-armv8.${ASM_EXT} ) endif() @@ -98,6 +100,7 @@ perlasm(ghash-armv4.${ASM_EXT} modes/asm/ghash-armv4.pl) perlasm(ghashp8-ppc.${ASM_EXT} modes/asm/ghashp8-ppc.pl) perlasm(ghashv8-armx.${ASM_EXT} modes/asm/ghashv8-armx.pl) perlasm(ghash-ssse3-x86_64.${ASM_EXT} modes/asm/ghash-ssse3-x86_64.pl) +perlasm(ghash-ssse3-x86.${ASM_EXT} modes/asm/ghash-ssse3-x86.pl) perlasm(ghash-x86_64.${ASM_EXT} modes/asm/ghash-x86_64.pl) perlasm(ghash-x86.${ASM_EXT} modes/asm/ghash-x86.pl) perlasm(md5-586.${ASM_EXT} md5/asm/md5-586.pl) @@ -118,6 +121,7 @@ perlasm(sha512-586.${ASM_EXT} sha/asm/sha512-586.pl) perlasm(sha512-armv4.${ASM_EXT} sha/asm/sha512-armv4.pl) perlasm(sha512-armv8.${ASM_EXT} sha/asm/sha512-armv8.pl) perlasm(sha512-x86_64.${ASM_EXT} sha/asm/sha512-x86_64.pl) +perlasm(vpaes-armv8.${ASM_EXT} aes/asm/vpaes-armv8.pl) perlasm(vpaes-x86_64.${ASM_EXT} aes/asm/vpaes-x86_64.pl) perlasm(vpaes-x86.${ASM_EXT} aes/asm/vpaes-x86.pl) perlasm(x86_64-mont5.${ASM_EXT} bn/asm/x86_64-mont5.pl) diff --git a/src/crypto/fipsmodule/aes/aes_test.cc b/src/crypto/fipsmodule/aes/aes_test.cc index a0c94114..2222b63d 100644 --- a/src/crypto/fipsmodule/aes/aes_test.cc +++ b/src/crypto/fipsmodule/aes/aes_test.cc @@ -250,6 +250,9 @@ TEST(AESTest, ABI) { SCOPED_TRACE(blocks); CHECK_ABI(vpaes_cbc_encrypt, buf, buf, AES_BLOCK_SIZE * blocks, &key, block, AES_ENCRYPT); +#if defined(VPAES_CTR32) + CHECK_ABI(vpaes_ctr32_encrypt_blocks, buf, buf, blocks, &key, block); +#endif } CHECK_ABI(vpaes_set_decrypt_key, kKey, bits, &key); diff --git a/src/crypto/fipsmodule/aes/asm/vpaes-armv8.pl 
b/src/crypto/fipsmodule/aes/asm/vpaes-armv8.pl new file mode 100755 index 00000000..5fa06d83 --- /dev/null +++ b/src/crypto/fipsmodule/aes/asm/vpaes-armv8.pl @@ -0,0 +1,1362 @@ +#! /usr/bin/env perl +# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. +## +###################################################################### +# ARMv8 NEON adaptation by <appro@openssl.org> +# +# Reason for undertaken effort is that there is at least one popular +# SoC based on Cortex-A53 that doesn't have crypto extensions. +# +# CBC enc ECB enc/dec(*) [bit-sliced enc/dec] +# Cortex-A53 21.5 18.1/20.6 [17.5/19.8 ] +# Cortex-A57 36.0(**) 20.4/24.9(**) [14.4/16.6 ] +# X-Gene 45.9(**) 45.8/57.7(**) [33.1/37.6(**) ] +# Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ] +# Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ] +# Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ] +# +# (*) ECB denotes approximate result for parallelizable modes +# such as CBC decrypt, CTR, etc.; +# (**) these results are worse than scalar compiler-generated +# code, but it's constant-time and therefore preferred; +# (***) presented for reference/comparison purposes; + +$flavour = shift; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +$code.=<<___; +.section .rodata + +.type _vpaes_consts,%object +.align 7 // totally strategic alignment +_vpaes_consts: +.Lk_mc_forward: // mc_forward + .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 + .quad 0x080B0A0904070605, 0x000302010C0F0E0D + .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 + .quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward:// mc_backward + .quad 0x0605040702010003, 0x0E0D0C0F0A09080B + .quad 0x020100030E0D0C0F, 0x0A09080B06050407 + .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 + .quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr: // sr + .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 + .quad 0x030E09040F0A0500, 0x0B06010C07020D08 + .quad 0x0F060D040B020900, 0x070E050C030A0108 + .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +.Lk_inv: // inv, inva + .quad 0x0E05060F0D080180, 0x040703090A0B0C02 + .quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt: // input transform (lo, hi) + .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 + .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo: // sbou, sbot + .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 + .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1: // sb1u, sb1t + .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF + .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2: // sb2u, sb2t + .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A + .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Decryption stuff +// +.Lk_dipt: // decryption input transform + .quad 
0x0F505B040B545F00, 0x154A411E114E451A + .quad 0x86E383E660056500, 0x12771772F491F194 +.Lk_dsbo: // decryption sbox final output + .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D + .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.Lk_dsb9: // decryption sbox output *9*u, *9*t + .quad 0x851C03539A86D600, 0xCAD51F504F994CC9 + .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd: // decryption sbox output *D*u, *D*t + .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 + .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: // decryption sbox output *B*u, *B*t + .quad 0xD022649296B44200, 0x602646F6B0F2D404 + .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: // decryption sbox output *E*u, *E*t + .quad 0x46F2929626D4D000, 0x2242600464B4F6B0 + .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 + +// +// Key schedule constants +// +.Lk_dksd: // decryption key schedule: invskew x*D + .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 + .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: // decryption key schedule: invskew x*B + .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 + .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: // decryption key schedule: invskew x*E + 0x63 + .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 + .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: // decryption key schedule: invskew x*9 + .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC + .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +.Lk_rcon: // rcon + .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt: // output transform + .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 + .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew: // deskew tables: inverts the sbox's "skew" + .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A + .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.asciz "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)" +.size _vpaes_consts,.-_vpaes_consts +.align 6 + +.text +___ + +{ +my ($inp,$out,$key) = map("x$_",(0..2)); + +my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23)); +my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27)); +my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31)); + +$code.=<<___; +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## +.type _vpaes_encrypt_preheat,%function +.align 4 +_vpaes_encrypt_preheat: + adrp x10, :pg_hi21:.Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo + ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 + ret +.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. 
+## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + adrp x11, :pg_hi21:.Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Lenc_entry + +.align 4 +.Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +.Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +.globl vpaes_encrypt +.type vpaes_encrypt,%function +.align 4 +vpaes_encrypt: + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v7.16b}, [$inp] + bl _vpaes_encrypt_preheat + bl _vpaes_encrypt_core + st1 {v0.16b}, [$out] + + ldp x29,x30,[sp],#16 + ret +.size vpaes_encrypt,.-vpaes_encrypt + +.type _vpaes_encrypt_2x,%function +.align 4 +_vpaes_encrypt_2x: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + adrp x11, :pg_hi21:.Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {$iptlo}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {$ipthi}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Lenc_2x_entry + +.align 4 +.Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {$sb1t}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {$sb1u}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {$sb2t}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {$sb2u}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +.Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {$invhi},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {$invlo},v8.16b + tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {$invlo},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {$invlo},v11.16b + tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {$invlo},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {$sbou}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {$sbot}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret +.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x + +.type _vpaes_decrypt_preheat,%function +.align 4 +_vpaes_decrypt_preheat: + adrp x10, :pg_hi21:.Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v17.16b, #0x0f + adrp x11, :pg_hi21:.Lk_dipt + add x11, x11, :lo12:.Lk_dipt + ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d-v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo + ld1 {v24.2d-v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd + ld1 {v28.2d-v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe + ret +.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat + +## +## Decryption core +## +## Same API as encryption core. 
+## +.type _vpaes_decrypt_core,%function +.align 4 +_vpaes_decrypt_core: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11 + eor x11, x11, #0x30 // xor \$0x30, %r11 + adrp x10, :pg_hi21:.Lk_sr + add x10, x10, :lo12:.Lk_sr + and x11, x11, #0x30 // and \$0x30, %r11 + add x11, x11, x10 + adrp x10, :pg_hi21:.Lk_mc_forward+48 + add x10, x10, :lo12:.Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Ldec_entry + +.align 4 +.Ldec_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + sub w8, w8, #1 // sub \$1,%rax # nr-- + +.Ldec_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {$invlo}, v4.16b // vpshufb 
%xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, .Ldec_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + +.globl vpaes_decrypt +.type vpaes_decrypt,%function +.align 4 +vpaes_decrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v7.16b}, [$inp] + bl _vpaes_decrypt_preheat + bl _vpaes_decrypt_core + st1 {v0.16b}, [$out] + + ldp x29,x30,[sp],#16 + ret +.size vpaes_decrypt,.-vpaes_decrypt + +// v14-v15 input, v0-v1 output +.type _vpaes_decrypt_2x,%function +.align 4 +_vpaes_decrypt_2x: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11 + eor x11, x11, #0x30 // xor \$0x30, %r11 + adrp x10, :pg_hi21:.Lk_sr + add x10, x10, :lo12:.Lk_sr + and x11, x11, #0x30 // and \$0x30, %r11 + add x11, x11, x10 + adrp x10, :pg_hi21:.Lk_mc_forward+48 + add x10, x10, :lo12:.Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v2.16b, {$iptlo},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + tbl v10.16b, {$iptlo},v9.16b + ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {$ipthi},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + tbl v8.16b, {$ipthi},v8.16b + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v10.16b, v10.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Ldec_2x_entry + +.align 4 +.Ldec_2x_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v12.16b, {$sb9u}, v10.16b + tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + tbl v9.16b, {$sb9t}, v11.16b + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + eor v8.16b, v12.16b, v16.16b + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v12.16b, {$sbdu}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + tbl v9.16b, {$sbdt}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 
0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v12.16b, {$sbbu}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + tbl v9.16b, {$sbbt}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v12.16b, {$sbeu}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + tbl v9.16b, {$sbet}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + sub w8, w8, #1 // sub \$1,%rax # nr-- + +.Ldec_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v2.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + tbl v10.16b, {$invhi},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {$invlo},v8.16b + tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {$invlo},v9.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v10.16b + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v10.16b + tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {$invlo},v11.16b + tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {$invlo},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, .Ldec_2x_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {$sbou}, v10.16b + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + tbl v9.16b, {$sbot}, v11.16b + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + eor v8.16b, v9.16b, v12.16b + tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v2.16b + ret +.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x +___ +} +{ +my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3"); +my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8)); + +$code.=<<___; +######################################################## +## ## +## AES key schedule ## +## ## 
+######################################################## +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adrp x10, :pg_hi21:.Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v16.16b, #0x5b // .Lk_s63 + adrp x11, :pg_hi21:.Lk_sb1 + add x11, x11, :lo12:.Lk_sb1 + movi v17.16b, #0x0f // .Lk_s0F + ld1 {v18.2d-v21.2d}, [x10] // .Lk_inv, .Lk_ipt + adrp x10, :pg_hi21:.Lk_dksd + add x10, x10, :lo12:.Lk_dksd + ld1 {v22.2d-v23.2d}, [x11] // .Lk_sb1 + adrp x11, :pg_hi21:.Lk_mc_forward + add x11, x11, :lo12:.Lk_mc_forward + ld1 {v24.2d-v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb + ld1 {v28.2d-v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 + ld1 {v8.2d}, [x10] // .Lk_rcon + ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] + ret +.size _vpaes_key_preheat,.-_vpaes_key_preheat + +.type _vpaes_schedule_core,%function +.align 4 +_vpaes_schedule_core: + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [$inp],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, :pg_hi21:.Lk_sr // lea .Lk_sr(%rip),%r10 + add x10, x10, :lo12:.Lk_sr + + add x8, x8, x10 + cbnz $dir, .Lschedule_am_decrypting + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) + b .Lschedule_go + +.Lschedule_am_decrypting: + // decrypting, output zeroth round key after shiftrows + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx) + eor x8, x8, #0x30 // xor \$0x30, %r8 + +.Lschedule_go: + cmp $bits, #192 // cmp \$192, %esi + b.hi .Lschedule_256 + b.eq .Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +.Lschedule_128: + mov $inp, #10 // mov \$10, %esi + +.Loop_schedule_128: + sub $inp, $inp, #1 // dec %esi + bl _vpaes_schedule_round + cbz $inp, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b .Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. 
+## +.align 4 +.Lschedule_192: + sub $inp, $inp, #8 + ld1 {v0.16b}, [$inp] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov $inp, #4 // mov \$4, %esi + +.Loop_schedule_192: + sub $inp, $inp, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr \$8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz $inp, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b .Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 4 +.Lschedule_256: + ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov $inp, #7 // mov \$7, %esi + +.Loop_schedule_256: + sub $inp, $inp, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz $inp, .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... 
jumps to cleanup and exits +## +.align 4 +.Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, :pg_hi21:.Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, :lo12:.Lk_deskew + + cbnz $dir, .Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, :pg_hi21:.Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, :lo12:.Lk_opt + add $out, $out, #32 // add \$32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +.Lschedule_mangle_last_dec: + ld1 {v20.2d-v21.2d}, [x11] // reload constants + sub $out, $out, #16 // add \$-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.type _vpaes_schedule_192_smear,%function +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. +## +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, $rcon, v4.16b, #15 // vpalignr \$15, %xmm8, %xmm4, %xmm1 + ext $rcon, $rcon, $rcon, #15 // vpalignr \$15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr \$1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. 
+_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq \$4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq \$8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + cbnz $dir, .Lschedule_mangle_dec + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add $out, $out, #16 // add \$16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + + b .Lschedule_mangle_both +.align 4 +.Lschedule_mangle_dec: + // inverse mix columns + // lea .Lk_dksd(%rip),%r11 + ushr v1.16b, v4.16b, #4 // vpsrlb \$4, %xmm4, %xmm1 # 1 = hi + and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + // vmovdqa 0x00(%r11), %xmm2 + tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + // vmovdqa 0x10(%r11), %xmm3 + tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x20(%r11), %xmm2 + tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x30(%r11), %xmm3 + tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x40(%r11), %xmm2 + tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x50(%r11), %xmm3 + tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + + // vmovdqa 0x60(%r11), %xmm2 + tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + // vmovdqa 0x70(%r11), %xmm4 + tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 + + sub $out, $out, #16 // add \$-16, %rdx + +.Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #64-16 // add \$-16, %r8 + and x8, x8, #~(1<<6) // and \$0x30, %r8 + st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,%function +.align 4 +vpaes_set_encrypt_key: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! 
// ABI spec says so + + lsr w9, $bits, #5 // shr \$5,%eax + add w9, w9, #5 // \$5,%eax + str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov $dir, #0 // mov \$0,%ecx + mov x8, #0x30 // mov \$0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key + +.globl vpaes_set_decrypt_key +.type vpaes_set_decrypt_key,%function +.align 4 +vpaes_set_decrypt_key: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, $bits, #5 // shr \$5,%eax + add w9, w9, #5 // \$5,%eax + str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + lsl w9, w9, #4 // shl \$4,%eax + add $out, $out, #16 // lea 16(%rdx,%rax),%rdx + add $out, $out, x9 + + mov $dir, #1 // mov \$1,%ecx + lsr w8, $bits, #1 // shr \$1,%r8d + and x8, x8, #32 // and \$32,%r8d + eor x8, x8, #32 // xor \$32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key +___ +} +{ +my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5)); + +$code.=<<___; +.globl vpaes_cbc_encrypt +.type vpaes_cbc_encrypt,%function +.align 4 +vpaes_cbc_encrypt: + cbz $len, .Lcbc_abort + cmp w5, #0 // check direction + b.eq vpaes_cbc_decrypt + + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x17, $len // reassign + mov x2, $key // reassign + + ld1 {v0.16b}, [$ivec] // load ivec + bl _vpaes_encrypt_preheat + b .Lcbc_enc_loop + +.align 4 +.Lcbc_enc_loop: + ld1 {v7.16b}, [$inp],#16 // load input + eor v7.16b, v7.16b, v0.16b // xor with ivec + bl _vpaes_encrypt_core + st1 {v0.16b}, [$out],#16 // save output + subs x17, x17, #16 + b.hi .Lcbc_enc_loop + + st1 {v0.16b}, [$ivec] // write ivec + + ldp x29,x30,[sp],#16 +.Lcbc_abort: + ret +.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt + +.type vpaes_cbc_decrypt,%function +.align 4 +vpaes_cbc_decrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + mov x17, $len // reassign + mov x2, $key // reassign + ld1 {v6.16b}, [$ivec] // load ivec + bl _vpaes_decrypt_preheat + tst x17, #16 + b.eq .Lcbc_dec_loop2x + + ld1 {v7.16b}, [$inp], #16 // load input + bl _vpaes_decrypt_core + eor v0.16b, v0.16b, v6.16b // xor with ivec + orr v6.16b, v7.16b, v7.16b // next ivec value + st1 {v0.16b}, [$out], #16 + subs x17, x17, #16 + b.ls .Lcbc_dec_done + +.align 4 +.Lcbc_dec_loop2x: + ld1 {v14.16b,v15.16b}, [$inp], #32 + bl _vpaes_decrypt_2x + eor v0.16b, v0.16b, v6.16b // xor with ivec + eor v1.16b, v1.16b, v14.16b + orr v6.16b, v15.16b, v15.16b + st1 {v0.16b,v1.16b}, [$out], #32 + subs x17, x17, #32 + b.hi .Lcbc_dec_loop2x + +.Lcbc_dec_done: + st1 {v6.16b}, [$ivec] + + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt +___ +# We omit vpaes_ecb_* in BoringSSL. They are unused. +if (0) { +$code.=<<___; +.globl vpaes_ecb_encrypt +.type vpaes_ecb_encrypt,%function +.align 4 +vpaes_ecb_encrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! 
+ + mov x17, $len + mov x2, $key + bl _vpaes_encrypt_preheat + tst x17, #16 + b.eq .Lecb_enc_loop + + ld1 {v7.16b}, [$inp],#16 + bl _vpaes_encrypt_core + st1 {v0.16b}, [$out],#16 + subs x17, x17, #16 + b.ls .Lecb_enc_done + +.align 4 +.Lecb_enc_loop: + ld1 {v14.16b,v15.16b}, [$inp], #32 + bl _vpaes_encrypt_2x + st1 {v0.16b,v1.16b}, [$out], #32 + subs x17, x17, #32 + b.hi .Lecb_enc_loop + +.Lecb_enc_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt + +.globl vpaes_ecb_decrypt +.type vpaes_ecb_decrypt,%function +.align 4 +vpaes_ecb_decrypt: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + mov x17, $len + mov x2, $key + bl _vpaes_decrypt_preheat + tst x17, #16 + b.eq .Lecb_dec_loop + + ld1 {v7.16b}, [$inp],#16 + bl _vpaes_encrypt_core + st1 {v0.16b}, [$out],#16 + subs x17, x17, #16 + b.ls .Lecb_dec_done + +.align 4 +.Lecb_dec_loop: + ld1 {v14.16b,v15.16b}, [$inp], #32 + bl _vpaes_decrypt_2x + st1 {v0.16b,v1.16b}, [$out], #32 + subs x17, x17, #32 + b.hi .Lecb_dec_loop + +.Lecb_dec_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt +___ +} + +my ($ctr, $ctr_tmp) = ("w6", "w7"); + +# void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, +# const AES_KEY *key, const uint8_t ivec[16]); +$code.=<<___; +.globl vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,%function +.align 4 +vpaes_ctr32_encrypt_blocks: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz $len, .Lctr32_done + + // Note, unlike the other functions, $len here is measured in blocks, + // not bytes. + mov x17, $len + mov x2, $key + + // Load the IV and counter portion. + ldr $ctr, [$ivec, #12] + ld1 {v7.16b}, [$ivec] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev $ctr, $ctr // The counter is big-endian. + b.eq .Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [$inp], #16 // Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [$out], #16 + subs x17, x17, #1 + // Update the counter. + add $ctr, $ctr, #1 + rev $ctr_tmp, $ctr + mov v7.s[3], $ctr_tmp + b.ls .Lctr32_done + +.Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. + mov v15.16b, v7.16b + mov v14.16b, v7.16b + add $ctr, $ctr, #1 + rev $ctr_tmp, $ctr + mov v15.s[3], $ctr_tmp + +.Lctr32_loop: + ld1 {v6.16b,v7.16b}, [$inp], #32 // Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [$out], #32 + subs x17, x17, #2 + // Update the counter. 
+ add $ctr_tmp, $ctr, #1 + add $ctr, $ctr, #2 + rev $ctr_tmp, $ctr_tmp + mov v14.s[3], $ctr_tmp + rev $ctr_tmp, $ctr + mov v15.s[3], $ctr_tmp + b.hi .Lctr32_loop + +.Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + ret +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks +___ +} + +print $code; + +close STDOUT; diff --git a/src/crypto/fipsmodule/aes/internal.h b/src/crypto/fipsmodule/aes/internal.h index 0df30d9f..a05abcbf 100644 --- a/src/crypto/fipsmodule/aes/internal.h +++ b/src/crypto/fipsmodule/aes/internal.h @@ -35,13 +35,13 @@ OPENSSL_INLINE int hwaes_capable(void) { } #define VPAES -OPENSSL_INLINE char vpaes_capable(void) { +OPENSSL_INLINE int vpaes_capable(void) { return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0; } #if defined(OPENSSL_X86_64) #define BSAES -OPENSSL_INLINE char bsaes_capable(void) { return vpaes_capable(); } +OPENSSL_INLINE int bsaes_capable(void) { return vpaes_capable(); } #endif // X86_64 #elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) @@ -51,7 +51,13 @@ OPENSSL_INLINE int hwaes_capable(void) { return CRYPTO_is_ARMv8_AES_capable(); } #if defined(OPENSSL_ARM) #define BSAES -OPENSSL_INLINE char bsaes_capable(void) { return CRYPTO_is_NEON_capable(); } +OPENSSL_INLINE int bsaes_capable(void) { return CRYPTO_is_NEON_capable(); } +#endif + +#if defined(OPENSSL_AARCH64) +#define VPAES +#define VPAES_CTR32 +OPENSSL_INLINE int vpaes_capable(void) { return CRYPTO_is_NEON_capable(); } #endif #elif defined(OPENSSL_PPC64LE) @@ -162,6 +168,10 @@ void vpaes_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key); void vpaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length, const AES_KEY *key, uint8_t *ivec, int enc); +#if defined(VPAES_CTR32) +void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, const uint8_t ivec[16]); +#endif #else OPENSSL_INLINE char vpaes_capable(void) { return 0; } diff --git a/src/crypto/fipsmodule/cipher/e_aes.c b/src/crypto/fipsmodule/cipher/e_aes.c index 460deedd..51a1fb1c 100644 --- a/src/crypto/fipsmodule/cipher/e_aes.c +++ b/src/crypto/fipsmodule/cipher/e_aes.c @@ -143,7 +143,15 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, } else if (vpaes_capable()) { ret = vpaes_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks); dat->block = vpaes_encrypt; - dat->stream.cbc = mode == EVP_CIPH_CBC_MODE ? 
vpaes_cbc_encrypt : NULL; + dat->stream.cbc = NULL; + if (mode == EVP_CIPH_CBC_MODE) { + dat->stream.cbc = vpaes_cbc_encrypt; + } +#if defined(VPAES_CTR32) + if (mode == EVP_CIPH_CTR_MODE) { + dat->stream.ctr = vpaes_ctr32_encrypt_blocks; + } +#endif } else { ret = aes_nohw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks); dat->block = aes_nohw_encrypt; @@ -253,7 +261,11 @@ ctr128_f aes_ctr_set_key(AES_KEY *aes_key, GCM128_KEY *gcm_key, if (gcm_key != NULL) { CRYPTO_gcm128_init_key(gcm_key, aes_key, vpaes_encrypt, 0); } +#if defined(VPAES_CTR32) + return vpaes_ctr32_encrypt_blocks; +#else return NULL; +#endif } aes_nohw_set_encrypt_key(key, key_bytes * 8, aes_key); diff --git a/src/crypto/fipsmodule/ec/ec_key.c b/src/crypto/fipsmodule/ec/ec_key.c index 04650ed7..3ef17d99 100644 --- a/src/crypto/fipsmodule/ec/ec_key.c +++ b/src/crypto/fipsmodule/ec/ec_key.c @@ -267,7 +267,7 @@ int EC_KEY_set_public_key(EC_KEY *key, const EC_POINT *pub_key) { return 0; } - if (EC_GROUP_cmp(key->group, pub_key->group, NULL) != 0) { + if (pub_key != NULL && EC_GROUP_cmp(key->group, pub_key->group, NULL) != 0) { OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH); return 0; } diff --git a/src/crypto/fipsmodule/ec/ec_test.cc b/src/crypto/fipsmodule/ec/ec_test.cc index 97c6d450..dd4c75ae 100644 --- a/src/crypto/fipsmodule/ec/ec_test.cc +++ b/src/crypto/fipsmodule/ec/ec_test.cc @@ -347,6 +347,20 @@ TEST(ECTest, SetKeyWithoutGroup) { EC_KEY_set_public_key(key.get(), EC_GROUP_get0_generator(group.get()))); } +TEST(ECTest, SetNULLKey) { + bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(NID_X9_62_prime256v1)); + ASSERT_TRUE(key); + + EXPECT_TRUE(EC_KEY_set_public_key( + key.get(), EC_GROUP_get0_generator(EC_KEY_get0_group(key.get())))); + EXPECT_TRUE(EC_KEY_get0_public_key(key.get())); + + // Setting a NULL public-key should clear the public-key and return zero, in + // order to match OpenSSL behaviour exactly. + EXPECT_FALSE(EC_KEY_set_public_key(key.get(), nullptr)); + EXPECT_FALSE(EC_KEY_get0_public_key(key.get())); +} + TEST(ECTest, GroupMismatch) { bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(NID_secp384r1)); ASSERT_TRUE(key); diff --git a/src/crypto/fipsmodule/modes/asm/ghash-armv4.pl b/src/crypto/fipsmodule/modes/asm/ghash-armv4.pl index e93e5b38..778b5436 100644 --- a/src/crypto/fipsmodule/modes/asm/ghash-armv4.pl +++ b/src/crypto/fipsmodule/modes/asm/ghash-armv4.pl @@ -1,5 +1,5 @@ #! /usr/bin/env perl -# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy @@ -150,6 +150,8 @@ $code=<<___; .text #if defined(__thumb2__) || defined(__clang__) .syntax unified +#define ldrplb ldrbpl +#define ldrneb ldrbne #endif #if defined(__thumb2__) .thumb @@ -157,11 +159,6 @@ $code=<<___; .code 32 #endif -#ifdef __clang__ -#define ldrplb ldrbpl -#define ldrneb ldrbne -#endif - .type rem_4bit,%object .align 5 rem_4bit: diff --git a/src/crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl b/src/crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl new file mode 100644 index 00000000..0d9ce156 --- /dev/null +++ b/src/crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl @@ -0,0 +1,288 @@ +#!/usr/bin/env perl +# Copyright (c) 2019, Google Inc. 
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# ghash-ssse3-x86.pl is a constant-time variant of the traditional 4-bit
+# table-based GHASH implementation. It requires SSSE3 instructions.
+#
+# For background, the table-based strategy is a 4-bit windowed multiplication.
+# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
+# over 4-bit windows of the input and looks them up in the table. Visually,
+# it multiplies as in the schoolbook multiplication diagram below, but with
+# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First
+# it incorporates the terms labeled '1' by indexing the most significant term
+# of X into the table. Then it shifts and repeats for '2' and so on.
+#
+#        hhhhhh
+#  *     xxxxxx
+#  ============
+#        666666
+#       555555
+#      444444
+#     333333
+#    222222
+#   111111
+#
+# This implementation changes the order. We treat the table as a 16×16 matrix
+# and transpose it. The first row is then the first byte of each multiple of H,
+# and so on. We then reorder terms as below. Observe that the terms labeled '1'
+# and '2' are all lookups into the first row, etc. This maps well to the SSSE3
+# pshufb instruction, using alternating terms of X in parallel as indices. This
+# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and
+# repeat for each row.
+#
+#        hhhhhh
+#  *     xxxxxx
+#  ============
+#        224466
+#       113355
+#      224466
+#     113355
+#    224466
+#   113355
+#
+# Next we account for GCM's confusing bit order. The "first" bit is the least
+# significant coefficient, but GCM treats the most significant bit within a byte
+# as first. Bytes are little-endian, and bits are big-endian. We reverse the
+# bytes in XMM registers for a consistent bit and byte ordering, but this means
+# the least significant bit is the most significant coefficient and vice versa.
+#
+# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
+# bit ordering within the XMM register, rather than the reversed coefficient
+# ordering. Low bits are less significant bits and more significant
+# coefficients. Right-shifts move from the MSB to the LSB and correspond to
+# increasing the power of each coefficient.
+#
+# Note this bit reversal enters into the table's column indices. H*1 is stored
+# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier
+# table rows contain more significant coefficients, so we iterate forwards.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../../perlasm");
+require "x86asm.pl";
+
+$output = pop;
+open STDOUT, ">$output";
+
+&asm_init($ARGV[0]);
+
+my ($Xi, $Htable, $in, $len) = ("edi", "esi", "edx", "ecx");
+&static_label("reverse_bytes");
+&static_label("low4_mask");
+
+my $call_counter = 0;
+# process_rows emits assembly code to process $rows rows of the table. On
+# input, $Htable stores the pointer to the next row. xmm0 and xmm1 store the
+# low and high halves of the input. The result so far is passed in xmm2. xmm3
+# must be zero. On output, $Htable is advanced to the next row and xmm2 is
+# updated. xmm3 remains zero. It clobbers eax, xmm4, xmm5, and xmm6.
+sub process_rows {
+ my ($rows) = @_;
+ $call_counter++;
+
+ # Shifting whole XMM registers by bits is complex. psrldq shifts by
+ # bytes, and psrlq shifts the two 64-bit halves separately. Each row
+ # produces 8 bits of carry, and the reduction needs an additional 7-bit
+ # shift. This must fit in 64 bits so reduction can use psrlq. This
+ # allows up to 7 rows at a time.
+ die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);
+
+ &mov("eax", $rows);
+&set_label("loop_row_$call_counter");
+ &movdqa("xmm4", &QWP(0, $Htable));
+ &lea($Htable, &DWP(16, $Htable));
+
+ # Right-shift xmm2 and xmm3 by 8 bits (one byte).
+ &movdqa("xmm6", "xmm2");
+ &palignr("xmm6", "xmm3", 1);
+ &movdqa("xmm3", "xmm6");
+ &psrldq("xmm2", 1);
+
+ # Load the next table row and index the low and high bits of the input.
+ # Note the low (respectively, high) half corresponds to more
+ # (respectively, less) significant coefficients.
+ &movdqa("xmm5", "xmm4");
+ &pshufb("xmm4", "xmm0");
+ &pshufb("xmm5", "xmm1");
+
+ # Add the high half (xmm5) without shifting.
+ &pxor("xmm2", "xmm5");
+
+ # Add the low half (xmm4). This must be right-shifted by 4 bits. First,
+ # add into the carry register (xmm3).
+ &movdqa("xmm5", "xmm4");
+ &psllq("xmm5", 60);
+ &movdqa("xmm6", "xmm5");
+ &pslldq("xmm6", 8);
+ &pxor("xmm3", "xmm6");
+
+ # Next, add into xmm2.
+ &psrldq("xmm5", 8);
+ &pxor("xmm2", "xmm5");
+ &psrlq("xmm4", 4);
+ &pxor("xmm2", "xmm4");
+
+ &sub("eax", 1);
+ &jnz(&label("loop_row_$call_counter"));
+
+ # Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
+ # x^7, so we shift and XOR four times.
+ &pxor("xmm2", "xmm3"); # x^0 = 0
+ &psrlq("xmm3", 1);
+ &pxor("xmm2", "xmm3"); # x^1 = x
+ &psrlq("xmm3", 1);
+ &pxor("xmm2", "xmm3"); # x^(1+1) = x^2
+ &psrlq("xmm3", 5);
+ &pxor("xmm2", "xmm3"); # x^(1+1+5) = x^7
+ &pxor("xmm3", "xmm3");
+____
+}
+
+# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|.
+# |Xi| is represented in GHASH's serialized byte representation. |Htable| is
+# formatted as described above.
+# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
+&function_begin("gcm_gmult_ssse3");
+ &mov($Xi, &wparam(0));
+ &mov($Htable, &wparam(1));
+
+ &movdqu("xmm0", &QWP(0, $Xi));
+ &call(&label("pic_point"));
+&set_label("pic_point");
+ &blindpop("eax");
+ &movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "eax"));
+ &movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "eax"));
+
+ # Reverse input bytes to deserialize.
+ &pshufb("xmm0", "xmm7");
+
+ # Split each byte into low (xmm0) and high (xmm1) halves.
+ &movdqa("xmm1", "xmm2");
+ &pandn("xmm1", "xmm0");
+ &psrld("xmm1", 4);
+ &pand("xmm0", "xmm2");
+
+ # Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note
+ # that, due to bit reversal, xmm3 contains bits that fall off when
+ # right-shifting, not left-shifting.
+ &pxor("xmm2", "xmm2");
+ &pxor("xmm3", "xmm3");
+
+ # We must reduce at least once every 7 rows, so divide into three
+ # chunks.
+ &process_rows(5);
+ &process_rows(5);
+ &process_rows(6);
+
+ # Store the result. Reverse bytes to serialize.
+ &pshufb("xmm2", "xmm7");
+ &movdqu(&QWP(0, $Xi), "xmm2");
+
+ # Zero any registers which contain secrets.
+ &pxor("xmm0", "xmm0"); + &pxor("xmm1", "xmm1"); + &pxor("xmm2", "xmm2"); + &pxor("xmm3", "xmm3"); + &pxor("xmm4", "xmm4"); + &pxor("xmm5", "xmm5"); + &pxor("xmm6", "xmm6"); +&function_end("gcm_gmult_ssse3"); + +# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as +# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's +# serialized byte representation. |Htable| is formatted as described above. +# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in, +# size_t len); +&function_begin("gcm_ghash_ssse3"); + &mov($Xi, &wparam(0)); + &mov($Htable, &wparam(1)); + &mov($in, &wparam(2)); + &mov($len, &wparam(3)); + + &movdqu("xmm0", &QWP(0, $Xi)); + &call(&label("pic_point")); +&set_label("pic_point"); + &blindpop("ebx"); + &movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "ebx")); + + # This function only processes whole blocks. + &and($len, -16); + + # Reverse input bytes to deserialize. We maintain the running + # total in xmm0. + &pshufb("xmm0", "xmm7"); + + # Iterate over each block. On entry to each iteration, xmm3 is zero. + &pxor("xmm3", "xmm3"); +&set_label("loop_ghash"); + &movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "ebx")); + + # Incorporate the next block of input. + &movdqu("xmm1", &QWP(0, $in)); + &pshufb("xmm1", "xmm7"); # Reverse bytes. + &pxor("xmm0", "xmm1"); + + # Split each byte into low (xmm0) and high (xmm1) halves. + &movdqa("xmm1", "xmm2"); + &pandn("xmm1", "xmm0"); + &psrld("xmm1", 4); + &pand("xmm0", "xmm2"); + + # Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note + # that, due to bit reversal, xmm3 contains bits that fall off when + # right-shifting, not left-shifting. + &pxor("xmm2", "xmm2"); + # xmm3 is already zero at this point. + + # We must reduce at least once every 7 rows, so divide into three + # chunks. + &process_rows(5); + &process_rows(5); + &process_rows(6); + + &movdqa("xmm0", "xmm2"); + + # Rewind $Htable for the next iteration. + &lea($Htable, &DWP(-256, $Htable)); + + # Advance input and continue. + &lea($in, &DWP(16, $in)); + &sub($len, 16); + &jnz(&label("loop_ghash")); + + # Reverse bytes and store the result. + &pshufb("xmm0", "xmm7"); + &movdqu(&QWP(0, $Xi), "xmm0"); + + # Zero any registers which contain secrets. + &pxor("xmm0", "xmm0"); + &pxor("xmm1", "xmm1"); + &pxor("xmm2", "xmm2"); + &pxor("xmm3", "xmm3"); + &pxor("xmm4", "xmm4"); + &pxor("xmm5", "xmm5"); + &pxor("xmm6", "xmm6"); +&function_end("gcm_ghash_ssse3"); + +# reverse_bytes is a permutation which, if applied with pshufb, reverses the +# bytes in an XMM register. +&set_label("reverse_bytes", 16); +&data_byte(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); +# low4_mask is an XMM mask which selects the low four bits of each byte. +&set_label("low4_mask", 16); +&data_word(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f); + +&asm_finish(); + +close STDOUT; diff --git a/src/crypto/fipsmodule/modes/gcm.c b/src/crypto/fipsmodule/modes/gcm.c index f92f6750..ca077acc 100644 --- a/src/crypto/fipsmodule/modes/gcm.c +++ b/src/crypto/fipsmodule/modes/gcm.c @@ -241,7 +241,7 @@ void gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, // still in L1 cache after encryption pass... #define GHASH_CHUNK (3 * 1024) -#if defined(GHASH_ASM_X86_64) +#if defined(GHASH_ASM_X86_64) || defined(GHASH_ASM_X86) void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]) { // Run the existing 4-bit version. 
gcm_init_4bit(Htable, Xi); @@ -266,7 +266,7 @@ void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]) { } } } -#endif // GHASH_ASM_X86_64 +#endif // GHASH_ASM_X86_64 || GHASH_ASM_X86 #ifdef GCM_FUNCREF_4BIT #undef GCM_MUL @@ -321,6 +321,12 @@ void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash, *out_hash = gcm_ghash_clmul; return; } + if (gcm_ssse3_capable()) { + gcm_init_ssse3(out_table, H.u); + *out_mult = gcm_gmult_ssse3; + *out_hash = gcm_ghash_ssse3; + return; + } #elif defined(GHASH_ASM_ARM) if (gcm_pmull_capable()) { gcm_init_v8(out_table, H.u); diff --git a/src/crypto/fipsmodule/modes/gcm_test.cc b/src/crypto/fipsmodule/modes/gcm_test.cc index 47ecd29d..b2e805c1 100644 --- a/src/crypto/fipsmodule/modes/gcm_test.cc +++ b/src/crypto/fipsmodule/modes/gcm_test.cc @@ -148,7 +148,7 @@ TEST(GCMTest, ABI) { } #endif // GHASH_ASM_X86 -#if defined(GHASH_ASM_X86_64) +#if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64) if (gcm_ssse3_capable()) { CHECK_ABI_SEH(gcm_init_ssse3, Htable, kH); CHECK_ABI_SEH(gcm_gmult_ssse3, X, Htable); @@ -156,9 +156,7 @@ TEST(GCMTest, ABI) { CHECK_ABI_SEH(gcm_ghash_ssse3, X, Htable, buf, 16 * blocks); } } -#endif // GHASH_ASM_X86_64 -#if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64) if (crypto_gcm_clmul_enabled()) { CHECK_ABI_SEH(gcm_init_clmul, Htable, kH); CHECK_ABI_SEH(gcm_gmult_clmul, X, Htable); diff --git a/src/crypto/fipsmodule/modes/internal.h b/src/crypto/fipsmodule/modes/internal.h index 5f9d035b..9a081ebd 100644 --- a/src/crypto/fipsmodule/modes/internal.h +++ b/src/crypto/fipsmodule/modes/internal.h @@ -282,13 +282,6 @@ void gcm_gmult_clmul(uint64_t Xi[2], const u128 Htable[16]); void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, size_t len); -#if defined(OPENSSL_X86_64) -#define GHASH_ASM_X86_64 -void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]); -void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]); -void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in, - size_t len); - OPENSSL_INLINE char gcm_ssse3_capable(void) { return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0; } @@ -300,6 +293,13 @@ void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]); void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in, size_t len); +#if defined(OPENSSL_X86_64) +#define GHASH_ASM_X86_64 +void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]); +void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]); +void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in, + size_t len); + #define AESNI_GCM size_t aesni_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len, const AES_KEY *key, uint8_t ivec[16], uint64_t *Xi); diff --git a/src/crypto/mem.c b/src/crypto/mem.c index a06061b4..14e0bdf3 100644 --- a/src/crypto/mem.c +++ b/src/crypto/mem.c @@ -71,6 +71,14 @@ OPENSSL_MSVC_PRAGMA(warning(pop)) #define OPENSSL_MALLOC_PREFIX 8 +#if defined(OPENSSL_ASAN) +void __asan_poison_memory_region(const volatile void *addr, size_t size); +void __asan_unpoison_memory_region(const volatile void *addr, size_t size); +#else +static void __asan_poison_memory_region(const void *addr, size_t size) {} +static void __asan_unpoison_memory_region(const void *addr, size_t size) {} +#endif + #if defined(__GNUC__) || defined(__clang__) // sdallocx is a sized |free| function. By passing the size (which we happen to // always know in BoringSSL), the malloc implementation can save work. 
We cannot @@ -99,6 +107,7 @@ void *OPENSSL_malloc(size_t size) { *(size_t *)ptr = size; + __asan_poison_memory_region(ptr, OPENSSL_MALLOC_PREFIX); return ((uint8_t *)ptr) + OPENSSL_MALLOC_PREFIX; } @@ -108,6 +117,7 @@ void OPENSSL_free(void *orig_ptr) { } void *ptr = ((uint8_t *)orig_ptr) - OPENSSL_MALLOC_PREFIX; + __asan_unpoison_memory_region(ptr, OPENSSL_MALLOC_PREFIX); size_t size = *(size_t *)ptr; OPENSSL_cleanse(ptr, size + OPENSSL_MALLOC_PREFIX); @@ -120,7 +130,9 @@ void *OPENSSL_realloc(void *orig_ptr, size_t new_size) { } void *ptr = ((uint8_t *)orig_ptr) - OPENSSL_MALLOC_PREFIX; + __asan_unpoison_memory_region(ptr, OPENSSL_MALLOC_PREFIX); size_t old_size = *(size_t *)ptr; + __asan_poison_memory_region(ptr, OPENSSL_MALLOC_PREFIX); void *ret = OPENSSL_malloc(new_size); if (ret == NULL) { @@ -153,6 +165,10 @@ void OPENSSL_cleanse(void *ptr, size_t len) { #endif // !OPENSSL_NO_ASM } +void OPENSSL_clear_free(void *ptr, size_t unused) { + OPENSSL_free(ptr); +} + int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len) { const uint8_t *a = in_a; const uint8_t *b = in_b; |
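
For reference, the counter-mode contract behind the new vpaes_ctr32_encrypt_blocks entry point (a ctr128_f, as returned by aes_ctr_set_key when VPAES_CTR32 is defined) can be sketched in portable C. This is an illustrative sketch only, not the vpaes assembly: the key is an opaque pointer rather than an AES_KEY, the block cipher is passed explicitly for clarity, and the name toy_ctr32_encrypt_blocks is hypothetical.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Simplified stand-in for BoringSSL's block128_f: encrypt one 16-byte block. */
typedef void (*block128_f)(const uint8_t in[16], uint8_t out[16],
                           const void *key);

/* Reference version of the ctr32 contract: encrypt |blocks| whole blocks,
 * using |ivec| as the initial counter and incrementing only its low 32 bits
 * (big-endian) between blocks. The rest of the IV is left untouched. */
static void toy_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
                                     size_t blocks, const void *key,
                                     const uint8_t ivec[16], block128_f block) {
  uint8_t counter[16];
  memcpy(counter, ivec, sizeof(counter));
  uint32_t ctr = ((uint32_t)counter[12] << 24) | ((uint32_t)counter[13] << 16) |
                 ((uint32_t)counter[14] << 8) | (uint32_t)counter[15];
  for (size_t i = 0; i < blocks; i++) {
    uint8_t keystream[16];
    block(counter, keystream, key);
    for (size_t j = 0; j < 16; j++) {
      out[16 * i + j] = (uint8_t)(in[16 * i + j] ^ keystream[j]);
    }
    ctr++;  /* Only the low 32 bits of the counter are incremented. */
    counter[12] = (uint8_t)(ctr >> 24);
    counter[13] = (uint8_t)(ctr >> 16);
    counter[14] = (uint8_t)(ctr >> 8);
    counter[15] = (uint8_t)ctr;
  }
}

The assembly implementations expose the same interface without the explicit block-function parameter, which is why aes_ctr_set_key can hand back vpaes_ctr32_encrypt_blocks directly as the ctr128_f.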
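
The ec_key.c change, exercised by the new SetNULLKey test, makes EC_KEY_set_public_key tolerate a NULL point: the stored public key is cleared and the call returns zero, matching OpenSSL. A minimal caller-side sketch of that behaviour (the wrapper name demo_clear_public_key is purely illustrative):

#include <openssl/ec.h>
#include <openssl/ec_key.h>
#include <openssl/nid.h>

/* Returns 1 if the behaviour described above is observed, 0 otherwise. */
static int demo_clear_public_key(void) {
  EC_KEY *key = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);
  if (key == NULL) {
    return 0;
  }
  /* Install a valid public key (the group generator, as in the test). */
  int set_ok = EC_KEY_set_public_key(
      key, EC_GROUP_get0_generator(EC_KEY_get0_group(key)));
  /* Passing NULL clears the stored key and reports failure (returns 0). */
  int cleared = !EC_KEY_set_public_key(key, NULL) &&
                EC_KEY_get0_public_key(key) == NULL;
  EC_KEY_free(key);
  return set_ok && cleared;
}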
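
The comment block in ghash-ssse3-x86.pl describes viewing the usual 16-entry table of 4-bit multiples of H as a 16x16 byte matrix and transposing it, so that a single pshufb row lookup fetches byte r of every multiple at once. A rough C sketch of just that transposition is below; it is illustrative only, the names are hypothetical, and the real gcm_init_ssse3 (whose body is mostly elided in the gcm.c hunk) also has to respect GHASH's reversed bit and byte ordering when laying out columns.

#include <stdint.h>

/* multiples[i] holds the 16-byte value H*i for i = 0..15, as produced by the
 * existing 4-bit table setup. rows[r][i] then holds byte r of H*i, so each
 * 16-byte row serves as a pshufb lookup table for byte position r. */
static void transpose_ghash_table(const uint8_t multiples[16][16],
                                  uint8_t rows[16][16]) {
  for (int r = 0; r < 16; r++) {    /* byte position within each multiple */
    for (int i = 0; i < 16; i++) {  /* which multiple of H */
      rows[r][i] = multiples[i][r];
    }
  }
}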
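
Finally, the mem.c hunk poisons the hidden length prefix that OPENSSL_malloc keeps in front of every allocation, so ASan-instrumented callers that underflow the returned pointer fault instead of silently reading the bookkeeping bytes. A condensed, self-contained sketch of the same pattern follows; plain malloc/free and memset stand in for the real allocator and OPENSSL_cleanse, and the double-underscore fallbacks mirror the ones added in the diff.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define MALLOC_PREFIX 8  /* mirrors OPENSSL_MALLOC_PREFIX */

#if defined(__has_feature)
#if __has_feature(address_sanitizer)
#define HAVE_ASAN
void __asan_poison_memory_region(const volatile void *addr, size_t size);
void __asan_unpoison_memory_region(const volatile void *addr, size_t size);
#endif
#endif
#if !defined(HAVE_ASAN)
static void __asan_poison_memory_region(const void *addr, size_t size) {}
static void __asan_unpoison_memory_region(const void *addr, size_t size) {}
#endif

/* Allocate |size| bytes and remember |size| in a hidden prefix. The prefix is
 * poisoned so instrumented code cannot reach it through the returned pointer. */
static void *prefixed_malloc(size_t size) {
  uint8_t *ptr = malloc(size + MALLOC_PREFIX);
  if (ptr == NULL) {
    return NULL;
  }
  memcpy(ptr, &size, sizeof(size));
  __asan_poison_memory_region(ptr, MALLOC_PREFIX);
  return ptr + MALLOC_PREFIX;
}

static void prefixed_free(void *orig_ptr) {
  if (orig_ptr == NULL) {
    return;
  }
  uint8_t *ptr = (uint8_t *)orig_ptr - MALLOC_PREFIX;
  /* Unpoison before touching the prefix, as OPENSSL_free does above. */
  __asan_unpoison_memory_region(ptr, MALLOC_PREFIX);
  size_t size;
  memcpy(&size, ptr, sizeof(size));
  /* The real code uses OPENSSL_cleanse, which cannot be optimized away. */
  memset(ptr, 0, size + MALLOC_PREFIX);
  free(ptr);
}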