author | Robert Sloan <varomodt@google.com> | 2019-03-19 02:02:05 -0700
committer | android-build-merger <android-build-merger@google.com> | 2019-03-19 02:02:05 -0700
commit | 767904931a5f7012915cf015d54ca571dfb86e03 (patch)
tree | d5956e0da0ddbeb7e907378720fcbc8c6926beee
parent | 8c9200ba9943ec79d6e957b2893f9a1455208778 (diff)
parent | bdfba2a0b5cfa78c35c71b35bd385a9acfc3ec14 (diff)
download | boringssl-767904931a5f7012915cf015d54ca571dfb86e03.tar.gz
external/boringssl: Sync to fdb48f98612e934eab339b4871484b1c987553e2. am: 9d5d1a76eb am: d54d28eca9
am: bdfba2a0b5
Change-Id: I13d1010c30643e3ad126ddadcac5f1eea83087f4
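
Aside from the regenerated assembly, the most self-contained change in this sync is the rewrite of sk_find in src/crypto/stack/stack.c: instead of casting the element comparator and handing it to bsearch (undefined behavior), the stack now runs its own binary search that returns the earliest matching index. Below is a minimal standalone sketch of that earliest-match search, not the BoringSSL code itself: find_first and cmp_str are illustrative names only, and the comparator here takes element pointers directly rather than the pointer-to-pointer convention used by sk_find's call_cmp_func.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Earliest-match binary search over a sorted array of pointers.
 * Returns 1 and sets |*out_index| to the first element equal to |target|,
 * or returns 0 if nothing matches. */
static int find_first(const void *const *data, size_t num, const void *target,
                      int (*cmp)(const void *, const void *),
                      size_t *out_index) {
  size_t lo = 0, hi = num;  /* Candidates are the half-open range [lo, hi). */
  while (lo < hi) {
    /* Bias |mid| towards |lo| so that shrinking |hi| to |mid| + 1 on a
     * match always makes progress. */
    size_t mid = lo + (hi - lo - 1) / 2;
    int r = cmp(target, data[mid]);
    if (r > 0) {
      lo = mid + 1;              /* |mid| is too low. */
    } else if (r < 0) {
      hi = mid;                  /* |mid| is too high. */
    } else if (hi - lo == 1) {
      if (out_index != NULL) {
        *out_index = mid;        /* Only one candidate left: first match. */
      }
      return 1;
    } else {
      hi = mid + 1;              /* Matched, but look for an earlier match. */
    }
  }
  return 0;                      /* Not found. */
}

static int cmp_str(const void *a, const void *b) {
  return strcmp((const char *)a, (const char *)b);
}

int main(void) {
  const char *words[] = {"apple", "pear", "pear", "plum"};
  size_t idx;
  if (find_first((const void *const *)words, 4, "pear", cmp_str, &idx)) {
    printf("first match at index %zu\n", idx);  /* prints 1 */
  }
  return 0;
}
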
-rw-r--r-- | BORINGSSL_REVISION | 2
-rw-r--r-- | eureka.mk | 1
-rw-r--r-- | ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S | 337
-rw-r--r-- | ios-arm/crypto/fipsmodule/bsaes-armv7.S | 95
-rw-r--r-- | linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S | 339
-rw-r--r-- | linux-arm/crypto/fipsmodule/bsaes-armv7.S | 95
-rw-r--r-- | linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S | 53
-rw-r--r-- | mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S | 50
-rw-r--r-- | sources.bp | 1
-rw-r--r-- | sources.mk | 1
-rw-r--r-- | src/crypto/fipsmodule/CMakeLists.txt | 2
-rw-r--r-- | src/crypto/fipsmodule/aes/asm/bsaes-armv7.pl | 95
-rw-r--r-- | src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl | 59
-rw-r--r-- | src/crypto/fipsmodule/aes/internal.h | 2
-rw-r--r-- | src/crypto/fipsmodule/cipher/e_aes.c | 6
-rw-r--r-- | src/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl | 287
-rw-r--r-- | src/crypto/fipsmodule/modes/internal.h | 16
-rw-r--r-- | src/crypto/stack/stack.c | 63
-rw-r--r-- | win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm | 51
19 files changed, 1118 insertions, 437 deletions
diff --git a/BORINGSSL_REVISION b/BORINGSSL_REVISION index c10534e3..42ad2f07 100644 --- a/BORINGSSL_REVISION +++ b/BORINGSSL_REVISION @@ -1 +1 @@ -35941f2923155664bd9fa5d897cb336a0ab729a1 +fdb48f98612e934eab339b4871484b1c987553e2 @@ -298,6 +298,7 @@ linux_aarch64_sources := \ linux-aarch64/crypto/chacha/chacha-armv8.S\ linux-aarch64/crypto/fipsmodule/aesv8-armx64.S\ linux-aarch64/crypto/fipsmodule/armv8-mont.S\ + linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S\ linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S\ linux-aarch64/crypto/fipsmodule/sha1-armv8.S\ linux-aarch64/crypto/fipsmodule/sha256-armv8.S\ diff --git a/ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S b/ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S new file mode 100644 index 00000000..62bdc9a8 --- /dev/null +++ b/ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S @@ -0,0 +1,337 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include <boringssl_prefix_symbols_asm.h> +#endif +.text + +.globl _gcm_init_neon +.private_extern _gcm_init_neon + +.align 4 +_gcm_init_neon: + // This function is adapted from gcm_init_v8. xC2 is t3. + ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret + + +.globl _gcm_gmult_neon +.private_extern _gcm_gmult_neon + +.align 4 +_gcm_gmult_neon: + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks@PAGE // load constants + add x9, x9, Lmasks@PAGEOFF + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b Lgmult_neon + + +.globl _gcm_ghash_neon +.private_extern _gcm_ghash_neon + +.align 4 +_gcm_ghash_neon: + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks@PAGE // load constants + add x9, x9, Lmasks@PAGEOFF + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) 
+ ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. 
It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. + // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret + + +.section __TEXT,__const +.align 4 +Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM diff --git a/ios-arm/crypto/fipsmodule/bsaes-armv7.S b/ios-arm/crypto/fipsmodule/bsaes-armv7.S index 4d4b7cc7..dffc0c24 100644 --- a/ios-arm/crypto/fipsmodule/bsaes-armv7.S +++ b/ios-arm/crypto/fipsmodule/bsaes-armv7.S @@ -1086,12 +1086,6 @@ Lkey_loop: @ don't save last round key bx lr -@ TODO(davidben): This should be aes_nohw_cbc_encrypt, but that function does -@ not exist. Rather than add it, patch this fallback out. See -@ https://crbug.com/boringssl/256. - - - .globl _bsaes_cbc_encrypt .private_extern _bsaes_cbc_encrypt #ifdef __thumb2__ @@ -1099,16 +1093,8 @@ Lkey_loop: #endif .align 5 _bsaes_cbc_encrypt: -#ifndef __KERNEL__ - cmp r2, #128 -#ifndef __thumb__ - blo _AES_cbc_encrypt -#else - bhs 1f - b _AES_cbc_encrypt -1: -#endif -#endif + @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for + @ short inputs. We patch this out, using bsaes for all input sizes. @ it is up to the caller to make sure we are called with enc == 0 @@ -1206,10 +1192,7 @@ Lcbc_dec_loop_finish: adds r2, r2, #8 beq Lcbc_dec_done - vld1.8 {q0}, [r0]! @ load input - cmp r2, #2 - blo Lcbc_dec_one - vld1.8 {q1}, [r0]! + @ Set up most parameters for the _bsaes_decrypt8 call. 
#ifndef BSAES_ASM_EXTENDED_KEY mov r4, sp @ pass the key #else @@ -1217,6 +1200,11 @@ Lcbc_dec_loop_finish: #endif mov r5, r10 vstmia r9, {q15} @ put aside IV + + vld1.8 {q0}, [r0]! @ load input + cmp r2, #2 + blo Lcbc_dec_one + vld1.8 {q1}, [r0]! beq Lcbc_dec_two vld1.8 {q2}, [r0]! cmp r2, #4 @@ -1334,16 +1322,11 @@ Lcbc_dec_two: .align 4 Lcbc_dec_one: sub r0, r0, #0x10 - mov r10, r1 @ save original out pointer - mov r1, r9 @ use the iv scratch space as out buffer - mov r2, r3 - vmov q4,q15 @ just in case ensure that IV - vmov q5,q0 @ and input are preserved - bl _aes_nohw_decrypt - vld1.8 {q0}, [r9] @ load result - veor q0, q0, q4 @ ^= IV - vmov q15, q5 @ q5 holds input - vst1.8 {q0}, [r10] @ write output + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q15}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vst1.8 {q0}, [r1]! @ write output Lcbc_dec_done: #ifndef BSAES_ASM_EXTENDED_KEY @@ -1361,7 +1344,6 @@ Lcbc_dec_bzero:@ wipe key schedule [if any] VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} - .globl _bsaes_ctr32_encrypt_blocks .private_extern _bsaes_ctr32_encrypt_blocks #ifdef __thumb2__ @@ -1369,9 +1351,8 @@ Lcbc_dec_bzero:@ wipe key schedule [if any] #endif .align 5 _bsaes_ctr32_encrypt_blocks: - cmp r2, #8 @ use plain AES for - blo Lctr_enc_short @ small sizes - + @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + @ out to retain a constant-time implementation. mov ip, sp stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} VFP_ABI_PUSH @@ -1547,50 +1528,8 @@ Lctr_enc_bzero:@ wipe key schedule [if any] VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return -.align 4 -Lctr_enc_short: - ldr ip, [sp] @ ctr pointer is passed on stack - stmdb sp!, {r4,r5,r6,r7,r8, lr} - - mov r4, r0 @ copy arguments - mov r5, r1 - mov r6, r2 - mov r7, r3 - ldr r8, [ip, #12] @ load counter LSW - vld1.8 {q1}, [ip] @ load whole counter value -#ifdef __ARMEL__ - rev r8, r8 -#endif - sub sp, sp, #0x10 - vst1.8 {q1}, [sp] @ copy counter value - sub sp, sp, #0x10 - -Lctr_enc_short_loop: - add r0, sp, #0x10 @ input counter value - mov r1, sp @ output on the stack - mov r2, r7 @ key - - bl _aes_nohw_encrypt - - vld1.8 {q0}, [r4]! @ load input - vld1.8 {q1}, [sp] @ load encrypted counter - add r8, r8, #1 -#ifdef __ARMEL__ - rev r0, r8 - str r0, [sp, #0x1c] @ next counter value -#else - str r8, [sp, #0x1c] @ next counter value -#endif - veor q0,q0,q1 - vst1.8 {q0}, [r5]! @ store output - subs r6, r6, #1 - bne Lctr_enc_short_loop - - vmov.i32 q0, #0 - vmov.i32 q1, #0 - vstmia sp!, {q0,q1} - - ldmia sp!, {r4,r5,r6,r7,r8, pc} + @ OpenSSL contains aes_nohw_* fallback code here. We patch this + @ out to retain a constant-time implementation. #endif #endif // !OPENSSL_NO_ASM diff --git a/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S b/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S new file mode 100644 index 00000000..1cfbec29 --- /dev/null +++ b/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S @@ -0,0 +1,339 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include <boringssl_prefix_symbols_asm.h> +#endif +.text + +.globl gcm_init_neon +.hidden gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + // This function is adapted from gcm_init_v8. xC2 is t3. 
+ ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret +.size gcm_init_neon,.-gcm_init_neon + +.globl gcm_gmult_neon +.hidden gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, .Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.globl gcm_ghash_neon +.hidden gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, .Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +.Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +.Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) + ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. 
+ // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne .Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret +.size gcm_ghash_neon,.-gcm_ghash_neon + +.section .rodata +.align 4 +.Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif +#endif // !OPENSSL_NO_ASM diff --git a/linux-arm/crypto/fipsmodule/bsaes-armv7.S b/linux-arm/crypto/fipsmodule/bsaes-armv7.S index 20b9bb05..0ad56bc7 100644 --- a/linux-arm/crypto/fipsmodule/bsaes-armv7.S +++ b/linux-arm/crypto/fipsmodule/bsaes-armv7.S @@ -1081,27 +1081,13 @@ _bsaes_key_convert: @ don't save last round key bx lr .size _bsaes_key_convert,.-_bsaes_key_convert -@ TODO(davidben): This should be aes_nohw_cbc_encrypt, but that function does -@ not exist. Rather than add it, patch this fallback out. See -@ https://crbug.com/boringssl/256. - - - .globl bsaes_cbc_encrypt .hidden bsaes_cbc_encrypt .type bsaes_cbc_encrypt,%function .align 5 bsaes_cbc_encrypt: -#ifndef __KERNEL__ - cmp r2, #128 -#ifndef __thumb__ - blo AES_cbc_encrypt -#else - bhs 1f - b AES_cbc_encrypt -1: -#endif -#endif + @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for + @ short inputs. We patch this out, using bsaes for all input sizes. @ it is up to the caller to make sure we are called with enc == 0 @@ -1199,10 +1185,7 @@ bsaes_cbc_encrypt: adds r2, r2, #8 beq .Lcbc_dec_done - vld1.8 {q0}, [r0]! @ load input - cmp r2, #2 - blo .Lcbc_dec_one - vld1.8 {q1}, [r0]! + @ Set up most parameters for the _bsaes_decrypt8 call. #ifndef BSAES_ASM_EXTENDED_KEY mov r4, sp @ pass the key #else @@ -1210,6 +1193,11 @@ bsaes_cbc_encrypt: #endif mov r5, r10 vstmia r9, {q15} @ put aside IV + + vld1.8 {q0}, [r0]! @ load input + cmp r2, #2 + blo .Lcbc_dec_one + vld1.8 {q1}, [r0]! beq .Lcbc_dec_two vld1.8 {q2}, [r0]! cmp r2, #4 @@ -1327,16 +1315,11 @@ bsaes_cbc_encrypt: .align 4 .Lcbc_dec_one: sub r0, r0, #0x10 - mov r10, r1 @ save original out pointer - mov r1, r9 @ use the iv scratch space as out buffer - mov r2, r3 - vmov q4,q15 @ just in case ensure that IV - vmov q5,q0 @ and input are preserved - bl aes_nohw_decrypt - vld1.8 {q0}, [r9] @ load result - veor q0, q0, q4 @ ^= IV - vmov q15, q5 @ q5 holds input - vst1.8 {q0}, [r10] @ write output + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q15}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vst1.8 {q0}, [r1]! 
@ write output .Lcbc_dec_done: #ifndef BSAES_ASM_EXTENDED_KEY @@ -1354,15 +1337,13 @@ bsaes_cbc_encrypt: VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt - .globl bsaes_ctr32_encrypt_blocks .hidden bsaes_ctr32_encrypt_blocks .type bsaes_ctr32_encrypt_blocks,%function .align 5 bsaes_ctr32_encrypt_blocks: - cmp r2, #8 @ use plain AES for - blo .Lctr_enc_short @ small sizes - + @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + @ out to retain a constant-time implementation. mov ip, sp stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} VFP_ABI_PUSH @@ -1538,50 +1519,8 @@ bsaes_ctr32_encrypt_blocks: VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return -.align 4 -.Lctr_enc_short: - ldr ip, [sp] @ ctr pointer is passed on stack - stmdb sp!, {r4,r5,r6,r7,r8, lr} - - mov r4, r0 @ copy arguments - mov r5, r1 - mov r6, r2 - mov r7, r3 - ldr r8, [ip, #12] @ load counter .LSW - vld1.8 {q1}, [ip] @ load whole counter value -#ifdef __ARMEL__ - rev r8, r8 -#endif - sub sp, sp, #0x10 - vst1.8 {q1}, [sp] @ copy counter value - sub sp, sp, #0x10 - -.Lctr_enc_short_loop: - add r0, sp, #0x10 @ input counter value - mov r1, sp @ output on the stack - mov r2, r7 @ key - - bl aes_nohw_encrypt - - vld1.8 {q0}, [r4]! @ load input - vld1.8 {q1}, [sp] @ load encrypted counter - add r8, r8, #1 -#ifdef __ARMEL__ - rev r0, r8 - str r0, [sp, #0x1c] @ next counter value -#else - str r8, [sp, #0x1c] @ next counter value -#endif - veor q0,q0,q1 - vst1.8 {q0}, [r5]! @ store output - subs r6, r6, #1 - bne .Lctr_enc_short_loop - - vmov.i32 q0, #0 - vmov.i32 q1, #0 - vstmia sp!, {q0,q1} - - ldmia sp!, {r4,r5,r6,r7,r8, pc} + @ OpenSSL contains aes_nohw_* fallback code here. We patch this + @ out to retain a constant-time implementation. 
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks #endif #endif diff --git a/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S index 5236aa66..5437762f 100644 --- a/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S +++ b/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S @@ -13,11 +13,6 @@ #endif .text -.extern aes_nohw_encrypt -.hidden aes_nohw_encrypt -.extern aes_nohw_decrypt -.hidden aes_nohw_decrypt - .type _bsaes_encrypt8,@function .align 64 _bsaes_encrypt8: @@ -1083,18 +1078,14 @@ _bsaes_key_convert: .byte 0xf3,0xc3 .cfi_endproc .size _bsaes_key_convert,.-_bsaes_key_convert -.extern aes_nohw_cbc_encrypt -.hidden aes_nohw_cbc_encrypt .globl bsaes_cbc_encrypt .hidden bsaes_cbc_encrypt .type bsaes_cbc_encrypt,@function .align 16 bsaes_cbc_encrypt: .cfi_startproc - cmpl $0,%r9d - jne aes_nohw_cbc_encrypt - cmpq $128,%rdx - jb aes_nohw_cbc_encrypt + + movq %rsp,%rax .Lcbc_dec_prologue: @@ -1143,6 +1134,8 @@ bsaes_cbc_encrypt: movdqu (%rbx),%xmm14 subq $8,%r14 + jc .Lcbc_dec_loop_done + .Lcbc_dec_loop: movdqu 0(%r12),%xmm15 movdqu 16(%r12),%xmm0 @@ -1187,6 +1180,7 @@ bsaes_cbc_encrypt: subq $8,%r14 jnc .Lcbc_dec_loop +.Lcbc_dec_loop_done: addq $8,%r14 jz .Lcbc_dec_done @@ -1319,13 +1313,12 @@ bsaes_cbc_encrypt: jmp .Lcbc_dec_done .align 16 .Lcbc_dec_one: - leaq (%r12),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call aes_nohw_decrypt - pxor 32(%rbp),%xmm14 - movdqu %xmm14,(%r13) - movdqa %xmm15,%xmm14 + movdqa %xmm14,32(%rbp) + call _bsaes_decrypt8 + pxor 32(%rbp),%xmm15 + movdqu 0(%r12),%xmm14 + movdqu %xmm15,0(%r13) + jmp .Lcbc_dec_done .Lcbc_dec_done: movdqu %xmm14,(%rbx) @@ -1403,8 +1396,8 @@ bsaes_ctr32_encrypt_blocks: movq %rdx,%r14 movq %rcx,%r15 movdqa %xmm0,32(%rbp) - cmpq $8,%rdx - jb .Lctr_enc_short + + movl %eax,%ebx shlq $7,%rax @@ -1538,26 +1531,8 @@ bsaes_ctr32_encrypt_blocks: movdqu 96(%r12),%xmm13 pxor %xmm13,%xmm1 movdqu %xmm1,96(%r13) - jmp .Lctr_enc_done -.align 16 -.Lctr_enc_short: - leaq 32(%rbp),%rdi - leaq 48(%rbp),%rsi - leaq (%r15),%rdx - call aes_nohw_encrypt - movdqu (%r12),%xmm0 - leaq 16(%r12),%r12 - movl 44(%rbp),%eax - bswapl %eax - pxor 48(%rbp),%xmm0 - incl %eax - movdqu %xmm0,(%r13) - bswapl %eax - leaq 16(%r13),%r13 - movl %eax,44(%rsp) - decq %r14 - jnz .Lctr_enc_short + .Lctr_enc_done: leaq (%rsp),%rax diff --git a/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S index 5a65960d..c2807e38 100644 --- a/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S +++ b/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S @@ -14,9 +14,6 @@ .text - - - .p2align 6 _bsaes_encrypt8: @@ -1081,17 +1078,14 @@ L$key_loop: .byte 0xf3,0xc3 - .globl _bsaes_cbc_encrypt .private_extern _bsaes_cbc_encrypt .p2align 4 _bsaes_cbc_encrypt: - cmpl $0,%r9d - jne _aes_nohw_cbc_encrypt - cmpq $128,%rdx - jb _aes_nohw_cbc_encrypt + + movq %rsp,%rax L$cbc_dec_prologue: @@ -1134,6 +1128,8 @@ L$cbc_dec_prologue: movdqu (%rbx),%xmm14 subq $8,%r14 + jc L$cbc_dec_loop_done + L$cbc_dec_loop: movdqu 0(%r12),%xmm15 movdqu 16(%r12),%xmm0 @@ -1178,6 +1174,7 @@ L$cbc_dec_loop: subq $8,%r14 jnc L$cbc_dec_loop +L$cbc_dec_loop_done: addq $8,%r14 jz L$cbc_dec_done @@ -1310,13 +1307,12 @@ L$cbc_dec_two: jmp L$cbc_dec_done .p2align 4 L$cbc_dec_one: - leaq (%r12),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call _aes_nohw_decrypt - pxor 32(%rbp),%xmm14 - movdqu %xmm14,(%r13) - movdqa %xmm15,%xmm14 + movdqa %xmm14,32(%rbp) + call _bsaes_decrypt8 + pxor 32(%rbp),%xmm15 + movdqu 0(%r12),%xmm14 + movdqu %xmm15,0(%r13) + jmp 
L$cbc_dec_done L$cbc_dec_done: movdqu %xmm14,(%rbx) @@ -1387,8 +1383,8 @@ L$ctr_enc_prologue: movq %rdx,%r14 movq %rcx,%r15 movdqa %xmm0,32(%rbp) - cmpq $8,%rdx - jb L$ctr_enc_short + + movl %eax,%ebx shlq $7,%rax @@ -1522,26 +1518,8 @@ L$ctr_enc_loop_done: movdqu 96(%r12),%xmm13 pxor %xmm13,%xmm1 movdqu %xmm1,96(%r13) - jmp L$ctr_enc_done -.p2align 4 -L$ctr_enc_short: - leaq 32(%rbp),%rdi - leaq 48(%rbp),%rsi - leaq (%r15),%rdx - call _aes_nohw_encrypt - movdqu (%r12),%xmm0 - leaq 16(%r12),%r12 - movl 44(%rbp),%eax - bswapl %eax - pxor 48(%rbp),%xmm0 - incl %eax - movdqu %xmm0,(%r13) - bswapl %eax - leaq 16(%r13),%r13 - movl %eax,44(%rsp) - decq %r14 - jnz L$ctr_enc_short + L$ctr_enc_done: leaq (%rsp),%rax @@ -247,6 +247,7 @@ cc_defaults { "linux-aarch64/crypto/chacha/chacha-armv8.S", "linux-aarch64/crypto/fipsmodule/aesv8-armx64.S", "linux-aarch64/crypto/fipsmodule/armv8-mont.S", + "linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S", "linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S", "linux-aarch64/crypto/fipsmodule/sha1-armv8.S", "linux-aarch64/crypto/fipsmodule/sha256-armv8.S", @@ -243,6 +243,7 @@ linux_aarch64_sources := \ linux-aarch64/crypto/chacha/chacha-armv8.S\ linux-aarch64/crypto/fipsmodule/aesv8-armx64.S\ linux-aarch64/crypto/fipsmodule/armv8-mont.S\ + linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S\ linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S\ linux-aarch64/crypto/fipsmodule/sha1-armv8.S\ linux-aarch64/crypto/fipsmodule/sha256-armv8.S\ diff --git a/src/crypto/fipsmodule/CMakeLists.txt b/src/crypto/fipsmodule/CMakeLists.txt index 09d210bf..fbf25ac8 100644 --- a/src/crypto/fipsmodule/CMakeLists.txt +++ b/src/crypto/fipsmodule/CMakeLists.txt @@ -65,6 +65,7 @@ if(${ARCH} STREQUAL "aarch64") aesv8-armx.${ASM_EXT} armv8-mont.${ASM_EXT} + ghash-neon-armv8.${ASM_EXT} ghashv8-armx.${ASM_EXT} sha1-armv8.${ASM_EXT} sha256-armv8.${ASM_EXT} @@ -99,6 +100,7 @@ perlasm(co-586.${ASM_EXT} bn/asm/co-586.pl) perlasm(ghash-armv4.${ASM_EXT} modes/asm/ghash-armv4.pl) perlasm(ghashp8-ppc.${ASM_EXT} modes/asm/ghashp8-ppc.pl) perlasm(ghashv8-armx.${ASM_EXT} modes/asm/ghashv8-armx.pl) +perlasm(ghash-neon-armv8.${ASM_EXT} modes/asm/ghash-neon-armv8.pl) perlasm(ghash-ssse3-x86_64.${ASM_EXT} modes/asm/ghash-ssse3-x86_64.pl) perlasm(ghash-ssse3-x86.${ASM_EXT} modes/asm/ghash-ssse3-x86.pl) perlasm(ghash-x86_64.${ASM_EXT} modes/asm/ghash-x86_64.pl) diff --git a/src/crypto/fipsmodule/aes/asm/bsaes-armv7.pl b/src/crypto/fipsmodule/aes/asm/bsaes-armv7.pl index 11607d11..d4db3b4d 100644 --- a/src/crypto/fipsmodule/aes/asm/bsaes-armv7.pl +++ b/src/crypto/fipsmodule/aes/asm/bsaes-armv7.pl @@ -1113,26 +1113,12 @@ my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10)); my ($keysched)=("sp"); $code.=<<___; -@ TODO(davidben): This should be aes_nohw_cbc_encrypt, but that function does -@ not exist. Rather than add it, patch this fallback out. See -@ https://crbug.com/boringssl/256. -.extern AES_cbc_encrypt -.extern aes_nohw_decrypt - .global bsaes_cbc_encrypt .type bsaes_cbc_encrypt,%function .align 5 bsaes_cbc_encrypt: -#ifndef __KERNEL__ - cmp $len, #128 -#ifndef __thumb__ - blo AES_cbc_encrypt -#else - bhs 1f - b AES_cbc_encrypt -1: -#endif -#endif + @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for + @ short inputs. We patch this out, using bsaes for all input sizes. @ it is up to the caller to make sure we are called with enc == 0 @@ -1230,10 +1216,7 @@ bsaes_cbc_encrypt: adds $len, $len, #8 beq .Lcbc_dec_done - vld1.8 {@XMM[0]}, [$inp]! 
@ load input - cmp $len, #2 - blo .Lcbc_dec_one - vld1.8 {@XMM[1]}, [$inp]! + @ Set up most parameters for the _bsaes_decrypt8 call. #ifndef BSAES_ASM_EXTENDED_KEY mov r4, $keysched @ pass the key #else @@ -1241,6 +1224,11 @@ bsaes_cbc_encrypt: #endif mov r5, $rounds vstmia $fp, {@XMM[15]} @ put aside IV + + vld1.8 {@XMM[0]}, [$inp]! @ load input + cmp $len, #2 + blo .Lcbc_dec_one + vld1.8 {@XMM[1]}, [$inp]! beq .Lcbc_dec_two vld1.8 {@XMM[2]}, [$inp]! cmp $len, #4 @@ -1358,16 +1346,11 @@ bsaes_cbc_encrypt: .align 4 .Lcbc_dec_one: sub $inp, $inp, #0x10 - mov $rounds, $out @ save original out pointer - mov $out, $fp @ use the iv scratch space as out buffer - mov r2, $key - vmov @XMM[4],@XMM[15] @ just in case ensure that IV - vmov @XMM[5],@XMM[0] @ and input are preserved - bl aes_nohw_decrypt - vld1.8 {@XMM[0]}, [$fp] @ load result - veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV - vmov @XMM[15], @XMM[5] @ @XMM[5] holds input - vst1.8 {@XMM[0]}, [$rounds] @ write output + bl _bsaes_decrypt8 + vldmia $fp, {@XMM[14]} @ reload IV + vld1.8 {@XMM[15]}, [$inp]! @ reload input + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV + vst1.8 {@XMM[0]}, [$out]! @ write output .Lcbc_dec_done: #ifndef BSAES_ASM_EXTENDED_KEY @@ -1393,14 +1376,12 @@ my $const = "r6"; # shared with _bsaes_encrypt8_alt my $keysched = "sp"; $code.=<<___; -.extern aes_nohw_encrypt .global bsaes_ctr32_encrypt_blocks .type bsaes_ctr32_encrypt_blocks,%function .align 5 bsaes_ctr32_encrypt_blocks: - cmp $len, #8 @ use plain AES for - blo .Lctr_enc_short @ small sizes - + @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + @ out to retain a constant-time implementation. mov ip, sp stmdb sp!, {r4-r10, lr} VFP_ABI_PUSH @@ -1576,50 +1557,8 @@ bsaes_ctr32_encrypt_blocks: VFP_ABI_POP ldmia sp!, {r4-r10, pc} @ return -.align 4 -.Lctr_enc_short: - ldr ip, [sp] @ ctr pointer is passed on stack - stmdb sp!, {r4-r8, lr} - - mov r4, $inp @ copy arguments - mov r5, $out - mov r6, $len - mov r7, $key - ldr r8, [ip, #12] @ load counter LSW - vld1.8 {@XMM[1]}, [ip] @ load whole counter value -#ifdef __ARMEL__ - rev r8, r8 -#endif - sub sp, sp, #0x10 - vst1.8 {@XMM[1]}, [sp] @ copy counter value - sub sp, sp, #0x10 - -.Lctr_enc_short_loop: - add r0, sp, #0x10 @ input counter value - mov r1, sp @ output on the stack - mov r2, r7 @ key - - bl aes_nohw_encrypt - - vld1.8 {@XMM[0]}, [r4]! @ load input - vld1.8 {@XMM[1]}, [sp] @ load encrypted counter - add r8, r8, #1 -#ifdef __ARMEL__ - rev r0, r8 - str r0, [sp, #0x1c] @ next counter value -#else - str r8, [sp, #0x1c] @ next counter value -#endif - veor @XMM[0],@XMM[0],@XMM[1] - vst1.8 {@XMM[0]}, [r5]! @ store output - subs r6, r6, #1 - bne .Lctr_enc_short_loop - - vmov.i32 q0, #0 - vmov.i32 q1, #0 - vstmia sp!, {q0-q1} - - ldmia sp!, {r4-r8, pc} + @ OpenSSL contains aes_nohw_* fallback code here. We patch this + @ out to retain a constant-time implementation. 
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks ___ } diff --git a/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl b/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl index 81331bfa..3bb28190 100644 --- a/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl +++ b/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl @@ -811,9 +811,6 @@ ___ $code.=<<___; .text -.extern aes_nohw_encrypt -.extern aes_nohw_decrypt - .type _bsaes_encrypt8,\@abi-omnipotent .align 64 _bsaes_encrypt8: @@ -1609,22 +1606,14 @@ $code.=<<___; ___ } $code.=<<___; -.extern aes_nohw_cbc_encrypt .globl bsaes_cbc_encrypt .type bsaes_cbc_encrypt,\@abi-omnipotent .align 16 bsaes_cbc_encrypt: .cfi_startproc -___ -$code.=<<___ if ($win64); - mov 48(%rsp),$arg6 # pull direction flag -___ -$code.=<<___; - cmp \$0,$arg6 - jne aes_nohw_cbc_encrypt - cmp \$128,$arg3 - jb aes_nohw_cbc_encrypt - + # In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for + # short inputs or if enc is one. We patch this out, using bsaes for all + # input sizes. The caller is required to ensure enc is zero. mov %rsp, %rax .Lcbc_dec_prologue: push %rbp @@ -1683,6 +1672,8 @@ $code.=<<___; movdqu (%rbx), @XMM[15] # load IV sub \$8,$len + jc .Lcbc_dec_loop_done + .Lcbc_dec_loop: movdqu 0x00($inp), @XMM[0] # load input movdqu 0x10($inp), @XMM[1] @@ -1727,6 +1718,7 @@ $code.=<<___; sub \$8,$len jnc .Lcbc_dec_loop +.Lcbc_dec_loop_done: add \$8,$len jz .Lcbc_dec_done @@ -1859,13 +1851,12 @@ $code.=<<___; jmp .Lcbc_dec_done .align 16 .Lcbc_dec_one: - lea ($inp), $arg1 - lea 0x20(%rbp), $arg2 # buffer output - lea ($key), $arg3 - call aes_nohw_decrypt # doesn't touch %xmm - pxor 0x20(%rbp), @XMM[15] # ^= IV - movdqu @XMM[15], ($out) # write output - movdqa @XMM[0], @XMM[15] # IV + movdqa @XMM[15], 0x20(%rbp) # put aside IV + call _bsaes_decrypt8 + pxor 0x20(%rbp), @XMM[0] # ^= IV + movdqu 0x00($inp), @XMM[15] # IV + movdqu @XMM[0], 0x00($out) # write output + jmp .Lcbc_dec_done .Lcbc_dec_done: movdqu @XMM[15], (%rbx) # return IV @@ -1968,8 +1959,8 @@ $code.=<<___; mov $arg3, $len mov $arg4, $key movdqa %xmm0, 0x20(%rbp) # copy counter - cmp \$8, $arg3 - jb .Lctr_enc_short + # In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + # out to retain a constant-time implementation. mov %eax, %ebx # rounds shl \$7, %rax # 128 bytes per inner round key @@ -2103,27 +2094,9 @@ $code.=<<___; movdqu 0x60($inp), @XMM[14] pxor @XMM[14], @XMM[2] movdqu @XMM[2], 0x60($out) - jmp .Lctr_enc_done - -.align 16 -.Lctr_enc_short: - lea 0x20(%rbp), $arg1 - lea 0x30(%rbp), $arg2 - lea ($key), $arg3 - call aes_nohw_encrypt - movdqu ($inp), @XMM[1] - lea 16($inp), $inp - mov 0x2c(%rbp), %eax # load 32-bit counter - bswap %eax - pxor 0x30(%rbp), @XMM[1] - inc %eax # increment - movdqu @XMM[1], ($out) - bswap %eax - lea 16($out), $out - mov %eax, 0x2c(%rsp) # save 32-bit counter - dec $len - jnz .Lctr_enc_short + # OpenSSL contains aes_nohw_* fallback code here. We patch this + # out to retain a constant-time implementation. .Lctr_enc_done: lea (%rsp), %rax pxor %xmm0, %xmm0 diff --git a/src/crypto/fipsmodule/aes/internal.h b/src/crypto/fipsmodule/aes/internal.h index a05abcbf..63070bc6 100644 --- a/src/crypto/fipsmodule/aes/internal.h +++ b/src/crypto/fipsmodule/aes/internal.h @@ -133,7 +133,7 @@ void aes_hw_ecb_encrypt(const uint8_t *in, uint8_t *out, size_t length, #if defined(BSAES) // On platforms where BSAES gets defined (just above), then these functions are -// provided by asm. +// provided by asm. Note |bsaes_cbc_encrypt| requires |enc| to be zero. 
void bsaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length, const AES_KEY *key, uint8_t ivec[16], int enc); void bsaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, diff --git a/src/crypto/fipsmodule/cipher/e_aes.c b/src/crypto/fipsmodule/cipher/e_aes.c index 51a1fb1c..a1859d74 100644 --- a/src/crypto/fipsmodule/cipher/e_aes.c +++ b/src/crypto/fipsmodule/cipher/e_aes.c @@ -111,7 +111,8 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, } } else if (bsaes_capable() && mode == EVP_CIPH_CBC_MODE) { ret = aes_nohw_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks); - dat->block = aes_nohw_decrypt; + // If |dat->stream.cbc| is provided, |dat->block| is never used. + dat->block = NULL; dat->stream.cbc = bsaes_cbc_encrypt; } else if (vpaes_capable()) { ret = vpaes_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks); @@ -138,7 +139,8 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, } } else if (bsaes_capable() && mode == EVP_CIPH_CTR_MODE) { ret = aes_nohw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks); - dat->block = aes_nohw_encrypt; + // If |dat->stream.ctr| is provided, |dat->block| is never used. + dat->block = NULL; dat->stream.ctr = bsaes_ctr32_encrypt_blocks; } else if (vpaes_capable()) { ret = vpaes_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks); diff --git a/src/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl b/src/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl new file mode 100644 index 00000000..972be419 --- /dev/null +++ b/src/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl @@ -0,0 +1,287 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It +# implements the multiplication algorithm described in: +# +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software +# Polynomial Multiplication on ARM Processors using the NEON Engine. +# +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf +# +# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is +# AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit +# NEON, the low and high halves of the 128-bit register q0 are accessible as +# 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of +# vN. Where the 32-bit version would use the upper half, this file must keep +# halves in separate registers. +# +# The other distinction is in syntax. 32-bit NEON embeds lane information in the +# instruction name, while AArch64 uses suffixes on the registers. For instance, +# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written: +# +# vshl.i64 q0, q0, #1 +# +# in 64-bit, it would be written: +# +# shl v0.2d, v0.2d, #1 +# +# See Programmer's Guide for ARMv8-A, section 7 for details. 
+# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf +# +# Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ +# only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials +# and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit +# polynomial and is conditioned on the PMULL extension. This file emulates the +# latter with the former. + +use strict; + +my $flavour = shift; +my $output; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; + my $dir = $1; + my $xlate; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3)); # argument block +my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4)); +my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7)); +# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers +# to spare. +my ($t0, $t1, $t2, $t3) = map("v$_", (16..19)); +my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23)); +my ($k48_k32, $k16_k0) = map("v$_", (24..25)); + +my $code = ""; + +# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b +# must be distinct from $t* and $k*. $t* are clobbered by the emitted code. +sub clmul64x64 { +my ($r, $a, $b) = @_; +$code .= <<___; + ext $t0.8b, $a.8b, $a.8b, #1 // A1 + pmull $t0.8h, $t0.8b, $b.8b // F = A1*B + ext $r.8b, $b.8b, $b.8b, #1 // B1 + pmull $r.8h, $a.8b, $r.8b // E = A*B1 + ext $t1.8b, $a.8b, $a.8b, #2 // A2 + pmull $t1.8h, $t1.8b, $b.8b // H = A2*B + ext $t3.8b, $b.8b, $b.8b, #2 // B2 + pmull $t3.8h, $a.8b, $t3.8b // G = A*B2 + ext $t2.8b, $a.8b, $a.8b, #3 // A3 + eor $t0.16b, $t0.16b, $r.16b // L = E + F + pmull $t2.8h, $t2.8b, $b.8b // J = A3*B + ext $r.8b, $b.8b, $b.8b, #3 // B3 + eor $t1.16b, $t1.16b, $t3.16b // M = G + H + pmull $r.8h, $a.8b, $r.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor \$t0#lo, \$t0#lo, \$t0#hi @ t0 = P0 + P1 (L) + // vand \$t0#hi, \$t0#hi, \$k48 + // veor \$t0#lo, \$t0#lo, \$t0#hi + // + // veor \$t1#lo, \$t1#lo, \$t1#hi @ t1 = P2 + P3 (M) + // vand \$t1#hi, \$t1#hi, \$k32 + // veor \$t1#lo, \$t1#lo, \$t1#hi + // + // veor \$t2#lo, \$t2#lo, \$t2#hi @ t2 = P4 + P5 (N) + // vand \$t2#hi, \$t2#hi, \$k16 + // veor \$t2#lo, \$t2#lo, \$t2#hi + // + // veor \$t3#lo, \$t3#lo, \$t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 \$t3#hi, #0 + // + // \$kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext $t3.8b, $b.8b, $b.8b, #4 // B4 + eor $t2.16b, $t2.16b, $r.16b // N = I + J + pmull $t3.8h, $a.8b, $t3.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 $t0l_t1l.2d, $t0.2d, $t1.2d + zip1 $t2l_t3l.2d, $t2.2d, $t3.2d + zip2 $t0h_t1h.2d, $t0.2d, $t1.2d + zip2 $t2h_t3h.2d, $t2.2d, $t3.2d + eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b + eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b + and $t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b + and $t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b + eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b + eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b + zip1 $t0.2d, $t0l_t1l.2d, $t0h_t1h.2d + zip1 $t2.2d, $t2l_t3l.2d, $t2h_t3h.2d + zip2 $t1.2d, $t0l_t1l.2d, $t0h_t1h.2d + zip2 $t3.2d, $t2l_t3l.2d, $t2h_t3h.2d + + ext $t0.16b, $t0.16b, $t0.16b, #15 // t0 = t0 << 8 + ext $t1.16b, $t1.16b, $t1.16b, #14 // t1 = t1 << 16 + pmull $r.8h, $a.8b, $b.8b // D = A*B + ext $t3.16b, $t3.16b, $t3.16b, #12 // t3 = t3 << 32 + ext $t2.16b, $t2.16b, $t2.16b, #13 // t2 = t2 << 24 + eor $t0.16b, $t0.16b, $t1.16b + eor $t2.16b, $t2.16b, $t3.16b + eor $r.16b, $r.16b, $t0.16b + eor $r.16b, $r.16b, $t2.16b +___ +} + +$code .= <<___; +.text + +.global gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + // This function is adapted from gcm_init_v8. xC2 is t3. + ld1 {$t1.2d}, [x1] // load H + movi $t3.16b, #0xe1 + shl $t3.2d, $t3.2d, #57 // 0xc2.0 + ext $INlo.16b, $t1.16b, $t1.16b, #8 + ushr $t2.2d, $t3.2d, #63 + dup $t1.4s, $t1.s[1] + ext $t0.16b, $t2.16b, $t3.16b, #8 // t0=0xc2....01 + ushr $t2.2d, $INlo.2d, #63 + sshr $t1.4s, $t1.4s, #31 // broadcast carry bit + and $t2.16b, $t2.16b, $t0.16b + shl $INlo.2d, $INlo.2d, #1 + ext $t2.16b, $t2.16b, $t2.16b, #8 + and $t0.16b, $t0.16b, $t1.16b + orr $INlo.16b, $INlo.16b, $t2.16b // H<<<=1 + eor $Hlo.16b, $INlo.16b, $t0.16b // twisted H + st1 {$Hlo.2d}, [x0] // store Htable[0] + ret +.size gcm_init_neon,.-gcm_init_neon + +.global gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + ld1 {$INlo.16b}, [$Xi] // load Xi + ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H + ld1 {$Hhi.1d}, [$Htbl] + adrp x9, :pg_hi21:.Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {$k48_k32.2d, $k16_k0.2d}, [x9] + rev64 $INlo.16b, $INlo.16b // byteswap Xi + ext $INlo.16b, $INlo.16b, $INlo.16b, #8 + eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing + + mov $len, #16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.global gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + ld1 {$Xl.16b}, [$Xi] // load Xi + ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H + ld1 {$Hhi.1d}, [$Htbl] + adrp x9, :pg_hi21:.Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {$k48_k32.2d, $k16_k0.2d}, [x9] + rev64 $Xl.16b, $Xl.16b // byteswap Xi + ext $Xl.16b, $Xl.16b, $Xl.16b, #8 + eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing + +.Loop_neon: + ld1 {$INlo.16b}, [$inp], #16 // load inp + rev64 $INlo.16b, $INlo.16b // byteswap inp + ext $INlo.16b, $INlo.16b, $INlo.16b, #8 + eor $INlo.16b, $INlo.16b, $Xl.16b // inp ^= Xi + +.Lgmult_neon: + // Split the input into $INlo and $INhi. (The upper halves are unused, + // so it is okay to leave them alone.) 
+ ins $INhi.d[0], $INlo.d[1] +___ +&clmul64x64 ($Xl, $Hlo, $INlo); # H.lo·Xi.lo +$code .= <<___; + eor $INlo.8b, $INlo.8b, $INhi.8b // Karatsuba pre-processing +___ +&clmul64x64 ($Xm, $Hhl, $INlo); # (H.lo+H.hi)·(Xi.lo+Xi.hi) +&clmul64x64 ($Xh, $Hhi, $INhi); # H.hi·Xi.hi +$code .= <<___; + ext $t0.16b, $Xl.16b, $Xh.16b, #8 + eor $Xm.16b, $Xm.16b, $Xl.16b // Karatsuba post-processing + eor $Xm.16b, $Xm.16b, $Xh.16b + eor $Xm.16b, $Xm.16b, $t0.16b // Xm overlaps Xh.lo and Xl.hi + ins $Xl.d[1], $Xm.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. + // ins $Xh.d[0], $Xm.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl $t1.2d, $Xl.2d, #57 // 1st phase + shl $t2.2d, $Xl.2d, #62 + eor $t2.16b, $t2.16b, $t1.16b // + shl $t1.2d, $Xl.2d, #63 + eor $t2.16b, $t2.16b, $t1.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor $t2.16b, $t2.16b, $Xm.16b + ins $Xl.d[1], $t2.d[0] // Xl.d[1] ^= t2.d[0] + ins $Xh.d[0], $t2.d[1] // Xh.d[0] ^= t2.d[1] + + ushr $t2.2d, $Xl.2d, #1 // 2nd phase + eor $Xh.16b, $Xh.16b,$Xl.16b + eor $Xl.16b, $Xl.16b,$t2.16b // + ushr $t2.2d, $t2.2d, #6 + ushr $Xl.2d, $Xl.2d, #1 // + eor $Xl.16b, $Xl.16b, $Xh.16b // + eor $Xl.16b, $Xl.16b, $t2.16b // + + subs $len, $len, #16 + bne .Loop_neon + + rev64 $Xl.16b, $Xl.16b // byteswap Xi and write + ext $Xl.16b, $Xl.16b, $Xl.16b, #8 + st1 {$Xl.16b}, [$Xi] + + ret +.size gcm_ghash_neon,.-gcm_ghash_neon + +.section .rodata +.align 4 +.Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.asciz "GHASH for ARMv8, derived from ARMv4 version by <appro\@openssl.org>" +.align 2 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + print $_,"\n"; +} +close STDOUT; # enforce flush diff --git a/src/crypto/fipsmodule/modes/internal.h b/src/crypto/fipsmodule/modes/internal.h index 9a081ebd..dec1e56c 100644 --- a/src/crypto/fipsmodule/modes/internal.h +++ b/src/crypto/fipsmodule/modes/internal.h @@ -327,28 +327,12 @@ void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]); void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, size_t len); -#if defined(OPENSSL_ARM) -// 32-bit ARM also has support for doing GCM with NEON instructions. OPENSSL_INLINE int gcm_neon_capable(void) { return CRYPTO_is_NEON_capable(); } void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]); void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]); void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, size_t len); -#else -// AArch64 only has the ARMv8 versions of functions. 
-OPENSSL_INLINE int gcm_neon_capable(void) { return 0; } -OPENSSL_INLINE void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]) { - abort(); -} -OPENSSL_INLINE void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]) { - abort(); -} -OPENSSL_INLINE void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], - const uint8_t *inp, size_t len) { - abort(); -} -#endif // OPENSSL_ARM #elif defined(OPENSSL_PPC64LE) #define GHASH_ASM_PPC64LE diff --git a/src/crypto/stack/stack.c b/src/crypto/stack/stack.c index ec557c02..599bd7b1 100644 --- a/src/crypto/stack/stack.c +++ b/src/crypto/stack/stack.c @@ -56,6 +56,7 @@ #include <openssl/stack.h> +#include <assert.h> #include <string.h> #include <openssl/mem.h> @@ -272,36 +273,39 @@ int sk_find(const _STACK *sk, size_t *out_index, const void *p, return 0; } - // sk->comp is a function that takes pointers to pointers to elements, but - // qsort and bsearch take a comparison function that just takes pointers to - // elements. However, since we're passing an array of pointers to - // qsort/bsearch, we can just cast the comparison function and everything - // works. + // The stack is sorted, so binary search to find the element. // - // TODO(davidben): This is undefined behavior, but the call is in libc so, - // e.g., CFI does not notice. Unfortunately, |bsearch| is missing a void* - // parameter in its callback and |bsearch_s| is a mess of incompatibility. - const void *const *r = bsearch(&p, sk->data, sk->num, sizeof(void *), - (int (*)(const void *, const void *))sk->comp); - if (r == NULL) { - return 0; - } - size_t idx = ((void **)r) - sk->data; - // This function always returns the first result. Note this logic is, in the - // worst case, O(N) rather than O(log(N)). If this ever becomes a problem, - // restore https://boringssl-review.googlesource.com/c/boringssl/+/32115/ - // which integrates the preference into the binary search. - while (idx > 0) { - const void *elem = sk->data[idx - 1]; - if (call_cmp_func(sk->comp, &p, &elem) != 0) { - break; + // |lo| and |hi| maintain a half-open interval of where the answer may be. All + // indices such that |lo <= idx < hi| are candidates. + size_t lo = 0, hi = sk->num; + while (lo < hi) { + // Bias |mid| towards |lo|. See the |r == 0| case below. + size_t mid = lo + (hi - lo - 1) / 2; + assert(lo <= mid && mid < hi); + const void *elem = sk->data[mid]; + int r = call_cmp_func(sk->comp, &p, &elem); + if (r > 0) { + lo = mid + 1; // |mid| is too low. + } else if (r < 0) { + hi = mid; // |mid| is too high. + } else { + // |mid| matches. However, this function returns the earliest match, so we + // can only return if the range has size one. + if (hi - lo == 1) { + if (out_index != NULL) { + *out_index = mid; + } + return 1; + } + // The sample is biased towards |lo|. |mid| can only be |hi - 1| if + // |hi - lo| was one, so this makes forward progress. + assert(mid + 1 < hi); + hi = mid + 1; } - idx--; - } - if (out_index) { - *out_index = idx; } - return 1; + + assert(lo == hi); + return 0; // Not found. } void *sk_shift(_STACK *sk) { @@ -362,7 +366,10 @@ void sk_sort(_STACK *sk) { return; } - // See the comment in sk_find about this cast. + // sk->comp is a function that takes pointers to pointers to elements, but + // qsort take a comparison function that just takes pointers to elements. + // However, since we're passing an array of pointers to qsort, we can just + // cast the comparison function and everything works. 
// // TODO(davidben): This is undefined behavior, but the call is in libc so, // e.g., CFI does not notice. Unfortunately, |qsort| is missing a void* diff --git a/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm b/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm index bb5e4c09..5fa4053e 100644 --- a/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm +++ b/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm @@ -12,9 +12,6 @@ default rel section .text code align=64 -EXTERN aes_nohw_encrypt -EXTERN aes_nohw_decrypt - ALIGN 64 _bsaes_encrypt8: @@ -1080,17 +1077,13 @@ DB 102,15,56,0,244 DB 0F3h,0C3h ;repret -EXTERN aes_nohw_cbc_encrypt global bsaes_cbc_encrypt ALIGN 16 bsaes_cbc_encrypt: - mov r11d,DWORD[48+rsp] - cmp r11d,0 - jne NEAR aes_nohw_cbc_encrypt - cmp r8,128 - jb NEAR aes_nohw_cbc_encrypt + + mov rax,rsp $L$cbc_dec_prologue: @@ -1146,6 +1139,8 @@ $L$cbc_dec_body: movdqu xmm14,XMMWORD[rbx] sub r14,8 + jc NEAR $L$cbc_dec_loop_done + $L$cbc_dec_loop: movdqu xmm15,XMMWORD[r12] movdqu xmm0,XMMWORD[16+r12] @@ -1190,6 +1185,7 @@ $L$cbc_dec_loop: sub r14,8 jnc NEAR $L$cbc_dec_loop +$L$cbc_dec_loop_done: add r14,8 jz NEAR $L$cbc_dec_done @@ -1322,13 +1318,12 @@ $L$cbc_dec_two: jmp NEAR $L$cbc_dec_done ALIGN 16 $L$cbc_dec_one: - lea rcx,[r12] - lea rdx,[32+rbp] - lea r8,[r15] - call aes_nohw_decrypt - pxor xmm14,XMMWORD[32+rbp] - movdqu XMMWORD[r13],xmm14 - movdqa xmm14,xmm15 + movdqa XMMWORD[32+rbp],xmm14 + call _bsaes_decrypt8 + pxor xmm15,XMMWORD[32+rbp] + movdqu xmm14,XMMWORD[r12] + movdqu XMMWORD[r13],xmm15 + jmp NEAR $L$cbc_dec_done $L$cbc_dec_done: movdqu XMMWORD[rbx],xmm14 @@ -1423,8 +1418,8 @@ $L$ctr_enc_body: mov r14,r8 mov r15,r9 movdqa XMMWORD[32+rbp],xmm0 - cmp r8,8 - jb NEAR $L$ctr_enc_short + + mov ebx,eax shl rax,7 @@ -1558,26 +1553,8 @@ $L$ctr_enc_loop_done: movdqu xmm13,XMMWORD[96+r12] pxor xmm1,xmm13 movdqu XMMWORD[96+r13],xmm1 - jmp NEAR $L$ctr_enc_done -ALIGN 16 -$L$ctr_enc_short: - lea rcx,[32+rbp] - lea rdx,[48+rbp] - lea r8,[r15] - call aes_nohw_encrypt - movdqu xmm0,XMMWORD[r12] - lea r12,[16+r12] - mov eax,DWORD[44+rbp] - bswap eax - pxor xmm0,XMMWORD[48+rbp] - inc eax - movdqu XMMWORD[r13],xmm0 - bswap eax - lea r13,[16+r13] - mov DWORD[44+rsp],eax - dec r14 - jnz NEAR $L$ctr_enc_short + $L$ctr_enc_done: lea rax,[rsp] |
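A note on the new ghash-neon-armv8 code above: the .Lgmult_neon path assembles a 128x128-bit carry-less product out of three 64x64-bit products using Karatsuba — H.lo*Xi.lo (Xl), H.hi*Xi.hi (Xh) and (H.lo^H.hi)*(Xi.lo^Xi.hi) (Xm) — followed by the "Karatsuba post-processing" XORs. The assembly builds each 64x64 multiply from 8x8-bit pmull lanes plus the k48/k32/k16/k0 masking; the C sketch below replaces that with a schoolbook clmul64() and shows only the multiply-and-combine algebra, not the twisted-H encoding or the final reduction. Helper names are made up for the example.

    #include <stdint.h>

    // Carry-less 64x64 -> 128-bit multiply (schoolbook; stands in for the
    // pmull/mask/shift sequence in the assembly).
    static void clmul64(uint64_t a, uint64_t b, uint64_t out[2]) {
      uint64_t lo = 0, hi = 0;
      for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
          lo ^= a << i;
          if (i != 0) {
            hi ^= a >> (64 - i);
          }
        }
      }
      out[0] = lo;  // low 64 bits of the product
      out[1] = hi;  // high 64 bits of the product
    }

    // Karatsuba: a 128x128 carry-less multiply from three 64x64 multiplies,
    // mirroring the Xl, Xh and Xm products and post-processing above.
    static void clmul128(const uint64_t a[2], const uint64_t b[2],
                         uint64_t out[4]) {
      uint64_t lo[2], hi[2], mid[2];
      clmul64(a[0], b[0], lo);                 // Xl = a.lo * b.lo
      clmul64(a[1], b[1], hi);                 // Xh = a.hi * b.hi
      clmul64(a[0] ^ a[1], b[0] ^ b[1], mid);  // Xm = (a.lo^a.hi)*(b.lo^b.hi)
      mid[0] ^= lo[0] ^ hi[0];                 // Karatsuba post-processing
      mid[1] ^= lo[1] ^ hi[1];
      out[0] = lo[0];
      out[1] = lo[1] ^ mid[0];
      out[2] = hi[0] ^ mid[1];
      out[3] = hi[1];
    }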
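The modes/internal.h hunk above removes the AArch64 abort() stubs because ghash-neon-armv8.S is now generated for AArch64 as well, so the gcm_*_neon routines are available on both 32-bit and 64-bit ARM. A minimal sketch of the selection order this enables follows; it assumes the gcm_pmull_capable() helper and gcm_init_v8() declaration that accompany these functions in modes/internal.h, and the function-pointer shapes below. It paraphrases the dispatch rather than quoting gcm.c.

    // Illustrative only; not code from gcm.c. Assumes the declarations and
    // types in modes/internal.h.
    typedef void (*gmult_func)(uint64_t Xi[2], const u128 Htable[16]);
    typedef void (*ghash_func)(uint64_t Xi[2], const u128 Htable[16],
                               const uint8_t *inp, size_t len);

    static void pick_arm_ghash(gmult_func *out_mult, ghash_func *out_hash,
                               u128 Htable[16], const uint64_t H[2]) {
      if (gcm_pmull_capable()) {
        // ARMv8 PMULL path (gcm_*_v8), unchanged by this sync.
        gcm_init_v8(Htable, H);
        *out_mult = gcm_gmult_v8;
        *out_hash = gcm_ghash_v8;
      } else if (gcm_neon_capable()) {
        // Plain-NEON path: previously 32-bit ARM only, now also AArch64
        // instead of the removed abort() stubs.
        gcm_init_neon(Htable, H);
        *out_mult = gcm_gmult_neon;
        *out_hash = gcm_ghash_neon;
      }
    }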
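The rewritten sk_find above replaces the bsearch call (and its potentially O(N) walk back through duplicates) with a single binary search whose midpoint is biased towards |lo|, so it converges directly on the earliest matching index. Below is a standalone illustration of the same idea over a sorted int array; find_first and its exact behaviour are an example only, not BoringSSL API.

    #include <stddef.h>

    // Return the first index of |key| in sorted |a| of length |n|, or -1 if
    // absent. Same shape as the sk_find loop above: the midpoint is biased
    // towards |lo|, so shrinking the range to |hi = mid + 1| still makes
    // progress and the loop ends on the earliest match.
    static int find_first(const int *a, size_t n, int key) {
      size_t lo = 0, hi = n;
      while (lo < hi) {
        size_t mid = lo + (hi - lo - 1) / 2;  // biased towards |lo|
        if (a[mid] < key) {
          lo = mid + 1;  // |mid| is too low.
        } else if (a[mid] > key) {
          hi = mid;  // |mid| is too high.
        } else if (hi - lo == 1) {
          return (int)lo;  // Range of size one: earliest match.
        } else {
          hi = mid + 1;  // Keep the match, drop everything above it.
        }
      }
      return -1;  // Not found.
    }

On {1, 2, 2, 2, 3} with key 2 this returns index 1, matching sk_find's first-result contract, and the candidate range roughly halves every iteration, so the worst case stays O(log N) even with many duplicate elements.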
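On the sk_sort comment above: the cast reinterprets a comparator that takes pointers to element pointers as the pointer-to-element comparator qsort expects, which the TODO notes is formally undefined behavior even though the callee receives the same addresses either way. When the comparator is known at compile time, the standards-clean alternative is a thin wrapper with qsort's exact signature, as in the sketch below; sk_sort cannot do this generically because sk->comp is only known at run time and portable qsort has no context parameter. The names here are illustrative, not BoringSSL's.

    #include <stdlib.h>
    #include <string.h>

    // Comparator in the stack's style: it receives pointers to the array
    // slots, i.e. pointers to the element pointers.
    static int cmp_str_ptrs(const char *const *a, const char *const *b) {
      return strcmp(*a, *b);
    }

    // qsort-compatible wrapper with the standard signature, forwarding to the
    // typed comparator instead of casting the function pointer.
    static int cmp_str_void(const void *a, const void *b) {
      return cmp_str_ptrs(a, b);
    }

    int main(void) {
      const char *names[] = {"ghash", "aes", "stack"};
      qsort(names, sizeof(names) / sizeof(names[0]), sizeof(names[0]),
            cmp_str_void);
      return 0;
    }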