author     Robert Sloan <varomodt@google.com>                       2019-03-19 02:02:05 -0700
committer  android-build-merger <android-build-merger@google.com>  2019-03-19 02:02:05 -0700
commit     767904931a5f7012915cf015d54ca571dfb86e03 (patch)
tree       d5956e0da0ddbeb7e907378720fcbc8c6926beee
parent     8c9200ba9943ec79d6e957b2893f9a1455208778 (diff)
parent     bdfba2a0b5cfa78c35c71b35bd385a9acfc3ec14 (diff)
download   boringssl-767904931a5f7012915cf015d54ca571dfb86e03.tar.gz

external/boringssl: Sync to fdb48f98612e934eab339b4871484b1c987553e2. am: 9d5d1a76eb am: d54d28eca9
am: bdfba2a0b5

Change-Id: I13d1010c30643e3ad126ddadcac5f1eea83087f4
-rw-r--r--  BORINGSSL_REVISION                                      2
-rw-r--r--  eureka.mk                                               1
-rw-r--r--  ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S      337
-rw-r--r--  ios-arm/crypto/fipsmodule/bsaes-armv7.S                95
-rw-r--r--  linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S    339
-rw-r--r--  linux-arm/crypto/fipsmodule/bsaes-armv7.S              95
-rw-r--r--  linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S          53
-rw-r--r--  mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S            50
-rw-r--r--  sources.bp                                              1
-rw-r--r--  sources.mk                                              1
-rw-r--r--  src/crypto/fipsmodule/CMakeLists.txt                    2
-rw-r--r--  src/crypto/fipsmodule/aes/asm/bsaes-armv7.pl           95
-rw-r--r--  src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl          59
-rw-r--r--  src/crypto/fipsmodule/aes/internal.h                    2
-rw-r--r--  src/crypto/fipsmodule/cipher/e_aes.c                    6
-rw-r--r--  src/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl   287
-rw-r--r--  src/crypto/fipsmodule/modes/internal.h                 16
-rw-r--r--  src/crypto/stack/stack.c                               63
-rw-r--r--  win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm          51
19 files changed, 1118 insertions, 437 deletions
diff --git a/BORINGSSL_REVISION b/BORINGSSL_REVISION
index c10534e3..42ad2f07 100644
--- a/BORINGSSL_REVISION
+++ b/BORINGSSL_REVISION
@@ -1 +1 @@
-35941f2923155664bd9fa5d897cb336a0ab729a1
+fdb48f98612e934eab339b4871484b1c987553e2
diff --git a/eureka.mk b/eureka.mk
index b0f09c94..43de1776 100644
--- a/eureka.mk
+++ b/eureka.mk
@@ -298,6 +298,7 @@ linux_aarch64_sources := \
linux-aarch64/crypto/chacha/chacha-armv8.S\
linux-aarch64/crypto/fipsmodule/aesv8-armx64.S\
linux-aarch64/crypto/fipsmodule/armv8-mont.S\
+ linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S\
linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S\
linux-aarch64/crypto/fipsmodule/sha1-armv8.S\
linux-aarch64/crypto/fipsmodule/sha256-armv8.S\
diff --git a/ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S b/ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
new file mode 100644
index 00000000..62bdc9a8
--- /dev/null
+++ b/ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
@@ -0,0 +1,337 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+
+.globl _gcm_init_neon
+.private_extern _gcm_init_neon
+
+.align 4
+_gcm_init_neon:
+ // This function is adapted from gcm_init_v8. xC2 is t3.
+ ld1 {v17.2d}, [x1] // load H
+ movi v19.16b, #0xe1
+ shl v19.2d, v19.2d, #57 // 0xc2.0
+ ext v3.16b, v17.16b, v17.16b, #8
+ ushr v18.2d, v19.2d, #63
+ dup v17.4s, v17.s[1]
+ ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
+ ushr v18.2d, v3.2d, #63
+ sshr v17.4s, v17.4s, #31 // broadcast carry bit
+ and v18.16b, v18.16b, v16.16b
+ shl v3.2d, v3.2d, #1
+ ext v18.16b, v18.16b, v18.16b, #8
+ and v16.16b, v16.16b, v17.16b
+ orr v3.16b, v3.16b, v18.16b // H<<<=1
+ eor v5.16b, v3.16b, v16.16b // twisted H
+ st1 {v5.2d}, [x0] // store Htable[0]
+ ret
+
+
+.globl _gcm_gmult_neon
+.private_extern _gcm_gmult_neon
+
+.align 4
+_gcm_gmult_neon:
+ ld1 {v3.16b}, [x0] // load Xi
+ ld1 {v5.1d}, [x1], #8 // load twisted H
+ ld1 {v6.1d}, [x1]
+ adrp x9, Lmasks@PAGE // load constants
+ add x9, x9, Lmasks@PAGEOFF
+ ld1 {v24.2d, v25.2d}, [x9]
+ rev64 v3.16b, v3.16b // byteswap Xi
+ ext v3.16b, v3.16b, v3.16b, #8
+ eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
+
+ mov x3, #16
+ b Lgmult_neon
+
+
+.globl _gcm_ghash_neon
+.private_extern _gcm_ghash_neon
+
+.align 4
+_gcm_ghash_neon:
+ ld1 {v0.16b}, [x0] // load Xi
+ ld1 {v5.1d}, [x1], #8 // load twisted H
+ ld1 {v6.1d}, [x1]
+ adrp x9, Lmasks@PAGE // load constants
+ add x9, x9, Lmasks@PAGEOFF
+ ld1 {v24.2d, v25.2d}, [x9]
+ rev64 v0.16b, v0.16b // byteswap Xi
+ ext v0.16b, v0.16b, v0.16b, #8
+ eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
+
+Loop_neon:
+ ld1 {v3.16b}, [x2], #16 // load inp
+ rev64 v3.16b, v3.16b // byteswap inp
+ ext v3.16b, v3.16b, v3.16b, #8
+ eor v3.16b, v3.16b, v0.16b // inp ^= Xi
+
+Lgmult_neon:
+ // Split the input into v3 and v4. (The upper halves are unused,
+ // so it is okay to leave them alone.)
+ ins v4.d[0], v3.d[1]
+ ext v16.8b, v5.8b, v5.8b, #1 // A1
+ pmull v16.8h, v16.8b, v3.8b // F = A1*B
+ ext v0.8b, v3.8b, v3.8b, #1 // B1
+ pmull v0.8h, v5.8b, v0.8b // E = A*B1
+ ext v17.8b, v5.8b, v5.8b, #2 // A2
+ pmull v17.8h, v17.8b, v3.8b // H = A2*B
+ ext v19.8b, v3.8b, v3.8b, #2 // B2
+ pmull v19.8h, v5.8b, v19.8b // G = A*B2
+ ext v18.8b, v5.8b, v5.8b, #3 // A3
+ eor v16.16b, v16.16b, v0.16b // L = E + F
+ pmull v18.8h, v18.8b, v3.8b // J = A3*B
+ ext v0.8b, v3.8b, v3.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v0.8h, v5.8b, v0.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v3.8b, v3.8b, #4 // B4
+ eor v18.16b, v18.16b, v0.16b // N = I + J
+ pmull v19.8h, v5.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v0.8h, v5.8b, v3.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v0.16b, v0.16b, v16.16b
+ eor v0.16b, v0.16b, v18.16b
+ eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
+ ext v16.8b, v7.8b, v7.8b, #1 // A1
+ pmull v16.8h, v16.8b, v3.8b // F = A1*B
+ ext v1.8b, v3.8b, v3.8b, #1 // B1
+ pmull v1.8h, v7.8b, v1.8b // E = A*B1
+ ext v17.8b, v7.8b, v7.8b, #2 // A2
+ pmull v17.8h, v17.8b, v3.8b // H = A2*B
+ ext v19.8b, v3.8b, v3.8b, #2 // B2
+ pmull v19.8h, v7.8b, v19.8b // G = A*B2
+ ext v18.8b, v7.8b, v7.8b, #3 // A3
+ eor v16.16b, v16.16b, v1.16b // L = E + F
+ pmull v18.8h, v18.8b, v3.8b // J = A3*B
+ ext v1.8b, v3.8b, v3.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v1.8h, v7.8b, v1.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v3.8b, v3.8b, #4 // B4
+ eor v18.16b, v18.16b, v1.16b // N = I + J
+ pmull v19.8h, v7.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v1.8h, v7.8b, v3.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v1.16b, v1.16b, v18.16b
+ ext v16.8b, v6.8b, v6.8b, #1 // A1
+ pmull v16.8h, v16.8b, v4.8b // F = A1*B
+ ext v2.8b, v4.8b, v4.8b, #1 // B1
+ pmull v2.8h, v6.8b, v2.8b // E = A*B1
+ ext v17.8b, v6.8b, v6.8b, #2 // A2
+ pmull v17.8h, v17.8b, v4.8b // H = A2*B
+ ext v19.8b, v4.8b, v4.8b, #2 // B2
+ pmull v19.8h, v6.8b, v19.8b // G = A*B2
+ ext v18.8b, v6.8b, v6.8b, #3 // A3
+ eor v16.16b, v16.16b, v2.16b // L = E + F
+ pmull v18.8h, v18.8b, v4.8b // J = A3*B
+ ext v2.8b, v4.8b, v4.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v2.8h, v6.8b, v2.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v4.8b, v4.8b, #4 // B4
+ eor v18.16b, v18.16b, v2.16b // N = I + J
+ pmull v19.8h, v6.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v2.8h, v6.8b, v4.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v2.16b, v2.16b, v16.16b
+ eor v2.16b, v2.16b, v18.16b
+ ext v16.16b, v0.16b, v2.16b, #8
+ eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
+ eor v1.16b, v1.16b, v2.16b
+ eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
+ ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
+ // This is a no-op due to the ins instruction below.
+ // ins v2.d[0], v1.d[1]
+
+ // equivalent of reduction_avx from ghash-x86_64.pl
+ shl v17.2d, v0.2d, #57 // 1st phase
+ shl v18.2d, v0.2d, #62
+ eor v18.16b, v18.16b, v17.16b //
+ shl v17.2d, v0.2d, #63
+ eor v18.16b, v18.16b, v17.16b //
+ // Note Xm contains {Xl.d[1], Xh.d[0]}.
+ eor v18.16b, v18.16b, v1.16b
+ ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
+ ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]
+
+ ushr v18.2d, v0.2d, #1 // 2nd phase
+ eor v2.16b, v2.16b,v0.16b
+ eor v0.16b, v0.16b,v18.16b //
+ ushr v18.2d, v18.2d, #6
+ ushr v0.2d, v0.2d, #1 //
+ eor v0.16b, v0.16b, v2.16b //
+ eor v0.16b, v0.16b, v18.16b //
+
+ subs x3, x3, #16
+ bne Loop_neon
+
+ rev64 v0.16b, v0.16b // byteswap Xi and write
+ ext v0.16b, v0.16b, v0.16b, #8
+ st1 {v0.16b}, [x0]
+
+ ret
+
+
+.section __TEXT,__const
+.align 4
+Lmasks:
+.quad 0x0000ffffffffffff // k48
+.quad 0x00000000ffffffff // k32
+.quad 0x000000000000ffff // k16
+.quad 0x0000000000000000 // k0
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif // !OPENSSL_NO_ASM
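
The "Karatsuba pre-processing" and "Karatsuba post-processing" comments in the generated file above refer to the usual three-multiplication split of the 128-bit operands into 64-bit halves. As a reference identity (an editorial aside, not part of the generated file): with A = A_h·x^64 + A_l and B = B_h·x^64 + B_l over GF(2)[x], where addition is XOR,

$$A \cdot B = A_h B_h\,x^{128} + \bigl[(A_h + A_l)(B_h + B_l) + A_h B_h + A_l B_l\bigr]\,x^{64} + A_l B_l,$$

so only three 64x64-bit carry-less products are needed: H.lo·Xi.lo, H.hi·Xi.hi, and (H.lo+H.hi)·(Xi.lo+Xi.hi). Those are exactly the three multiplication passes between Lgmult_neon and the post-processing step.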
diff --git a/ios-arm/crypto/fipsmodule/bsaes-armv7.S b/ios-arm/crypto/fipsmodule/bsaes-armv7.S
index 4d4b7cc7..dffc0c24 100644
--- a/ios-arm/crypto/fipsmodule/bsaes-armv7.S
+++ b/ios-arm/crypto/fipsmodule/bsaes-armv7.S
@@ -1086,12 +1086,6 @@ Lkey_loop:
@ don't save last round key
bx lr
-@ TODO(davidben): This should be aes_nohw_cbc_encrypt, but that function does
-@ not exist. Rather than add it, patch this fallback out. See
-@ https://crbug.com/boringssl/256.
-
-
-
.globl _bsaes_cbc_encrypt
.private_extern _bsaes_cbc_encrypt
#ifdef __thumb2__
@@ -1099,16 +1093,8 @@ Lkey_loop:
#endif
.align 5
_bsaes_cbc_encrypt:
-#ifndef __KERNEL__
- cmp r2, #128
-#ifndef __thumb__
- blo _AES_cbc_encrypt
-#else
- bhs 1f
- b _AES_cbc_encrypt
-1:
-#endif
-#endif
+ @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for
+ @ short inputs. We patch this out, using bsaes for all input sizes.
@ it is up to the caller to make sure we are called with enc == 0
@@ -1206,10 +1192,7 @@ Lcbc_dec_loop_finish:
adds r2, r2, #8
beq Lcbc_dec_done
- vld1.8 {q0}, [r0]! @ load input
- cmp r2, #2
- blo Lcbc_dec_one
- vld1.8 {q1}, [r0]!
+ @ Set up most parameters for the _bsaes_decrypt8 call.
#ifndef BSAES_ASM_EXTENDED_KEY
mov r4, sp @ pass the key
#else
@@ -1217,6 +1200,11 @@ Lcbc_dec_loop_finish:
#endif
mov r5, r10
vstmia r9, {q15} @ put aside IV
+
+ vld1.8 {q0}, [r0]! @ load input
+ cmp r2, #2
+ blo Lcbc_dec_one
+ vld1.8 {q1}, [r0]!
beq Lcbc_dec_two
vld1.8 {q2}, [r0]!
cmp r2, #4
@@ -1334,16 +1322,11 @@ Lcbc_dec_two:
.align 4
Lcbc_dec_one:
sub r0, r0, #0x10
- mov r10, r1 @ save original out pointer
- mov r1, r9 @ use the iv scratch space as out buffer
- mov r2, r3
- vmov q4,q15 @ just in case ensure that IV
- vmov q5,q0 @ and input are preserved
- bl _aes_nohw_decrypt
- vld1.8 {q0}, [r9] @ load result
- veor q0, q0, q4 @ ^= IV
- vmov q15, q5 @ q5 holds input
- vst1.8 {q0}, [r10] @ write output
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q15}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vst1.8 {q0}, [r1]! @ write output
Lcbc_dec_done:
#ifndef BSAES_ASM_EXTENDED_KEY
@@ -1361,7 +1344,6 @@ Lcbc_dec_bzero:@ wipe key schedule [if any]
VFP_ABI_POP
ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc}
-
.globl _bsaes_ctr32_encrypt_blocks
.private_extern _bsaes_ctr32_encrypt_blocks
#ifdef __thumb2__
@@ -1369,9 +1351,8 @@ Lcbc_dec_bzero:@ wipe key schedule [if any]
#endif
.align 5
_bsaes_ctr32_encrypt_blocks:
- cmp r2, #8 @ use plain AES for
- blo Lctr_enc_short @ small sizes
-
+ @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this
+ @ out to retain a constant-time implementation.
mov ip, sp
stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
VFP_ABI_PUSH
@@ -1547,50 +1528,8 @@ Lctr_enc_bzero:@ wipe key schedule [if any]
VFP_ABI_POP
ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return
-.align 4
-Lctr_enc_short:
- ldr ip, [sp] @ ctr pointer is passed on stack
- stmdb sp!, {r4,r5,r6,r7,r8, lr}
-
- mov r4, r0 @ copy arguments
- mov r5, r1
- mov r6, r2
- mov r7, r3
- ldr r8, [ip, #12] @ load counter LSW
- vld1.8 {q1}, [ip] @ load whole counter value
-#ifdef __ARMEL__
- rev r8, r8
-#endif
- sub sp, sp, #0x10
- vst1.8 {q1}, [sp] @ copy counter value
- sub sp, sp, #0x10
-
-Lctr_enc_short_loop:
- add r0, sp, #0x10 @ input counter value
- mov r1, sp @ output on the stack
- mov r2, r7 @ key
-
- bl _aes_nohw_encrypt
-
- vld1.8 {q0}, [r4]! @ load input
- vld1.8 {q1}, [sp] @ load encrypted counter
- add r8, r8, #1
-#ifdef __ARMEL__
- rev r0, r8
- str r0, [sp, #0x1c] @ next counter value
-#else
- str r8, [sp, #0x1c] @ next counter value
-#endif
- veor q0,q0,q1
- vst1.8 {q0}, [r5]! @ store output
- subs r6, r6, #1
- bne Lctr_enc_short_loop
-
- vmov.i32 q0, #0
- vmov.i32 q1, #0
- vstmia sp!, {q0,q1}
-
- ldmia sp!, {r4,r5,r6,r7,r8, pc}
+ @ OpenSSL contains aes_nohw_* fallback code here. We patch this
+ @ out to retain a constant-time implementation.
#endif
#endif // !OPENSSL_NO_ASM
diff --git a/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S b/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
new file mode 100644
index 00000000..1cfbec29
--- /dev/null
+++ b/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
@@ -0,0 +1,339 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+
+.globl gcm_init_neon
+.hidden gcm_init_neon
+.type gcm_init_neon,%function
+.align 4
+gcm_init_neon:
+ // This function is adapted from gcm_init_v8. xC2 is t3.
+ ld1 {v17.2d}, [x1] // load H
+ movi v19.16b, #0xe1
+ shl v19.2d, v19.2d, #57 // 0xc2.0
+ ext v3.16b, v17.16b, v17.16b, #8
+ ushr v18.2d, v19.2d, #63
+ dup v17.4s, v17.s[1]
+ ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
+ ushr v18.2d, v3.2d, #63
+ sshr v17.4s, v17.4s, #31 // broadcast carry bit
+ and v18.16b, v18.16b, v16.16b
+ shl v3.2d, v3.2d, #1
+ ext v18.16b, v18.16b, v18.16b, #8
+ and v16.16b, v16.16b, v17.16b
+ orr v3.16b, v3.16b, v18.16b // H<<<=1
+ eor v5.16b, v3.16b, v16.16b // twisted H
+ st1 {v5.2d}, [x0] // store Htable[0]
+ ret
+.size gcm_init_neon,.-gcm_init_neon
+
+.globl gcm_gmult_neon
+.hidden gcm_gmult_neon
+.type gcm_gmult_neon,%function
+.align 4
+gcm_gmult_neon:
+ ld1 {v3.16b}, [x0] // load Xi
+ ld1 {v5.1d}, [x1], #8 // load twisted H
+ ld1 {v6.1d}, [x1]
+ adrp x9, .Lmasks // load constants
+ add x9, x9, :lo12:.Lmasks
+ ld1 {v24.2d, v25.2d}, [x9]
+ rev64 v3.16b, v3.16b // byteswap Xi
+ ext v3.16b, v3.16b, v3.16b, #8
+ eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
+
+ mov x3, #16
+ b .Lgmult_neon
+.size gcm_gmult_neon,.-gcm_gmult_neon
+
+.globl gcm_ghash_neon
+.hidden gcm_ghash_neon
+.type gcm_ghash_neon,%function
+.align 4
+gcm_ghash_neon:
+ ld1 {v0.16b}, [x0] // load Xi
+ ld1 {v5.1d}, [x1], #8 // load twisted H
+ ld1 {v6.1d}, [x1]
+ adrp x9, .Lmasks // load constants
+ add x9, x9, :lo12:.Lmasks
+ ld1 {v24.2d, v25.2d}, [x9]
+ rev64 v0.16b, v0.16b // byteswap Xi
+ ext v0.16b, v0.16b, v0.16b, #8
+ eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
+
+.Loop_neon:
+ ld1 {v3.16b}, [x2], #16 // load inp
+ rev64 v3.16b, v3.16b // byteswap inp
+ ext v3.16b, v3.16b, v3.16b, #8
+ eor v3.16b, v3.16b, v0.16b // inp ^= Xi
+
+.Lgmult_neon:
+ // Split the input into v3 and v4. (The upper halves are unused,
+ // so it is okay to leave them alone.)
+ ins v4.d[0], v3.d[1]
+ ext v16.8b, v5.8b, v5.8b, #1 // A1
+ pmull v16.8h, v16.8b, v3.8b // F = A1*B
+ ext v0.8b, v3.8b, v3.8b, #1 // B1
+ pmull v0.8h, v5.8b, v0.8b // E = A*B1
+ ext v17.8b, v5.8b, v5.8b, #2 // A2
+ pmull v17.8h, v17.8b, v3.8b // H = A2*B
+ ext v19.8b, v3.8b, v3.8b, #2 // B2
+ pmull v19.8h, v5.8b, v19.8b // G = A*B2
+ ext v18.8b, v5.8b, v5.8b, #3 // A3
+ eor v16.16b, v16.16b, v0.16b // L = E + F
+ pmull v18.8h, v18.8b, v3.8b // J = A3*B
+ ext v0.8b, v3.8b, v3.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v0.8h, v5.8b, v0.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v3.8b, v3.8b, #4 // B4
+ eor v18.16b, v18.16b, v0.16b // N = I + J
+ pmull v19.8h, v5.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v0.8h, v5.8b, v3.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v0.16b, v0.16b, v16.16b
+ eor v0.16b, v0.16b, v18.16b
+ eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
+ ext v16.8b, v7.8b, v7.8b, #1 // A1
+ pmull v16.8h, v16.8b, v3.8b // F = A1*B
+ ext v1.8b, v3.8b, v3.8b, #1 // B1
+ pmull v1.8h, v7.8b, v1.8b // E = A*B1
+ ext v17.8b, v7.8b, v7.8b, #2 // A2
+ pmull v17.8h, v17.8b, v3.8b // H = A2*B
+ ext v19.8b, v3.8b, v3.8b, #2 // B2
+ pmull v19.8h, v7.8b, v19.8b // G = A*B2
+ ext v18.8b, v7.8b, v7.8b, #3 // A3
+ eor v16.16b, v16.16b, v1.16b // L = E + F
+ pmull v18.8h, v18.8b, v3.8b // J = A3*B
+ ext v1.8b, v3.8b, v3.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v1.8h, v7.8b, v1.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v3.8b, v3.8b, #4 // B4
+ eor v18.16b, v18.16b, v1.16b // N = I + J
+ pmull v19.8h, v7.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v1.8h, v7.8b, v3.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v1.16b, v1.16b, v18.16b
+ ext v16.8b, v6.8b, v6.8b, #1 // A1
+ pmull v16.8h, v16.8b, v4.8b // F = A1*B
+ ext v2.8b, v4.8b, v4.8b, #1 // B1
+ pmull v2.8h, v6.8b, v2.8b // E = A*B1
+ ext v17.8b, v6.8b, v6.8b, #2 // A2
+ pmull v17.8h, v17.8b, v4.8b // H = A2*B
+ ext v19.8b, v4.8b, v4.8b, #2 // B2
+ pmull v19.8h, v6.8b, v19.8b // G = A*B2
+ ext v18.8b, v6.8b, v6.8b, #3 // A3
+ eor v16.16b, v16.16b, v2.16b // L = E + F
+ pmull v18.8h, v18.8b, v4.8b // J = A3*B
+ ext v2.8b, v4.8b, v4.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v2.8h, v6.8b, v2.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v4.8b, v4.8b, #4 // B4
+ eor v18.16b, v18.16b, v2.16b // N = I + J
+ pmull v19.8h, v6.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v2.8h, v6.8b, v4.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v2.16b, v2.16b, v16.16b
+ eor v2.16b, v2.16b, v18.16b
+ ext v16.16b, v0.16b, v2.16b, #8
+ eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
+ eor v1.16b, v1.16b, v2.16b
+ eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
+ ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
+ // This is a no-op due to the ins instruction below.
+ // ins v2.d[0], v1.d[1]
+
+ // equivalent of reduction_avx from ghash-x86_64.pl
+ shl v17.2d, v0.2d, #57 // 1st phase
+ shl v18.2d, v0.2d, #62
+ eor v18.16b, v18.16b, v17.16b //
+ shl v17.2d, v0.2d, #63
+ eor v18.16b, v18.16b, v17.16b //
+ // Note Xm contains {Xl.d[1], Xh.d[0]}.
+ eor v18.16b, v18.16b, v1.16b
+ ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
+ ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]
+
+ ushr v18.2d, v0.2d, #1 // 2nd phase
+ eor v2.16b, v2.16b,v0.16b
+ eor v0.16b, v0.16b,v18.16b //
+ ushr v18.2d, v18.2d, #6
+ ushr v0.2d, v0.2d, #1 //
+ eor v0.16b, v0.16b, v2.16b //
+ eor v0.16b, v0.16b, v18.16b //
+
+ subs x3, x3, #16
+ bne .Loop_neon
+
+ rev64 v0.16b, v0.16b // byteswap Xi and write
+ ext v0.16b, v0.16b, v0.16b, #8
+ st1 {v0.16b}, [x0]
+
+ ret
+.size gcm_ghash_neon,.-gcm_ghash_neon
+
+.section .rodata
+.align 4
+.Lmasks:
+.quad 0x0000ffffffffffff // k48
+.quad 0x00000000ffffffff // k32
+.quad 0x000000000000ffff // k16
+.quad 0x0000000000000000 // k0
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif
+#endif // !OPENSSL_NO_ASM
diff --git a/linux-arm/crypto/fipsmodule/bsaes-armv7.S b/linux-arm/crypto/fipsmodule/bsaes-armv7.S
index 20b9bb05..0ad56bc7 100644
--- a/linux-arm/crypto/fipsmodule/bsaes-armv7.S
+++ b/linux-arm/crypto/fipsmodule/bsaes-armv7.S
@@ -1081,27 +1081,13 @@ _bsaes_key_convert:
@ don't save last round key
bx lr
.size _bsaes_key_convert,.-_bsaes_key_convert
-@ TODO(davidben): This should be aes_nohw_cbc_encrypt, but that function does
-@ not exist. Rather than add it, patch this fallback out. See
-@ https://crbug.com/boringssl/256.
-
-
-
.globl bsaes_cbc_encrypt
.hidden bsaes_cbc_encrypt
.type bsaes_cbc_encrypt,%function
.align 5
bsaes_cbc_encrypt:
-#ifndef __KERNEL__
- cmp r2, #128
-#ifndef __thumb__
- blo AES_cbc_encrypt
-#else
- bhs 1f
- b AES_cbc_encrypt
-1:
-#endif
-#endif
+ @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for
+ @ short inputs. We patch this out, using bsaes for all input sizes.
@ it is up to the caller to make sure we are called with enc == 0
@@ -1199,10 +1185,7 @@ bsaes_cbc_encrypt:
adds r2, r2, #8
beq .Lcbc_dec_done
- vld1.8 {q0}, [r0]! @ load input
- cmp r2, #2
- blo .Lcbc_dec_one
- vld1.8 {q1}, [r0]!
+ @ Set up most parameters for the _bsaes_decrypt8 call.
#ifndef BSAES_ASM_EXTENDED_KEY
mov r4, sp @ pass the key
#else
@@ -1210,6 +1193,11 @@ bsaes_cbc_encrypt:
#endif
mov r5, r10
vstmia r9, {q15} @ put aside IV
+
+ vld1.8 {q0}, [r0]! @ load input
+ cmp r2, #2
+ blo .Lcbc_dec_one
+ vld1.8 {q1}, [r0]!
beq .Lcbc_dec_two
vld1.8 {q2}, [r0]!
cmp r2, #4
@@ -1327,16 +1315,11 @@ bsaes_cbc_encrypt:
.align 4
.Lcbc_dec_one:
sub r0, r0, #0x10
- mov r10, r1 @ save original out pointer
- mov r1, r9 @ use the iv scratch space as out buffer
- mov r2, r3
- vmov q4,q15 @ just in case ensure that IV
- vmov q5,q0 @ and input are preserved
- bl aes_nohw_decrypt
- vld1.8 {q0}, [r9] @ load result
- veor q0, q0, q4 @ ^= IV
- vmov q15, q5 @ q5 holds input
- vst1.8 {q0}, [r10] @ write output
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q15}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vst1.8 {q0}, [r1]! @ write output
.Lcbc_dec_done:
#ifndef BSAES_ASM_EXTENDED_KEY
@@ -1354,15 +1337,13 @@ bsaes_cbc_encrypt:
VFP_ABI_POP
ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc}
.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
-
.globl bsaes_ctr32_encrypt_blocks
.hidden bsaes_ctr32_encrypt_blocks
.type bsaes_ctr32_encrypt_blocks,%function
.align 5
bsaes_ctr32_encrypt_blocks:
- cmp r2, #8 @ use plain AES for
- blo .Lctr_enc_short @ small sizes
-
+ @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this
+ @ out to retain a constant-time implementation.
mov ip, sp
stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
VFP_ABI_PUSH
@@ -1538,50 +1519,8 @@ bsaes_ctr32_encrypt_blocks:
VFP_ABI_POP
ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return
-.align 4
-.Lctr_enc_short:
- ldr ip, [sp] @ ctr pointer is passed on stack
- stmdb sp!, {r4,r5,r6,r7,r8, lr}
-
- mov r4, r0 @ copy arguments
- mov r5, r1
- mov r6, r2
- mov r7, r3
- ldr r8, [ip, #12] @ load counter .LSW
- vld1.8 {q1}, [ip] @ load whole counter value
-#ifdef __ARMEL__
- rev r8, r8
-#endif
- sub sp, sp, #0x10
- vst1.8 {q1}, [sp] @ copy counter value
- sub sp, sp, #0x10
-
-.Lctr_enc_short_loop:
- add r0, sp, #0x10 @ input counter value
- mov r1, sp @ output on the stack
- mov r2, r7 @ key
-
- bl aes_nohw_encrypt
-
- vld1.8 {q0}, [r4]! @ load input
- vld1.8 {q1}, [sp] @ load encrypted counter
- add r8, r8, #1
-#ifdef __ARMEL__
- rev r0, r8
- str r0, [sp, #0x1c] @ next counter value
-#else
- str r8, [sp, #0x1c] @ next counter value
-#endif
- veor q0,q0,q1
- vst1.8 {q0}, [r5]! @ store output
- subs r6, r6, #1
- bne .Lctr_enc_short_loop
-
- vmov.i32 q0, #0
- vmov.i32 q1, #0
- vstmia sp!, {q0,q1}
-
- ldmia sp!, {r4,r5,r6,r7,r8, pc}
+ @ OpenSSL contains aes_nohw_* fallback code here. We patch this
+ @ out to retain a constant-time implementation.
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
#endif
#endif
diff --git a/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S
index 5236aa66..5437762f 100644
--- a/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S
+++ b/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S
@@ -13,11 +13,6 @@
#endif
.text
-.extern aes_nohw_encrypt
-.hidden aes_nohw_encrypt
-.extern aes_nohw_decrypt
-.hidden aes_nohw_decrypt
-
.type _bsaes_encrypt8,@function
.align 64
_bsaes_encrypt8:
@@ -1083,18 +1078,14 @@ _bsaes_key_convert:
.byte 0xf3,0xc3
.cfi_endproc
.size _bsaes_key_convert,.-_bsaes_key_convert
-.extern aes_nohw_cbc_encrypt
-.hidden aes_nohw_cbc_encrypt
.globl bsaes_cbc_encrypt
.hidden bsaes_cbc_encrypt
.type bsaes_cbc_encrypt,@function
.align 16
bsaes_cbc_encrypt:
.cfi_startproc
- cmpl $0,%r9d
- jne aes_nohw_cbc_encrypt
- cmpq $128,%rdx
- jb aes_nohw_cbc_encrypt
+
+
movq %rsp,%rax
.Lcbc_dec_prologue:
@@ -1143,6 +1134,8 @@ bsaes_cbc_encrypt:
movdqu (%rbx),%xmm14
subq $8,%r14
+ jc .Lcbc_dec_loop_done
+
.Lcbc_dec_loop:
movdqu 0(%r12),%xmm15
movdqu 16(%r12),%xmm0
@@ -1187,6 +1180,7 @@ bsaes_cbc_encrypt:
subq $8,%r14
jnc .Lcbc_dec_loop
+.Lcbc_dec_loop_done:
addq $8,%r14
jz .Lcbc_dec_done
@@ -1319,13 +1313,12 @@ bsaes_cbc_encrypt:
jmp .Lcbc_dec_done
.align 16
.Lcbc_dec_one:
- leaq (%r12),%rdi
- leaq 32(%rbp),%rsi
- leaq (%r15),%rdx
- call aes_nohw_decrypt
- pxor 32(%rbp),%xmm14
- movdqu %xmm14,(%r13)
- movdqa %xmm15,%xmm14
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm14
+ movdqu %xmm15,0(%r13)
+ jmp .Lcbc_dec_done
.Lcbc_dec_done:
movdqu %xmm14,(%rbx)
@@ -1403,8 +1396,8 @@ bsaes_ctr32_encrypt_blocks:
movq %rdx,%r14
movq %rcx,%r15
movdqa %xmm0,32(%rbp)
- cmpq $8,%rdx
- jb .Lctr_enc_short
+
+
movl %eax,%ebx
shlq $7,%rax
@@ -1538,26 +1531,8 @@ bsaes_ctr32_encrypt_blocks:
movdqu 96(%r12),%xmm13
pxor %xmm13,%xmm1
movdqu %xmm1,96(%r13)
- jmp .Lctr_enc_done
-.align 16
-.Lctr_enc_short:
- leaq 32(%rbp),%rdi
- leaq 48(%rbp),%rsi
- leaq (%r15),%rdx
- call aes_nohw_encrypt
- movdqu (%r12),%xmm0
- leaq 16(%r12),%r12
- movl 44(%rbp),%eax
- bswapl %eax
- pxor 48(%rbp),%xmm0
- incl %eax
- movdqu %xmm0,(%r13)
- bswapl %eax
- leaq 16(%r13),%r13
- movl %eax,44(%rsp)
- decq %r14
- jnz .Lctr_enc_short
+
.Lctr_enc_done:
leaq (%rsp),%rax
diff --git a/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S
index 5a65960d..c2807e38 100644
--- a/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S
+++ b/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S
@@ -14,9 +14,6 @@
.text
-
-
-
.p2align 6
_bsaes_encrypt8:
@@ -1081,17 +1078,14 @@ L$key_loop:
.byte 0xf3,0xc3
-
.globl _bsaes_cbc_encrypt
.private_extern _bsaes_cbc_encrypt
.p2align 4
_bsaes_cbc_encrypt:
- cmpl $0,%r9d
- jne _aes_nohw_cbc_encrypt
- cmpq $128,%rdx
- jb _aes_nohw_cbc_encrypt
+
+
movq %rsp,%rax
L$cbc_dec_prologue:
@@ -1134,6 +1128,8 @@ L$cbc_dec_prologue:
movdqu (%rbx),%xmm14
subq $8,%r14
+ jc L$cbc_dec_loop_done
+
L$cbc_dec_loop:
movdqu 0(%r12),%xmm15
movdqu 16(%r12),%xmm0
@@ -1178,6 +1174,7 @@ L$cbc_dec_loop:
subq $8,%r14
jnc L$cbc_dec_loop
+L$cbc_dec_loop_done:
addq $8,%r14
jz L$cbc_dec_done
@@ -1310,13 +1307,12 @@ L$cbc_dec_two:
jmp L$cbc_dec_done
.p2align 4
L$cbc_dec_one:
- leaq (%r12),%rdi
- leaq 32(%rbp),%rsi
- leaq (%r15),%rdx
- call _aes_nohw_decrypt
- pxor 32(%rbp),%xmm14
- movdqu %xmm14,(%r13)
- movdqa %xmm15,%xmm14
+ movdqa %xmm14,32(%rbp)
+ call _bsaes_decrypt8
+ pxor 32(%rbp),%xmm15
+ movdqu 0(%r12),%xmm14
+ movdqu %xmm15,0(%r13)
+ jmp L$cbc_dec_done
L$cbc_dec_done:
movdqu %xmm14,(%rbx)
@@ -1387,8 +1383,8 @@ L$ctr_enc_prologue:
movq %rdx,%r14
movq %rcx,%r15
movdqa %xmm0,32(%rbp)
- cmpq $8,%rdx
- jb L$ctr_enc_short
+
+
movl %eax,%ebx
shlq $7,%rax
@@ -1522,26 +1518,8 @@ L$ctr_enc_loop_done:
movdqu 96(%r12),%xmm13
pxor %xmm13,%xmm1
movdqu %xmm1,96(%r13)
- jmp L$ctr_enc_done
-.p2align 4
-L$ctr_enc_short:
- leaq 32(%rbp),%rdi
- leaq 48(%rbp),%rsi
- leaq (%r15),%rdx
- call _aes_nohw_encrypt
- movdqu (%r12),%xmm0
- leaq 16(%r12),%r12
- movl 44(%rbp),%eax
- bswapl %eax
- pxor 48(%rbp),%xmm0
- incl %eax
- movdqu %xmm0,(%r13)
- bswapl %eax
- leaq 16(%r13),%r13
- movl %eax,44(%rsp)
- decq %r14
- jnz L$ctr_enc_short
+
L$ctr_enc_done:
leaq (%rsp),%rax
diff --git a/sources.bp b/sources.bp
index d704e3c5..25e406fa 100644
--- a/sources.bp
+++ b/sources.bp
@@ -247,6 +247,7 @@ cc_defaults {
"linux-aarch64/crypto/chacha/chacha-armv8.S",
"linux-aarch64/crypto/fipsmodule/aesv8-armx64.S",
"linux-aarch64/crypto/fipsmodule/armv8-mont.S",
+ "linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S",
"linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S",
"linux-aarch64/crypto/fipsmodule/sha1-armv8.S",
"linux-aarch64/crypto/fipsmodule/sha256-armv8.S",
diff --git a/sources.mk b/sources.mk
index fb0679ba..617ac2ad 100644
--- a/sources.mk
+++ b/sources.mk
@@ -243,6 +243,7 @@ linux_aarch64_sources := \
linux-aarch64/crypto/chacha/chacha-armv8.S\
linux-aarch64/crypto/fipsmodule/aesv8-armx64.S\
linux-aarch64/crypto/fipsmodule/armv8-mont.S\
+ linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S\
linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S\
linux-aarch64/crypto/fipsmodule/sha1-armv8.S\
linux-aarch64/crypto/fipsmodule/sha256-armv8.S\
diff --git a/src/crypto/fipsmodule/CMakeLists.txt b/src/crypto/fipsmodule/CMakeLists.txt
index 09d210bf..fbf25ac8 100644
--- a/src/crypto/fipsmodule/CMakeLists.txt
+++ b/src/crypto/fipsmodule/CMakeLists.txt
@@ -65,6 +65,7 @@ if(${ARCH} STREQUAL "aarch64")
aesv8-armx.${ASM_EXT}
armv8-mont.${ASM_EXT}
+ ghash-neon-armv8.${ASM_EXT}
ghashv8-armx.${ASM_EXT}
sha1-armv8.${ASM_EXT}
sha256-armv8.${ASM_EXT}
@@ -99,6 +100,7 @@ perlasm(co-586.${ASM_EXT} bn/asm/co-586.pl)
perlasm(ghash-armv4.${ASM_EXT} modes/asm/ghash-armv4.pl)
perlasm(ghashp8-ppc.${ASM_EXT} modes/asm/ghashp8-ppc.pl)
perlasm(ghashv8-armx.${ASM_EXT} modes/asm/ghashv8-armx.pl)
+perlasm(ghash-neon-armv8.${ASM_EXT} modes/asm/ghash-neon-armv8.pl)
perlasm(ghash-ssse3-x86_64.${ASM_EXT} modes/asm/ghash-ssse3-x86_64.pl)
perlasm(ghash-ssse3-x86.${ASM_EXT} modes/asm/ghash-ssse3-x86.pl)
perlasm(ghash-x86_64.${ASM_EXT} modes/asm/ghash-x86_64.pl)
diff --git a/src/crypto/fipsmodule/aes/asm/bsaes-armv7.pl b/src/crypto/fipsmodule/aes/asm/bsaes-armv7.pl
index 11607d11..d4db3b4d 100644
--- a/src/crypto/fipsmodule/aes/asm/bsaes-armv7.pl
+++ b/src/crypto/fipsmodule/aes/asm/bsaes-armv7.pl
@@ -1113,26 +1113,12 @@ my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
my ($keysched)=("sp");
$code.=<<___;
-@ TODO(davidben): This should be aes_nohw_cbc_encrypt, but that function does
-@ not exist. Rather than add it, patch this fallback out. See
-@ https://crbug.com/boringssl/256.
-.extern AES_cbc_encrypt
-.extern aes_nohw_decrypt
-
.global bsaes_cbc_encrypt
.type bsaes_cbc_encrypt,%function
.align 5
bsaes_cbc_encrypt:
-#ifndef __KERNEL__
- cmp $len, #128
-#ifndef __thumb__
- blo AES_cbc_encrypt
-#else
- bhs 1f
- b AES_cbc_encrypt
-1:
-#endif
-#endif
+ @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for
+ @ short inputs. We patch this out, using bsaes for all input sizes.
@ it is up to the caller to make sure we are called with enc == 0
@@ -1230,10 +1216,7 @@ bsaes_cbc_encrypt:
adds $len, $len, #8
beq .Lcbc_dec_done
- vld1.8 {@XMM[0]}, [$inp]! @ load input
- cmp $len, #2
- blo .Lcbc_dec_one
- vld1.8 {@XMM[1]}, [$inp]!
+ @ Set up most parameters for the _bsaes_decrypt8 call.
#ifndef BSAES_ASM_EXTENDED_KEY
mov r4, $keysched @ pass the key
#else
@@ -1241,6 +1224,11 @@ bsaes_cbc_encrypt:
#endif
mov r5, $rounds
vstmia $fp, {@XMM[15]} @ put aside IV
+
+ vld1.8 {@XMM[0]}, [$inp]! @ load input
+ cmp $len, #2
+ blo .Lcbc_dec_one
+ vld1.8 {@XMM[1]}, [$inp]!
beq .Lcbc_dec_two
vld1.8 {@XMM[2]}, [$inp]!
cmp $len, #4
@@ -1358,16 +1346,11 @@ bsaes_cbc_encrypt:
.align 4
.Lcbc_dec_one:
sub $inp, $inp, #0x10
- mov $rounds, $out @ save original out pointer
- mov $out, $fp @ use the iv scratch space as out buffer
- mov r2, $key
- vmov @XMM[4],@XMM[15] @ just in case ensure that IV
- vmov @XMM[5],@XMM[0] @ and input are preserved
- bl aes_nohw_decrypt
- vld1.8 {@XMM[0]}, [$fp] @ load result
- veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
- vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
- vst1.8 {@XMM[0]}, [$rounds] @ write output
+ bl _bsaes_decrypt8
+ vldmia $fp, {@XMM[14]} @ reload IV
+ vld1.8 {@XMM[15]}, [$inp]! @ reload input
+ veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV
+ vst1.8 {@XMM[0]}, [$out]! @ write output
.Lcbc_dec_done:
#ifndef BSAES_ASM_EXTENDED_KEY
@@ -1393,14 +1376,12 @@ my $const = "r6"; # shared with _bsaes_encrypt8_alt
my $keysched = "sp";
$code.=<<___;
-.extern aes_nohw_encrypt
.global bsaes_ctr32_encrypt_blocks
.type bsaes_ctr32_encrypt_blocks,%function
.align 5
bsaes_ctr32_encrypt_blocks:
- cmp $len, #8 @ use plain AES for
- blo .Lctr_enc_short @ small sizes
-
+ @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this
+ @ out to retain a constant-time implementation.
mov ip, sp
stmdb sp!, {r4-r10, lr}
VFP_ABI_PUSH
@@ -1576,50 +1557,8 @@ bsaes_ctr32_encrypt_blocks:
VFP_ABI_POP
ldmia sp!, {r4-r10, pc} @ return
-.align 4
-.Lctr_enc_short:
- ldr ip, [sp] @ ctr pointer is passed on stack
- stmdb sp!, {r4-r8, lr}
-
- mov r4, $inp @ copy arguments
- mov r5, $out
- mov r6, $len
- mov r7, $key
- ldr r8, [ip, #12] @ load counter LSW
- vld1.8 {@XMM[1]}, [ip] @ load whole counter value
-#ifdef __ARMEL__
- rev r8, r8
-#endif
- sub sp, sp, #0x10
- vst1.8 {@XMM[1]}, [sp] @ copy counter value
- sub sp, sp, #0x10
-
-.Lctr_enc_short_loop:
- add r0, sp, #0x10 @ input counter value
- mov r1, sp @ output on the stack
- mov r2, r7 @ key
-
- bl aes_nohw_encrypt
-
- vld1.8 {@XMM[0]}, [r4]! @ load input
- vld1.8 {@XMM[1]}, [sp] @ load encrypted counter
- add r8, r8, #1
-#ifdef __ARMEL__
- rev r0, r8
- str r0, [sp, #0x1c] @ next counter value
-#else
- str r8, [sp, #0x1c] @ next counter value
-#endif
- veor @XMM[0],@XMM[0],@XMM[1]
- vst1.8 {@XMM[0]}, [r5]! @ store output
- subs r6, r6, #1
- bne .Lctr_enc_short_loop
-
- vmov.i32 q0, #0
- vmov.i32 q1, #0
- vstmia sp!, {q0-q1}
-
- ldmia sp!, {r4-r8, pc}
+ @ OpenSSL contains aes_nohw_* fallback code here. We patch this
+ @ out to retain a constant-time implementation.
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
}
diff --git a/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl b/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl
index 81331bfa..3bb28190 100644
--- a/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl
+++ b/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl
@@ -811,9 +811,6 @@ ___
$code.=<<___;
.text
-.extern aes_nohw_encrypt
-.extern aes_nohw_decrypt
-
.type _bsaes_encrypt8,\@abi-omnipotent
.align 64
_bsaes_encrypt8:
@@ -1609,22 +1606,14 @@ $code.=<<___;
___
}
$code.=<<___;
-.extern aes_nohw_cbc_encrypt
.globl bsaes_cbc_encrypt
.type bsaes_cbc_encrypt,\@abi-omnipotent
.align 16
bsaes_cbc_encrypt:
.cfi_startproc
-___
-$code.=<<___ if ($win64);
- mov 48(%rsp),$arg6 # pull direction flag
-___
-$code.=<<___;
- cmp \$0,$arg6
- jne aes_nohw_cbc_encrypt
- cmp \$128,$arg3
- jb aes_nohw_cbc_encrypt
-
+ # In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for
+ # short inputs or if enc is one. We patch this out, using bsaes for all
+ # input sizes. The caller is required to ensure enc is zero.
mov %rsp, %rax
.Lcbc_dec_prologue:
push %rbp
@@ -1683,6 +1672,8 @@ $code.=<<___;
movdqu (%rbx), @XMM[15] # load IV
sub \$8,$len
+ jc .Lcbc_dec_loop_done
+
.Lcbc_dec_loop:
movdqu 0x00($inp), @XMM[0] # load input
movdqu 0x10($inp), @XMM[1]
@@ -1727,6 +1718,7 @@ $code.=<<___;
sub \$8,$len
jnc .Lcbc_dec_loop
+.Lcbc_dec_loop_done:
add \$8,$len
jz .Lcbc_dec_done
@@ -1859,13 +1851,12 @@ $code.=<<___;
jmp .Lcbc_dec_done
.align 16
.Lcbc_dec_one:
- lea ($inp), $arg1
- lea 0x20(%rbp), $arg2 # buffer output
- lea ($key), $arg3
- call aes_nohw_decrypt # doesn't touch %xmm
- pxor 0x20(%rbp), @XMM[15] # ^= IV
- movdqu @XMM[15], ($out) # write output
- movdqa @XMM[0], @XMM[15] # IV
+ movdqa @XMM[15], 0x20(%rbp) # put aside IV
+ call _bsaes_decrypt8
+ pxor 0x20(%rbp), @XMM[0] # ^= IV
+ movdqu 0x00($inp), @XMM[15] # IV
+ movdqu @XMM[0], 0x00($out) # write output
+ jmp .Lcbc_dec_done
.Lcbc_dec_done:
movdqu @XMM[15], (%rbx) # return IV
@@ -1968,8 +1959,8 @@ $code.=<<___;
mov $arg3, $len
mov $arg4, $key
movdqa %xmm0, 0x20(%rbp) # copy counter
- cmp \$8, $arg3
- jb .Lctr_enc_short
+ # In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this
+ # out to retain a constant-time implementation.
mov %eax, %ebx # rounds
shl \$7, %rax # 128 bytes per inner round key
@@ -2103,27 +2094,9 @@ $code.=<<___;
movdqu 0x60($inp), @XMM[14]
pxor @XMM[14], @XMM[2]
movdqu @XMM[2], 0x60($out)
- jmp .Lctr_enc_done
-
-.align 16
-.Lctr_enc_short:
- lea 0x20(%rbp), $arg1
- lea 0x30(%rbp), $arg2
- lea ($key), $arg3
- call aes_nohw_encrypt
- movdqu ($inp), @XMM[1]
- lea 16($inp), $inp
- mov 0x2c(%rbp), %eax # load 32-bit counter
- bswap %eax
- pxor 0x30(%rbp), @XMM[1]
- inc %eax # increment
- movdqu @XMM[1], ($out)
- bswap %eax
- lea 16($out), $out
- mov %eax, 0x2c(%rsp) # save 32-bit counter
- dec $len
- jnz .Lctr_enc_short
+ # OpenSSL contains aes_nohw_* fallback code here. We patch this
+ # out to retain a constant-time implementation.
.Lctr_enc_done:
lea (%rsp), %rax
pxor %xmm0, %xmm0
diff --git a/src/crypto/fipsmodule/aes/internal.h b/src/crypto/fipsmodule/aes/internal.h
index a05abcbf..63070bc6 100644
--- a/src/crypto/fipsmodule/aes/internal.h
+++ b/src/crypto/fipsmodule/aes/internal.h
@@ -133,7 +133,7 @@ void aes_hw_ecb_encrypt(const uint8_t *in, uint8_t *out, size_t length,
#if defined(BSAES)
// On platforms where BSAES gets defined (just above), then these functions are
-// provided by asm.
+// provided by asm. Note |bsaes_cbc_encrypt| requires |enc| to be zero.
void bsaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
const AES_KEY *key, uint8_t ivec[16], int enc);
void bsaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
diff --git a/src/crypto/fipsmodule/cipher/e_aes.c b/src/crypto/fipsmodule/cipher/e_aes.c
index 51a1fb1c..a1859d74 100644
--- a/src/crypto/fipsmodule/cipher/e_aes.c
+++ b/src/crypto/fipsmodule/cipher/e_aes.c
@@ -111,7 +111,8 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
}
} else if (bsaes_capable() && mode == EVP_CIPH_CBC_MODE) {
ret = aes_nohw_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
- dat->block = aes_nohw_decrypt;
+ // If |dat->stream.cbc| is provided, |dat->block| is never used.
+ dat->block = NULL;
dat->stream.cbc = bsaes_cbc_encrypt;
} else if (vpaes_capable()) {
ret = vpaes_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
@@ -138,7 +139,8 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
}
} else if (bsaes_capable() && mode == EVP_CIPH_CTR_MODE) {
ret = aes_nohw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
- dat->block = aes_nohw_encrypt;
+ // If |dat->stream.ctr| is provided, |dat->block| is never used.
+ dat->block = NULL;
dat->stream.ctr = bsaes_ctr32_encrypt_blocks;
} else if (vpaes_capable()) {
ret = vpaes_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks);
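
The two e_aes.c hunks above rely on an invariant of the cipher dispatch: once a whole-buffer stream routine (|stream.cbc| or |stream.ctr|) is installed, the per-block |block| pointer is never consulted, so setting it to NULL is safe. A minimal C sketch of that pattern, using illustrative stand-in names rather than BoringSSL's actual EVP_AES_KEY and mode typedefs:

#include <stddef.h>
#include <stdint.h>

/* Stand-in types for illustration only; the real definitions live in
 * e_aes.c and modes/internal.h. */
typedef void (*block_fn)(const uint8_t in[16], uint8_t out[16],
                         const void *key);
typedef void (*cbc_fn)(const uint8_t *in, uint8_t *out, size_t len,
                       const void *key, uint8_t ivec[16], int enc);

struct aes_ctx_sketch {
  block_fn block;     /* may be NULL when |stream_cbc| is set */
  cbc_fn stream_cbc;  /* e.g. bsaes_cbc_encrypt */
};

/* CBC decryption dispatch: a whole-buffer routine, when present, handles
 * everything, and |block| is never called. */
static void cbc_decrypt_sketch(const struct aes_ctx_sketch *ctx,
                               const uint8_t *in, uint8_t *out, size_t len,
                               const void *key, uint8_t ivec[16]) {
  if (ctx->stream_cbc != NULL) {
    ctx->stream_cbc(in, out, len, key, ivec, /*enc=*/0);
    return;
  }
  /* Otherwise a generic CBC implementation would invoke |ctx->block| once
   * per 16-byte block; omitted here. */
  (void)in;
  (void)out;
  (void)len;
  (void)key;
  (void)ivec;
}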
diff --git a/src/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl b/src/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl
new file mode 100644
index 00000000..972be419
--- /dev/null
+++ b/src/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl
@@ -0,0 +1,287 @@
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It
+# implements the multiplication algorithm described in:
+#
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+#
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
+#
+# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is
+# AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit
+# NEON, the low and high halves of the 128-bit register q0 are accessible as
+# 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of
+# vN. Where the 32-bit version would use the upper half, this file must keep
+# halves in separate registers.
+#
+# The other distinction is in syntax. 32-bit NEON embeds lane information in the
+# instruction name, while AArch64 uses suffixes on the registers. For instance,
+# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written:
+#
+# vshl.i64 q0, q0, #1
+#
+# in 64-bit, it would be written:
+#
+# shl v0.2d, v0.2d, #1
+#
+# See Programmer's Guide for ARMv8-A, section 7 for details.
+# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf
+#
+# Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ
+# only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials
+# and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit
+# polynomial and is conditioned on the PMULL extension. This file emulates the
+# latter with the former.
+
+use strict;
+
+my $flavour = shift;
+my $output;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/;
+ my $dir = $1;
+ my $xlate;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3)); # argument block
+my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4));
+my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7));
+# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers
+# to spare.
+my ($t0, $t1, $t2, $t3) = map("v$_", (16..19));
+my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23));
+my ($k48_k32, $k16_k0) = map("v$_", (24..25));
+
+my $code = "";
+
+# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b
+# must be distinct from $t* and $k*. $t* are clobbered by the emitted code.
+sub clmul64x64 {
+my ($r, $a, $b) = @_;
+$code .= <<___;
+ ext $t0.8b, $a.8b, $a.8b, #1 // A1
+ pmull $t0.8h, $t0.8b, $b.8b // F = A1*B
+ ext $r.8b, $b.8b, $b.8b, #1 // B1
+ pmull $r.8h, $a.8b, $r.8b // E = A*B1
+ ext $t1.8b, $a.8b, $a.8b, #2 // A2
+ pmull $t1.8h, $t1.8b, $b.8b // H = A2*B
+ ext $t3.8b, $b.8b, $b.8b, #2 // B2
+ pmull $t3.8h, $a.8b, $t3.8b // G = A*B2
+ ext $t2.8b, $a.8b, $a.8b, #3 // A3
+ eor $t0.16b, $t0.16b, $r.16b // L = E + F
+ pmull $t2.8h, $t2.8b, $b.8b // J = A3*B
+ ext $r.8b, $b.8b, $b.8b, #3 // B3
+ eor $t1.16b, $t1.16b, $t3.16b // M = G + H
+ pmull $r.8h, $a.8b, $r.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor \$t0#lo, \$t0#lo, \$t0#hi @ t0 = P0 + P1 (L)
+ // vand \$t0#hi, \$t0#hi, \$k48
+ // veor \$t0#lo, \$t0#lo, \$t0#hi
+ //
+ // veor \$t1#lo, \$t1#lo, \$t1#hi @ t1 = P2 + P3 (M)
+ // vand \$t1#hi, \$t1#hi, \$k32
+ // veor \$t1#lo, \$t1#lo, \$t1#hi
+ //
+ // veor \$t2#lo, \$t2#lo, \$t2#hi @ t2 = P4 + P5 (N)
+ // vand \$t2#hi, \$t2#hi, \$k16
+ // veor \$t2#lo, \$t2#lo, \$t2#hi
+ //
+ // veor \$t3#lo, \$t3#lo, \$t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 \$t3#hi, #0
+ //
+ // \$kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext $t3.8b, $b.8b, $b.8b, #4 // B4
+ eor $t2.16b, $t2.16b, $r.16b // N = I + J
+ pmull $t3.8h, $a.8b, $t3.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 $t0l_t1l.2d, $t0.2d, $t1.2d
+ zip1 $t2l_t3l.2d, $t2.2d, $t3.2d
+ zip2 $t0h_t1h.2d, $t0.2d, $t1.2d
+ zip2 $t2h_t3h.2d, $t2.2d, $t3.2d
+ eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
+ eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
+ and $t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b
+ and $t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b
+ eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
+ eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
+ zip1 $t0.2d, $t0l_t1l.2d, $t0h_t1h.2d
+ zip1 $t2.2d, $t2l_t3l.2d, $t2h_t3h.2d
+ zip2 $t1.2d, $t0l_t1l.2d, $t0h_t1h.2d
+ zip2 $t3.2d, $t2l_t3l.2d, $t2h_t3h.2d
+
+ ext $t0.16b, $t0.16b, $t0.16b, #15 // t0 = t0 << 8
+ ext $t1.16b, $t1.16b, $t1.16b, #14 // t1 = t1 << 16
+ pmull $r.8h, $a.8b, $b.8b // D = A*B
+ ext $t3.16b, $t3.16b, $t3.16b, #12 // t3 = t3 << 32
+ ext $t2.16b, $t2.16b, $t2.16b, #13 // t2 = t2 << 24
+ eor $t0.16b, $t0.16b, $t1.16b
+ eor $t2.16b, $t2.16b, $t3.16b
+ eor $r.16b, $r.16b, $t0.16b
+ eor $r.16b, $r.16b, $t2.16b
+___
+}
+
+$code .= <<___;
+.text
+
+.global gcm_init_neon
+.type gcm_init_neon,%function
+.align 4
+gcm_init_neon:
+ // This function is adapted from gcm_init_v8. xC2 is t3.
+ ld1 {$t1.2d}, [x1] // load H
+ movi $t3.16b, #0xe1
+ shl $t3.2d, $t3.2d, #57 // 0xc2.0
+ ext $INlo.16b, $t1.16b, $t1.16b, #8
+ ushr $t2.2d, $t3.2d, #63
+ dup $t1.4s, $t1.s[1]
+ ext $t0.16b, $t2.16b, $t3.16b, #8 // t0=0xc2....01
+ ushr $t2.2d, $INlo.2d, #63
+ sshr $t1.4s, $t1.4s, #31 // broadcast carry bit
+ and $t2.16b, $t2.16b, $t0.16b
+ shl $INlo.2d, $INlo.2d, #1
+ ext $t2.16b, $t2.16b, $t2.16b, #8
+ and $t0.16b, $t0.16b, $t1.16b
+ orr $INlo.16b, $INlo.16b, $t2.16b // H<<<=1
+ eor $Hlo.16b, $INlo.16b, $t0.16b // twisted H
+ st1 {$Hlo.2d}, [x0] // store Htable[0]
+ ret
+.size gcm_init_neon,.-gcm_init_neon
+
+.global gcm_gmult_neon
+.type gcm_gmult_neon,%function
+.align 4
+gcm_gmult_neon:
+ ld1 {$INlo.16b}, [$Xi] // load Xi
+ ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H
+ ld1 {$Hhi.1d}, [$Htbl]
+ adrp x9, :pg_hi21:.Lmasks // load constants
+ add x9, x9, :lo12:.Lmasks
+ ld1 {$k48_k32.2d, $k16_k0.2d}, [x9]
+ rev64 $INlo.16b, $INlo.16b // byteswap Xi
+ ext $INlo.16b, $INlo.16b, $INlo.16b, #8
+ eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing
+
+ mov $len, #16
+ b .Lgmult_neon
+.size gcm_gmult_neon,.-gcm_gmult_neon
+
+.global gcm_ghash_neon
+.type gcm_ghash_neon,%function
+.align 4
+gcm_ghash_neon:
+ ld1 {$Xl.16b}, [$Xi] // load Xi
+ ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H
+ ld1 {$Hhi.1d}, [$Htbl]
+ adrp x9, :pg_hi21:.Lmasks // load constants
+ add x9, x9, :lo12:.Lmasks
+ ld1 {$k48_k32.2d, $k16_k0.2d}, [x9]
+ rev64 $Xl.16b, $Xl.16b // byteswap Xi
+ ext $Xl.16b, $Xl.16b, $Xl.16b, #8
+ eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing
+
+.Loop_neon:
+ ld1 {$INlo.16b}, [$inp], #16 // load inp
+ rev64 $INlo.16b, $INlo.16b // byteswap inp
+ ext $INlo.16b, $INlo.16b, $INlo.16b, #8
+ eor $INlo.16b, $INlo.16b, $Xl.16b // inp ^= Xi
+
+.Lgmult_neon:
+ // Split the input into $INlo and $INhi. (The upper halves are unused,
+ // so it is okay to leave them alone.)
+ ins $INhi.d[0], $INlo.d[1]
+___
+&clmul64x64 ($Xl, $Hlo, $INlo); # H.lo·Xi.lo
+$code .= <<___;
+ eor $INlo.8b, $INlo.8b, $INhi.8b // Karatsuba pre-processing
+___
+&clmul64x64 ($Xm, $Hhl, $INlo); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
+&clmul64x64 ($Xh, $Hhi, $INhi); # H.hi·Xi.hi
+$code .= <<___;
+ ext $t0.16b, $Xl.16b, $Xh.16b, #8
+ eor $Xm.16b, $Xm.16b, $Xl.16b // Karatsuba post-processing
+ eor $Xm.16b, $Xm.16b, $Xh.16b
+ eor $Xm.16b, $Xm.16b, $t0.16b // Xm overlaps Xh.lo and Xl.hi
+ ins $Xl.d[1], $Xm.d[0] // Xh|Xl - 256-bit result
+ // This is a no-op due to the ins instruction below.
+ // ins $Xh.d[0], $Xm.d[1]
+
+ // equivalent of reduction_avx from ghash-x86_64.pl
+ shl $t1.2d, $Xl.2d, #57 // 1st phase
+ shl $t2.2d, $Xl.2d, #62
+ eor $t2.16b, $t2.16b, $t1.16b //
+ shl $t1.2d, $Xl.2d, #63
+ eor $t2.16b, $t2.16b, $t1.16b //
+ // Note Xm contains {Xl.d[1], Xh.d[0]}.
+ eor $t2.16b, $t2.16b, $Xm.16b
+ ins $Xl.d[1], $t2.d[0] // Xl.d[1] ^= t2.d[0]
+ ins $Xh.d[0], $t2.d[1] // Xh.d[0] ^= t2.d[1]
+
+ ushr $t2.2d, $Xl.2d, #1 // 2nd phase
+ eor $Xh.16b, $Xh.16b,$Xl.16b
+ eor $Xl.16b, $Xl.16b,$t2.16b //
+ ushr $t2.2d, $t2.2d, #6
+ ushr $Xl.2d, $Xl.2d, #1 //
+ eor $Xl.16b, $Xl.16b, $Xh.16b //
+ eor $Xl.16b, $Xl.16b, $t2.16b //
+
+ subs $len, $len, #16
+ bne .Loop_neon
+
+ rev64 $Xl.16b, $Xl.16b // byteswap Xi and write
+ ext $Xl.16b, $Xl.16b, $Xl.16b, #8
+ st1 {$Xl.16b}, [$Xi]
+
+ ret
+.size gcm_ghash_neon,.-gcm_ghash_neon
+
+.section .rodata
+.align 4
+.Lmasks:
+.quad 0x0000ffffffffffff // k48
+.quad 0x00000000ffffffff // k32
+.quad 0x000000000000ffff // k16
+.quad 0x0000000000000000 // k0
+.asciz "GHASH for ARMv8, derived from ARMv4 version by <appro\@openssl.org>"
+.align 2
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ print $_,"\n";
+}
+close STDOUT; # enforce flush
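
The Karatsuba comments in gcm_gmult_neon/gcm_ghash_neon above (H.lo·Xi.lo, (H.lo+H.hi)·(Xi.lo+Xi.hi), H.hi·Xi.hi, then the post-processing XORs) describe a standard split of one 128x128 carry-less multiply into three 64x64 multiplies. Below is a minimal plain-C sketch of that decomposition, using a bit-at-a-time reference multiply; it ignores GCM's reflected bit order and the final reduction, and none of these names exist in BoringSSL.

#include <stdint.h>

// Reference 64x64 carry-less multiply over GF(2)[x]; the NEON code assembles
// the same product from 8-bit pmull results plus the k48/k32/k16/k0 masks.
static void clmul64(uint64_t a, uint64_t b, uint64_t out[2] /* lo, hi */) {
  uint64_t lo = 0, hi = 0;
  for (int i = 0; i < 64; i++) {
    if ((b >> i) & 1) {
      lo ^= a << i;
      hi ^= i ? a >> (64 - i) : 0;
    }
  }
  out[0] = lo;
  out[1] = hi;
}

// Karatsuba: (Hhi*x^64 + Hlo) * (Xhi*x^64 + Xlo)
//   Xl = Hlo*Xlo, Xh = Hhi*Xhi, Xm = (Hlo^Hhi)*(Xlo^Xhi)
//   middle term = Xm ^ Xl ^ Xh  (the "Karatsuba post-processing" above)
static void clmul128(const uint64_t h[2], const uint64_t x[2],
                     uint64_t out[4] /* least-significant word first */) {
  uint64_t xl[2], xm[2], xh[2];
  clmul64(h[0], x[0], xl);
  clmul64(h[0] ^ h[1], x[0] ^ x[1], xm);
  clmul64(h[1], x[1], xh);
  uint64_t mid_lo = xm[0] ^ xl[0] ^ xh[0];
  uint64_t mid_hi = xm[1] ^ xl[1] ^ xh[1];
  out[0] = xl[0];
  out[1] = xl[1] ^ mid_lo;
  out[2] = xh[0] ^ mid_hi;
  out[3] = xh[1];
}

The "1st phase"/"2nd phase" blocks in the assembly then fold the upper half of this 256-bit product back into the lower half modulo the GHASH polynomial.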
diff --git a/src/crypto/fipsmodule/modes/internal.h b/src/crypto/fipsmodule/modes/internal.h
index 9a081ebd..dec1e56c 100644
--- a/src/crypto/fipsmodule/modes/internal.h
+++ b/src/crypto/fipsmodule/modes/internal.h
@@ -327,28 +327,12 @@ void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
size_t len);
-#if defined(OPENSSL_ARM)
-// 32-bit ARM also has support for doing GCM with NEON instructions.
OPENSSL_INLINE int gcm_neon_capable(void) { return CRYPTO_is_NEON_capable(); }
void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
size_t len);
-#else
-// AArch64 only has the ARMv8 versions of functions.
-OPENSSL_INLINE int gcm_neon_capable(void) { return 0; }
-OPENSSL_INLINE void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]) {
- abort();
-}
-OPENSSL_INLINE void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]) {
- abort();
-}
-OPENSSL_INLINE void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16],
- const uint8_t *inp, size_t len) {
- abort();
-}
-#endif // OPENSSL_ARM
#elif defined(OPENSSL_PPC64LE)
#define GHASH_ASM_PPC64LE
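
With the OPENSSL_ARM-only guard and the abort() stubs removed, the NEON prototypes above are visible on AArch64 as well, so the GHASH dispatcher can prefer the PMULL (v8) routines and fall back to NEON on either architecture. A simplified sketch of that selection follows; the real dispatch lives in gcm.c with a different signature, the types below are stand-ins, and gcm_pmull_capable() is an assumed capability check.

#include <stddef.h>
#include <stdint.h>

// Illustrative stand-ins for the types/prototypes in modes/internal.h.
typedef struct { uint64_t hi, lo; } u128;

extern int gcm_pmull_capable(void);  // assumed helper
extern int gcm_neon_capable(void);
extern void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);
extern void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
extern void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16],
                         const uint8_t *inp, size_t len);
extern void gcm_init_neon(u128 Htable[16], const uint64_t H[2]);
extern void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]);
extern void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16],
                           const uint8_t *inp, size_t len);

typedef void (*gmult_func)(uint64_t Xi[2], const u128 Htable[16]);
typedef void (*ghash_func)(uint64_t Xi[2], const u128 Htable[16],
                           const uint8_t *inp, size_t len);

// Returns 1 if an ARM assembly implementation was selected, 0 to use the
// generic C fallback.
static int pick_arm_ghash(u128 Htable[16], const uint64_t H[2],
                          gmult_func *out_mult, ghash_func *out_hash) {
  if (gcm_pmull_capable()) {  // ARMv8 PMULL: fastest path
    gcm_init_v8(Htable, H);
    *out_mult = gcm_gmult_v8;
    *out_hash = gcm_ghash_v8;
    return 1;
  }
  if (gcm_neon_capable()) {   // plain NEON, now usable on ARM and AArch64
    gcm_init_neon(Htable, H);
    *out_mult = gcm_gmult_neon;
    *out_hash = gcm_ghash_neon;
    return 1;
  }
  return 0;
}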
diff --git a/src/crypto/stack/stack.c b/src/crypto/stack/stack.c
index ec557c02..599bd7b1 100644
--- a/src/crypto/stack/stack.c
+++ b/src/crypto/stack/stack.c
@@ -56,6 +56,7 @@
#include <openssl/stack.h>
+#include <assert.h>
#include <string.h>
#include <openssl/mem.h>
@@ -272,36 +273,39 @@ int sk_find(const _STACK *sk, size_t *out_index, const void *p,
return 0;
}
- // sk->comp is a function that takes pointers to pointers to elements, but
- // qsort and bsearch take a comparison function that just takes pointers to
- // elements. However, since we're passing an array of pointers to
- // qsort/bsearch, we can just cast the comparison function and everything
- // works.
+ // The stack is sorted, so binary search to find the element.
//
- // TODO(davidben): This is undefined behavior, but the call is in libc so,
- // e.g., CFI does not notice. Unfortunately, |bsearch| is missing a void*
- // parameter in its callback and |bsearch_s| is a mess of incompatibility.
- const void *const *r = bsearch(&p, sk->data, sk->num, sizeof(void *),
- (int (*)(const void *, const void *))sk->comp);
- if (r == NULL) {
- return 0;
- }
- size_t idx = ((void **)r) - sk->data;
- // This function always returns the first result. Note this logic is, in the
- // worst case, O(N) rather than O(log(N)). If this ever becomes a problem,
- // restore https://boringssl-review.googlesource.com/c/boringssl/+/32115/
- // which integrates the preference into the binary search.
- while (idx > 0) {
- const void *elem = sk->data[idx - 1];
- if (call_cmp_func(sk->comp, &p, &elem) != 0) {
- break;
+ // |lo| and |hi| maintain a half-open interval of where the answer may be. All
+ // indices such that |lo <= idx < hi| are candidates.
+ size_t lo = 0, hi = sk->num;
+ while (lo < hi) {
+ // Bias |mid| towards |lo|. See the |r == 0| case below.
+ size_t mid = lo + (hi - lo - 1) / 2;
+ assert(lo <= mid && mid < hi);
+ const void *elem = sk->data[mid];
+ int r = call_cmp_func(sk->comp, &p, &elem);
+ if (r > 0) {
+ lo = mid + 1; // |mid| is too low.
+ } else if (r < 0) {
+ hi = mid; // |mid| is too high.
+ } else {
+ // |mid| matches. However, this function returns the earliest match, so we
+ // can only return if the range has size one.
+ if (hi - lo == 1) {
+ if (out_index != NULL) {
+ *out_index = mid;
+ }
+ return 1;
+ }
+      // |mid| is biased towards |lo|, so it equals |hi - 1| only when
+      // |hi - lo| is one, which was handled above; thus this makes progress.
+ assert(mid + 1 < hi);
+ hi = mid + 1;
}
- idx--;
- }
- if (out_index) {
- *out_index = idx;
}
- return 1;
+
+ assert(lo == hi);
+ return 0; // Not found.
}
void *sk_shift(_STACK *sk) {
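
The search above only reports a match once the candidate range has shrunk to a single element, which is what makes it return the earliest of several equal elements. A self-contained sketch of the same loop on an int array (illustrative only, not BoringSSL code) makes the behaviour easy to test:

#include <stddef.h>

static int find_earliest(const int *data, size_t num, int key, size_t *out) {
  size_t lo = 0, hi = num;  // half-open candidate range [lo, hi)
  while (lo < hi) {
    size_t mid = lo + (hi - lo - 1) / 2;  // biased towards |lo|
    if (data[mid] < key) {
      lo = mid + 1;       // |mid| is too low.
    } else if (data[mid] > key) {
      hi = mid;           // |mid| is too high.
    } else if (hi - lo == 1) {
      *out = mid;         // one candidate left: the earliest match
      return 1;
    } else {
      hi = mid + 1;       // keep |mid|, discard everything above it
    }
  }
  return 0;
}

// Example: in {1, 3, 3, 3, 7}, searching for 3 reports index 1, the first of
// the run of equal elements.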
@@ -362,7 +366,10 @@ void sk_sort(_STACK *sk) {
return;
}
- // See the comment in sk_find about this cast.
+ // sk->comp is a function that takes pointers to pointers to elements, but
+  // qsort takes a comparison function that just takes pointers to elements.
+ // However, since we're passing an array of pointers to qsort, we can just
+ // cast the comparison function and everything works.
//
// TODO(davidben): This is undefined behavior, but the call is in libc so,
// e.g., CFI does not notice. Unfortunately, |qsort| is missing a void*
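
The retained TODO is about the function-pointer cast: |sk->comp| compares pointers to the stack's elements (so it receives const void **), while |qsort| calls its comparator directly on pointers into the array. A strictly conforming alternative is a thin wrapper with qsort's exact signature; the sketch below is illustrative only and not a proposed change. The global it needs to smuggle the comparator through is precisely the missing void* context parameter the comment refers to.

#include <stdlib.h>

// Comparator shape used by the stack code: pointers to the array elements,
// which are themselves void*.
typedef int (*stack_cmp)(const void **a, const void **b);

static stack_cmp g_cmp;  // stand-in for the context parameter qsort lacks

static int qsort_adapter(const void *a, const void *b) {
  // qsort hands us pointers into the void*[] array.
  return g_cmp((const void **)a, (const void **)b);
}

static void sort_pointer_array(void **data, size_t num, stack_cmp comp) {
  g_cmp = comp;  // not thread-safe; shown only to illustrate the signatures
  qsort(data, num, sizeof(void *), qsort_adapter);
}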
diff --git a/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm b/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm
index bb5e4c09..5fa4053e 100644
--- a/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm
+++ b/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm
@@ -12,9 +12,6 @@ default rel
section .text code align=64
-EXTERN aes_nohw_encrypt
-EXTERN aes_nohw_decrypt
-
ALIGN 64
_bsaes_encrypt8:
@@ -1080,17 +1077,13 @@ DB 102,15,56,0,244
DB 0F3h,0C3h ;repret
-EXTERN aes_nohw_cbc_encrypt
global bsaes_cbc_encrypt
ALIGN 16
bsaes_cbc_encrypt:
- mov r11d,DWORD[48+rsp]
- cmp r11d,0
- jne NEAR aes_nohw_cbc_encrypt
- cmp r8,128
- jb NEAR aes_nohw_cbc_encrypt
+
+
mov rax,rsp
$L$cbc_dec_prologue:
@@ -1146,6 +1139,8 @@ $L$cbc_dec_body:
movdqu xmm14,XMMWORD[rbx]
sub r14,8
+ jc NEAR $L$cbc_dec_loop_done
+
$L$cbc_dec_loop:
movdqu xmm15,XMMWORD[r12]
movdqu xmm0,XMMWORD[16+r12]
@@ -1190,6 +1185,7 @@ $L$cbc_dec_loop:
sub r14,8
jnc NEAR $L$cbc_dec_loop
+$L$cbc_dec_loop_done:
add r14,8
jz NEAR $L$cbc_dec_done
@@ -1322,13 +1318,12 @@ $L$cbc_dec_two:
jmp NEAR $L$cbc_dec_done
ALIGN 16
$L$cbc_dec_one:
- lea rcx,[r12]
- lea rdx,[32+rbp]
- lea r8,[r15]
- call aes_nohw_decrypt
- pxor xmm14,XMMWORD[32+rbp]
- movdqu XMMWORD[r13],xmm14
- movdqa xmm14,xmm15
+ movdqa XMMWORD[32+rbp],xmm14
+ call _bsaes_decrypt8
+ pxor xmm15,XMMWORD[32+rbp]
+ movdqu xmm14,XMMWORD[r12]
+ movdqu XMMWORD[r13],xmm15
+ jmp NEAR $L$cbc_dec_done
$L$cbc_dec_done:
movdqu XMMWORD[rbx],xmm14
@@ -1423,8 +1418,8 @@ $L$ctr_enc_body:
mov r14,r8
mov r15,r9
movdqa XMMWORD[32+rbp],xmm0
- cmp r8,8
- jb NEAR $L$ctr_enc_short
+
+
mov ebx,eax
shl rax,7
@@ -1558,26 +1553,8 @@ $L$ctr_enc_loop_done:
movdqu xmm13,XMMWORD[96+r12]
pxor xmm1,xmm13
movdqu XMMWORD[96+r13],xmm1
- jmp NEAR $L$ctr_enc_done
-ALIGN 16
-$L$ctr_enc_short:
- lea rcx,[32+rbp]
- lea rdx,[48+rbp]
- lea r8,[r15]
- call aes_nohw_encrypt
- movdqu xmm0,XMMWORD[r12]
- lea r12,[16+r12]
- mov eax,DWORD[44+rbp]
- bswap eax
- pxor xmm0,XMMWORD[48+rbp]
- inc eax
- movdqu XMMWORD[r13],xmm0
- bswap eax
- lea r13,[16+r13]
- mov DWORD[44+rsp],eax
- dec r14
- jnz NEAR $L$ctr_enc_short
+
$L$ctr_enc_done:
lea rax,[rsp]