author | Robert Sloan <varomodt@google.com> | 2019-03-19 02:02:05 -0700
committer | android-build-merger <android-build-merger@google.com> | 2019-03-19 02:02:05 -0700
commit | 767904931a5f7012915cf015d54ca571dfb86e03 (patch)
tree | d5956e0da0ddbeb7e907378720fcbc8c6926beee
parent | 8c9200ba9943ec79d6e957b2893f9a1455208778 (diff)
parent | bdfba2a0b5cfa78c35c71b35bd385a9acfc3ec14 (diff)
download | boringssl-767904931a5f7012915cf015d54ca571dfb86e03.tar.gz
external/boringssl: Sync to fdb48f98612e934eab339b4871484b1c987553e2. am: 9d5d1a76eb am: d54d28eca9
am: bdfba2a0b5
Change-Id: I13d1010c30643e3ad126ddadcac5f1eea83087f4
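
Aside from the regenerated assembly, the most self-contained change in this sync is the rewrite of sk_find in src/crypto/stack/stack.c: instead of casting the element comparator and handing it to bsearch (undefined behavior), the stack now runs its own binary search that returns the earliest matching index. Below is a minimal standalone sketch of that earliest-match search, not the BoringSSL code itself: find_first and cmp_str are illustrative names only, and the comparator here takes element pointers directly rather than the pointer-to-pointer convention used by sk_find's call_cmp_func.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Earliest-match binary search over a sorted array of pointers.
 * Returns 1 and sets |*out_index| to the first element equal to |target|,
 * or returns 0 if nothing matches. */
static int find_first(const void *const *data, size_t num, const void *target,
                      int (*cmp)(const void *, const void *),
                      size_t *out_index) {
  size_t lo = 0, hi = num;  /* Candidates are the half-open range [lo, hi). */
  while (lo < hi) {
    /* Bias |mid| towards |lo| so that shrinking |hi| to |mid| + 1 on a
     * match always makes progress. */
    size_t mid = lo + (hi - lo - 1) / 2;
    int r = cmp(target, data[mid]);
    if (r > 0) {
      lo = mid + 1;              /* |mid| is too low. */
    } else if (r < 0) {
      hi = mid;                  /* |mid| is too high. */
    } else if (hi - lo == 1) {
      if (out_index != NULL) {
        *out_index = mid;        /* Only one candidate left: first match. */
      }
      return 1;
    } else {
      hi = mid + 1;              /* Matched, but look for an earlier match. */
    }
  }
  return 0;                      /* Not found. */
}

static int cmp_str(const void *a, const void *b) {
  return strcmp((const char *)a, (const char *)b);
}

int main(void) {
  const char *words[] = {"apple", "pear", "pear", "plum"};
  size_t idx;
  if (find_first((const void *const *)words, 4, "pear", cmp_str, &idx)) {
    printf("first match at index %zu\n", idx);  /* prints 1 */
  }
  return 0;
}
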
-rw-r--r-- | BORINGSSL_REVISION | 2
-rw-r--r-- | eureka.mk | 1
-rw-r--r-- | ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S | 337
-rw-r--r-- | ios-arm/crypto/fipsmodule/bsaes-armv7.S | 95
-rw-r--r-- | linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S | 339
-rw-r--r-- | linux-arm/crypto/fipsmodule/bsaes-armv7.S | 95
-rw-r--r-- | linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S | 53
-rw-r--r-- | mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S | 50
-rw-r--r-- | sources.bp | 1
-rw-r--r-- | sources.mk | 1
-rw-r--r-- | src/crypto/fipsmodule/CMakeLists.txt | 2
-rw-r--r-- | src/crypto/fipsmodule/aes/asm/bsaes-armv7.pl | 95
-rw-r--r-- | src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl | 59
-rw-r--r-- | src/crypto/fipsmodule/aes/internal.h | 2
-rw-r--r-- | src/crypto/fipsmodule/cipher/e_aes.c | 6
-rw-r--r-- | src/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl | 287
-rw-r--r-- | src/crypto/fipsmodule/modes/internal.h | 16
-rw-r--r-- | src/crypto/stack/stack.c | 63
-rw-r--r-- | win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm | 51
19 files changed, 1118 insertions, 437 deletions
diff --git a/BORINGSSL_REVISION b/BORINGSSL_REVISION index c10534e3..42ad2f07 100644 --- a/BORINGSSL_REVISION +++ b/BORINGSSL_REVISION @@ -1 +1 @@ -35941f2923155664bd9fa5d897cb336a0ab729a1 +fdb48f98612e934eab339b4871484b1c987553e2 @@ -298,6 +298,7 @@ linux_aarch64_sources := \ linux-aarch64/crypto/chacha/chacha-armv8.S\ linux-aarch64/crypto/fipsmodule/aesv8-armx64.S\ linux-aarch64/crypto/fipsmodule/armv8-mont.S\ + linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S\ linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S\ linux-aarch64/crypto/fipsmodule/sha1-armv8.S\ linux-aarch64/crypto/fipsmodule/sha256-armv8.S\ diff --git a/ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S b/ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S new file mode 100644 index 00000000..62bdc9a8 --- /dev/null +++ b/ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S @@ -0,0 +1,337 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(BORINGSSL_PREFIX) +#include <boringssl_prefix_symbols_asm.h> +#endif +.text + +.globl _gcm_init_neon +.private_extern _gcm_init_neon + +.align 4 +_gcm_init_neon: + // This function is adapted from gcm_init_v8. xC2 is t3. + ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret + + +.globl _gcm_gmult_neon +.private_extern _gcm_gmult_neon + +.align 4 +_gcm_gmult_neon: + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks@PAGE // load constants + add x9, x9, Lmasks@PAGEOFF + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b Lgmult_neon + + +.globl _gcm_ghash_neon +.private_extern _gcm_ghash_neon + +.align 4 +_gcm_ghash_neon: + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks@PAGE // load constants + add x9, x9, Lmasks@PAGEOFF + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) 
+ ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. 
It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. + // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret + + +.section __TEXT,__const +.align 4 +Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM diff --git a/ios-arm/crypto/fipsmodule/bsaes-armv7.S b/ios-arm/crypto/fipsmodule/bsaes-armv7.S index 4d4b7cc7..dffc0c24 100644 --- a/ios-arm/crypto/fipsmodule/bsaes-armv7.S +++ b/ios-arm/crypto/fipsmodule/bsaes-armv7.S @@ -1086,12 +1086,6 @@ Lkey_loop: @ don't save last round key bx lr -@ TODO(davidben): This should be aes_nohw_cbc_encrypt, but that function does -@ not exist. Rather than add it, patch this fallback out. See -@ https://crbug.com/boringssl/256. - - - .globl _bsaes_cbc_encrypt .private_extern _bsaes_cbc_encrypt #ifdef __thumb2__ @@ -1099,16 +1093,8 @@ Lkey_loop: #endif .align 5 _bsaes_cbc_encrypt: -#ifndef __KERNEL__ - cmp r2, #128 -#ifndef __thumb__ - blo _AES_cbc_encrypt -#else - bhs 1f - b _AES_cbc_encrypt -1: -#endif -#endif + @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for + @ short inputs. We patch this out, using bsaes for all input sizes. @ it is up to the caller to make sure we are called with enc == 0 @@ -1206,10 +1192,7 @@ Lcbc_dec_loop_finish: adds r2, r2, #8 beq Lcbc_dec_done - vld1.8 {q0}, [r0]! @ load input - cmp r2, #2 - blo Lcbc_dec_one - vld1.8 {q1}, [r0]! + @ Set up most parameters for the _bsaes_decrypt8 call. 
#ifndef BSAES_ASM_EXTENDED_KEY mov r4, sp @ pass the key #else @@ -1217,6 +1200,11 @@ Lcbc_dec_loop_finish: #endif mov r5, r10 vstmia r9, {q15} @ put aside IV + + vld1.8 {q0}, [r0]! @ load input + cmp r2, #2 + blo Lcbc_dec_one + vld1.8 {q1}, [r0]! beq Lcbc_dec_two vld1.8 {q2}, [r0]! cmp r2, #4 @@ -1334,16 +1322,11 @@ Lcbc_dec_two: .align 4 Lcbc_dec_one: sub r0, r0, #0x10 - mov r10, r1 @ save original out pointer - mov r1, r9 @ use the iv scratch space as out buffer - mov r2, r3 - vmov q4,q15 @ just in case ensure that IV - vmov q5,q0 @ and input are preserved - bl _aes_nohw_decrypt - vld1.8 {q0}, [r9] @ load result - veor q0, q0, q4 @ ^= IV - vmov q15, q5 @ q5 holds input - vst1.8 {q0}, [r10] @ write output + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q15}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vst1.8 {q0}, [r1]! @ write output Lcbc_dec_done: #ifndef BSAES_ASM_EXTENDED_KEY @@ -1361,7 +1344,6 @@ Lcbc_dec_bzero:@ wipe key schedule [if any] VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} - .globl _bsaes_ctr32_encrypt_blocks .private_extern _bsaes_ctr32_encrypt_blocks #ifdef __thumb2__ @@ -1369,9 +1351,8 @@ Lcbc_dec_bzero:@ wipe key schedule [if any] #endif .align 5 _bsaes_ctr32_encrypt_blocks: - cmp r2, #8 @ use plain AES for - blo Lctr_enc_short @ small sizes - + @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + @ out to retain a constant-time implementation. mov ip, sp stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} VFP_ABI_PUSH @@ -1547,50 +1528,8 @@ Lctr_enc_bzero:@ wipe key schedule [if any] VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return -.align 4 -Lctr_enc_short: - ldr ip, [sp] @ ctr pointer is passed on stack - stmdb sp!, {r4,r5,r6,r7,r8, lr} - - mov r4, r0 @ copy arguments - mov r5, r1 - mov r6, r2 - mov r7, r3 - ldr r8, [ip, #12] @ load counter LSW - vld1.8 {q1}, [ip] @ load whole counter value -#ifdef __ARMEL__ - rev r8, r8 -#endif - sub sp, sp, #0x10 - vst1.8 {q1}, [sp] @ copy counter value - sub sp, sp, #0x10 - -Lctr_enc_short_loop: - add r0, sp, #0x10 @ input counter value - mov r1, sp @ output on the stack - mov r2, r7 @ key - - bl _aes_nohw_encrypt - - vld1.8 {q0}, [r4]! @ load input - vld1.8 {q1}, [sp] @ load encrypted counter - add r8, r8, #1 -#ifdef __ARMEL__ - rev r0, r8 - str r0, [sp, #0x1c] @ next counter value -#else - str r8, [sp, #0x1c] @ next counter value -#endif - veor q0,q0,q1 - vst1.8 {q0}, [r5]! @ store output - subs r6, r6, #1 - bne Lctr_enc_short_loop - - vmov.i32 q0, #0 - vmov.i32 q1, #0 - vstmia sp!, {q0,q1} - - ldmia sp!, {r4,r5,r6,r7,r8, pc} + @ OpenSSL contains aes_nohw_* fallback code here. We patch this + @ out to retain a constant-time implementation. #endif #endif // !OPENSSL_NO_ASM diff --git a/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S b/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S new file mode 100644 index 00000000..1cfbec29 --- /dev/null +++ b/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S @@ -0,0 +1,339 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if !defined(OPENSSL_NO_ASM) +#if defined(__aarch64__) +#if defined(BORINGSSL_PREFIX) +#include <boringssl_prefix_symbols_asm.h> +#endif +.text + +.globl gcm_init_neon +.hidden gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + // This function is adapted from gcm_init_v8. xC2 is t3. 
+ ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret +.size gcm_init_neon,.-gcm_init_neon + +.globl gcm_gmult_neon +.hidden gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, .Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.globl gcm_ghash_neon +.hidden gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, .Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +.Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +.Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) + ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. 
+ // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne .Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret +.size gcm_ghash_neon,.-gcm_ghash_neon + +.section .rodata +.align 4 +.Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif +#endif // !OPENSSL_NO_ASM diff --git a/linux-arm/crypto/fipsmodule/bsaes-armv7.S b/linux-arm/crypto/fipsmodule/bsaes-armv7.S index 20b9bb05..0ad56bc7 100644 --- a/linux-arm/crypto/fipsmodule/bsaes-armv7.S +++ b/linux-arm/crypto/fipsmodule/bsaes-armv7.S @@ -1081,27 +1081,13 @@ _bsaes_key_convert: @ don't save last round key bx lr .size _bsaes_key_convert,.-_bsaes_key_convert -@ TODO(davidben): This should be aes_nohw_cbc_encrypt, but that function does -@ not exist. Rather than add it, patch this fallback out. See -@ https://crbug.com/boringssl/256. - - - .globl bsaes_cbc_encrypt .hidden bsaes_cbc_encrypt .type bsaes_cbc_encrypt,%function .align 5 bsaes_cbc_encrypt: -#ifndef __KERNEL__ - cmp r2, #128 -#ifndef __thumb__ - blo AES_cbc_encrypt -#else - bhs 1f - b AES_cbc_encrypt -1: -#endif -#endif + @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for + @ short inputs. We patch this out, using bsaes for all input sizes. @ it is up to the caller to make sure we are called with enc == 0 @@ -1199,10 +1185,7 @@ bsaes_cbc_encrypt: adds r2, r2, #8 beq .Lcbc_dec_done - vld1.8 {q0}, [r0]! @ load input - cmp r2, #2 - blo .Lcbc_dec_one - vld1.8 {q1}, [r0]! + @ Set up most parameters for the _bsaes_decrypt8 call. #ifndef BSAES_ASM_EXTENDED_KEY mov r4, sp @ pass the key #else @@ -1210,6 +1193,11 @@ bsaes_cbc_encrypt: #endif mov r5, r10 vstmia r9, {q15} @ put aside IV + + vld1.8 {q0}, [r0]! @ load input + cmp r2, #2 + blo .Lcbc_dec_one + vld1.8 {q1}, [r0]! beq .Lcbc_dec_two vld1.8 {q2}, [r0]! cmp r2, #4 @@ -1327,16 +1315,11 @@ bsaes_cbc_encrypt: .align 4 .Lcbc_dec_one: sub r0, r0, #0x10 - mov r10, r1 @ save original out pointer - mov r1, r9 @ use the iv scratch space as out buffer - mov r2, r3 - vmov q4,q15 @ just in case ensure that IV - vmov q5,q0 @ and input are preserved - bl aes_nohw_decrypt - vld1.8 {q0}, [r9] @ load result - veor q0, q0, q4 @ ^= IV - vmov q15, q5 @ q5 holds input - vst1.8 {q0}, [r10] @ write output + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q15}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vst1.8 {q0}, [r1]! 
@ write output .Lcbc_dec_done: #ifndef BSAES_ASM_EXTENDED_KEY @@ -1354,15 +1337,13 @@ bsaes_cbc_encrypt: VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt - .globl bsaes_ctr32_encrypt_blocks .hidden bsaes_ctr32_encrypt_blocks .type bsaes_ctr32_encrypt_blocks,%function .align 5 bsaes_ctr32_encrypt_blocks: - cmp r2, #8 @ use plain AES for - blo .Lctr_enc_short @ small sizes - + @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + @ out to retain a constant-time implementation. mov ip, sp stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} VFP_ABI_PUSH @@ -1538,50 +1519,8 @@ bsaes_ctr32_encrypt_blocks: VFP_ABI_POP ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return -.align 4 -.Lctr_enc_short: - ldr ip, [sp] @ ctr pointer is passed on stack - stmdb sp!, {r4,r5,r6,r7,r8, lr} - - mov r4, r0 @ copy arguments - mov r5, r1 - mov r6, r2 - mov r7, r3 - ldr r8, [ip, #12] @ load counter .LSW - vld1.8 {q1}, [ip] @ load whole counter value -#ifdef __ARMEL__ - rev r8, r8 -#endif - sub sp, sp, #0x10 - vst1.8 {q1}, [sp] @ copy counter value - sub sp, sp, #0x10 - -.Lctr_enc_short_loop: - add r0, sp, #0x10 @ input counter value - mov r1, sp @ output on the stack - mov r2, r7 @ key - - bl aes_nohw_encrypt - - vld1.8 {q0}, [r4]! @ load input - vld1.8 {q1}, [sp] @ load encrypted counter - add r8, r8, #1 -#ifdef __ARMEL__ - rev r0, r8 - str r0, [sp, #0x1c] @ next counter value -#else - str r8, [sp, #0x1c] @ next counter value -#endif - veor q0,q0,q1 - vst1.8 {q0}, [r5]! @ store output - subs r6, r6, #1 - bne .Lctr_enc_short_loop - - vmov.i32 q0, #0 - vmov.i32 q1, #0 - vstmia sp!, {q0,q1} - - ldmia sp!, {r4,r5,r6,r7,r8, pc} + @ OpenSSL contains aes_nohw_* fallback code here. We patch this + @ out to retain a constant-time implementation. 
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks #endif #endif diff --git a/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S index 5236aa66..5437762f 100644 --- a/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S +++ b/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S @@ -13,11 +13,6 @@ #endif .text -.extern aes_nohw_encrypt -.hidden aes_nohw_encrypt -.extern aes_nohw_decrypt -.hidden aes_nohw_decrypt - .type _bsaes_encrypt8,@function .align 64 _bsaes_encrypt8: @@ -1083,18 +1078,14 @@ _bsaes_key_convert: .byte 0xf3,0xc3 .cfi_endproc .size _bsaes_key_convert,.-_bsaes_key_convert -.extern aes_nohw_cbc_encrypt -.hidden aes_nohw_cbc_encrypt .globl bsaes_cbc_encrypt .hidden bsaes_cbc_encrypt .type bsaes_cbc_encrypt,@function .align 16 bsaes_cbc_encrypt: .cfi_startproc - cmpl $0,%r9d - jne aes_nohw_cbc_encrypt - cmpq $128,%rdx - jb aes_nohw_cbc_encrypt + + movq %rsp,%rax .Lcbc_dec_prologue: @@ -1143,6 +1134,8 @@ bsaes_cbc_encrypt: movdqu (%rbx),%xmm14 subq $8,%r14 + jc .Lcbc_dec_loop_done + .Lcbc_dec_loop: movdqu 0(%r12),%xmm15 movdqu 16(%r12),%xmm0 @@ -1187,6 +1180,7 @@ bsaes_cbc_encrypt: subq $8,%r14 jnc .Lcbc_dec_loop +.Lcbc_dec_loop_done: addq $8,%r14 jz .Lcbc_dec_done @@ -1319,13 +1313,12 @@ bsaes_cbc_encrypt: jmp .Lcbc_dec_done .align 16 .Lcbc_dec_one: - leaq (%r12),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call aes_nohw_decrypt - pxor 32(%rbp),%xmm14 - movdqu %xmm14,(%r13) - movdqa %xmm15,%xmm14 + movdqa %xmm14,32(%rbp) + call _bsaes_decrypt8 + pxor 32(%rbp),%xmm15 + movdqu 0(%r12),%xmm14 + movdqu %xmm15,0(%r13) + jmp .Lcbc_dec_done .Lcbc_dec_done: movdqu %xmm14,(%rbx) @@ -1403,8 +1396,8 @@ bsaes_ctr32_encrypt_blocks: movq %rdx,%r14 movq %rcx,%r15 movdqa %xmm0,32(%rbp) - cmpq $8,%rdx - jb .Lctr_enc_short + + movl %eax,%ebx shlq $7,%rax @@ -1538,26 +1531,8 @@ bsaes_ctr32_encrypt_blocks: movdqu 96(%r12),%xmm13 pxor %xmm13,%xmm1 movdqu %xmm1,96(%r13) - jmp .Lctr_enc_done -.align 16 -.Lctr_enc_short: - leaq 32(%rbp),%rdi - leaq 48(%rbp),%rsi - leaq (%r15),%rdx - call aes_nohw_encrypt - movdqu (%r12),%xmm0 - leaq 16(%r12),%r12 - movl 44(%rbp),%eax - bswapl %eax - pxor 48(%rbp),%xmm0 - incl %eax - movdqu %xmm0,(%r13) - bswapl %eax - leaq 16(%r13),%r13 - movl %eax,44(%rsp) - decq %r14 - jnz .Lctr_enc_short + .Lctr_enc_done: leaq (%rsp),%rax diff --git a/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S index 5a65960d..c2807e38 100644 --- a/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S +++ b/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S @@ -14,9 +14,6 @@ .text - - - .p2align 6 _bsaes_encrypt8: @@ -1081,17 +1078,14 @@ L$key_loop: .byte 0xf3,0xc3 - .globl _bsaes_cbc_encrypt .private_extern _bsaes_cbc_encrypt .p2align 4 _bsaes_cbc_encrypt: - cmpl $0,%r9d - jne _aes_nohw_cbc_encrypt - cmpq $128,%rdx - jb _aes_nohw_cbc_encrypt + + movq %rsp,%rax L$cbc_dec_prologue: @@ -1134,6 +1128,8 @@ L$cbc_dec_prologue: movdqu (%rbx),%xmm14 subq $8,%r14 + jc L$cbc_dec_loop_done + L$cbc_dec_loop: movdqu 0(%r12),%xmm15 movdqu 16(%r12),%xmm0 @@ -1178,6 +1174,7 @@ L$cbc_dec_loop: subq $8,%r14 jnc L$cbc_dec_loop +L$cbc_dec_loop_done: addq $8,%r14 jz L$cbc_dec_done @@ -1310,13 +1307,12 @@ L$cbc_dec_two: jmp L$cbc_dec_done .p2align 4 L$cbc_dec_one: - leaq (%r12),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call _aes_nohw_decrypt - pxor 32(%rbp),%xmm14 - movdqu %xmm14,(%r13) - movdqa %xmm15,%xmm14 + movdqa %xmm14,32(%rbp) + call _bsaes_decrypt8 + pxor 32(%rbp),%xmm15 + movdqu 0(%r12),%xmm14 + movdqu %xmm15,0(%r13) + jmp 
L$cbc_dec_done L$cbc_dec_done: movdqu %xmm14,(%rbx) @@ -1387,8 +1383,8 @@ L$ctr_enc_prologue: movq %rdx,%r14 movq %rcx,%r15 movdqa %xmm0,32(%rbp) - cmpq $8,%rdx - jb L$ctr_enc_short + + movl %eax,%ebx shlq $7,%rax @@ -1522,26 +1518,8 @@ L$ctr_enc_loop_done: movdqu 96(%r12),%xmm13 pxor %xmm13,%xmm1 movdqu %xmm1,96(%r13) - jmp L$ctr_enc_done -.p2align 4 -L$ctr_enc_short: - leaq 32(%rbp),%rdi - leaq 48(%rbp),%rsi - leaq (%r15),%rdx - call _aes_nohw_encrypt - movdqu (%r12),%xmm0 - leaq 16(%r12),%r12 - movl 44(%rbp),%eax - bswapl %eax - pxor 48(%rbp),%xmm0 - incl %eax - movdqu %xmm0,(%r13) - bswapl %eax - leaq 16(%r13),%r13 - movl %eax,44(%rsp) - decq %r14 - jnz L$ctr_enc_short + L$ctr_enc_done: leaq (%rsp),%rax @@ -247,6 +247,7 @@ cc_defaults { "linux-aarch64/crypto/chacha/chacha-armv8.S", "linux-aarch64/crypto/fipsmodule/aesv8-armx64.S", "linux-aarch64/crypto/fipsmodule/armv8-mont.S", + "linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S", "linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S", "linux-aarch64/crypto/fipsmodule/sha1-armv8.S", "linux-aarch64/crypto/fipsmodule/sha256-armv8.S", @@ -243,6 +243,7 @@ linux_aarch64_sources := \ linux-aarch64/crypto/chacha/chacha-armv8.S\ linux-aarch64/crypto/fipsmodule/aesv8-armx64.S\ linux-aarch64/crypto/fipsmodule/armv8-mont.S\ + linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S\ linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S\ linux-aarch64/crypto/fipsmodule/sha1-armv8.S\ linux-aarch64/crypto/fipsmodule/sha256-armv8.S\ diff --git a/src/crypto/fipsmodule/CMakeLists.txt b/src/crypto/fipsmodule/CMakeLists.txt index 09d210bf..fbf25ac8 100644 --- a/src/crypto/fipsmodule/CMakeLists.txt +++ b/src/crypto/fipsmodule/CMakeLists.txt @@ -65,6 +65,7 @@ if(${ARCH} STREQUAL "aarch64") aesv8-armx.${ASM_EXT} armv8-mont.${ASM_EXT} + ghash-neon-armv8.${ASM_EXT} ghashv8-armx.${ASM_EXT} sha1-armv8.${ASM_EXT} sha256-armv8.${ASM_EXT} @@ -99,6 +100,7 @@ perlasm(co-586.${ASM_EXT} bn/asm/co-586.pl) perlasm(ghash-armv4.${ASM_EXT} modes/asm/ghash-armv4.pl) perlasm(ghashp8-ppc.${ASM_EXT} modes/asm/ghashp8-ppc.pl) perlasm(ghashv8-armx.${ASM_EXT} modes/asm/ghashv8-armx.pl) +perlasm(ghash-neon-armv8.${ASM_EXT} modes/asm/ghash-neon-armv8.pl) perlasm(ghash-ssse3-x86_64.${ASM_EXT} modes/asm/ghash-ssse3-x86_64.pl) perlasm(ghash-ssse3-x86.${ASM_EXT} modes/asm/ghash-ssse3-x86.pl) perlasm(ghash-x86_64.${ASM_EXT} modes/asm/ghash-x86_64.pl) diff --git a/src/crypto/fipsmodule/aes/asm/bsaes-armv7.pl b/src/crypto/fipsmodule/aes/asm/bsaes-armv7.pl index 11607d11..d4db3b4d 100644 --- a/src/crypto/fipsmodule/aes/asm/bsaes-armv7.pl +++ b/src/crypto/fipsmodule/aes/asm/bsaes-armv7.pl @@ -1113,26 +1113,12 @@ my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10)); my ($keysched)=("sp"); $code.=<<___; -@ TODO(davidben): This should be aes_nohw_cbc_encrypt, but that function does -@ not exist. Rather than add it, patch this fallback out. See -@ https://crbug.com/boringssl/256. -.extern AES_cbc_encrypt -.extern aes_nohw_decrypt - .global bsaes_cbc_encrypt .type bsaes_cbc_encrypt,%function .align 5 bsaes_cbc_encrypt: -#ifndef __KERNEL__ - cmp $len, #128 -#ifndef __thumb__ - blo AES_cbc_encrypt -#else - bhs 1f - b AES_cbc_encrypt -1: -#endif -#endif + @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for + @ short inputs. We patch this out, using bsaes for all input sizes. @ it is up to the caller to make sure we are called with enc == 0 @@ -1230,10 +1216,7 @@ bsaes_cbc_encrypt: adds $len, $len, #8 beq .Lcbc_dec_done - vld1.8 {@XMM[0]}, [$inp]! 
@ load input - cmp $len, #2 - blo .Lcbc_dec_one - vld1.8 {@XMM[1]}, [$inp]! + @ Set up most parameters for the _bsaes_decrypt8 call. #ifndef BSAES_ASM_EXTENDED_KEY mov r4, $keysched @ pass the key #else @@ -1241,6 +1224,11 @@ bsaes_cbc_encrypt: #endif mov r5, $rounds vstmia $fp, {@XMM[15]} @ put aside IV + + vld1.8 {@XMM[0]}, [$inp]! @ load input + cmp $len, #2 + blo .Lcbc_dec_one + vld1.8 {@XMM[1]}, [$inp]! beq .Lcbc_dec_two vld1.8 {@XMM[2]}, [$inp]! cmp $len, #4 @@ -1358,16 +1346,11 @@ bsaes_cbc_encrypt: .align 4 .Lcbc_dec_one: sub $inp, $inp, #0x10 - mov $rounds, $out @ save original out pointer - mov $out, $fp @ use the iv scratch space as out buffer - mov r2, $key - vmov @XMM[4],@XMM[15] @ just in case ensure that IV - vmov @XMM[5],@XMM[0] @ and input are preserved - bl aes_nohw_decrypt - vld1.8 {@XMM[0]}, [$fp] @ load result - veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV - vmov @XMM[15], @XMM[5] @ @XMM[5] holds input - vst1.8 {@XMM[0]}, [$rounds] @ write output + bl _bsaes_decrypt8 + vldmia $fp, {@XMM[14]} @ reload IV + vld1.8 {@XMM[15]}, [$inp]! @ reload input + veor @XMM[0], @XMM[0], @XMM[14] @ ^= IV + vst1.8 {@XMM[0]}, [$out]! @ write output .Lcbc_dec_done: #ifndef BSAES_ASM_EXTENDED_KEY @@ -1393,14 +1376,12 @@ my $const = "r6"; # shared with _bsaes_encrypt8_alt my $keysched = "sp"; $code.=<<___; -.extern aes_nohw_encrypt .global bsaes_ctr32_encrypt_blocks .type bsaes_ctr32_encrypt_blocks,%function .align 5 bsaes_ctr32_encrypt_blocks: - cmp $len, #8 @ use plain AES for - blo .Lctr_enc_short @ small sizes - + @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + @ out to retain a constant-time implementation. mov ip, sp stmdb sp!, {r4-r10, lr} VFP_ABI_PUSH @@ -1576,50 +1557,8 @@ bsaes_ctr32_encrypt_blocks: VFP_ABI_POP ldmia sp!, {r4-r10, pc} @ return -.align 4 -.Lctr_enc_short: - ldr ip, [sp] @ ctr pointer is passed on stack - stmdb sp!, {r4-r8, lr} - - mov r4, $inp @ copy arguments - mov r5, $out - mov r6, $len - mov r7, $key - ldr r8, [ip, #12] @ load counter LSW - vld1.8 {@XMM[1]}, [ip] @ load whole counter value -#ifdef __ARMEL__ - rev r8, r8 -#endif - sub sp, sp, #0x10 - vst1.8 {@XMM[1]}, [sp] @ copy counter value - sub sp, sp, #0x10 - -.Lctr_enc_short_loop: - add r0, sp, #0x10 @ input counter value - mov r1, sp @ output on the stack - mov r2, r7 @ key - - bl aes_nohw_encrypt - - vld1.8 {@XMM[0]}, [r4]! @ load input - vld1.8 {@XMM[1]}, [sp] @ load encrypted counter - add r8, r8, #1 -#ifdef __ARMEL__ - rev r0, r8 - str r0, [sp, #0x1c] @ next counter value -#else - str r8, [sp, #0x1c] @ next counter value -#endif - veor @XMM[0],@XMM[0],@XMM[1] - vst1.8 {@XMM[0]}, [r5]! @ store output - subs r6, r6, #1 - bne .Lctr_enc_short_loop - - vmov.i32 q0, #0 - vmov.i32 q1, #0 - vstmia sp!, {q0-q1} - - ldmia sp!, {r4-r8, pc} + @ OpenSSL contains aes_nohw_* fallback code here. We patch this + @ out to retain a constant-time implementation. 
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks ___ } diff --git a/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl b/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl index 81331bfa..3bb28190 100644 --- a/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl +++ b/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl @@ -811,9 +811,6 @@ ___ $code.=<<___; .text -.extern aes_nohw_encrypt -.extern aes_nohw_decrypt - .type _bsaes_encrypt8,\@abi-omnipotent .align 64 _bsaes_encrypt8: @@ -1609,22 +1606,14 @@ $code.=<<___; ___ } $code.=<<___; -.extern aes_nohw_cbc_encrypt .globl bsaes_cbc_encrypt .type bsaes_cbc_encrypt,\@abi-omnipotent .align 16 bsaes_cbc_encrypt: .cfi_startproc -___ -$code.=<<___ if ($win64); - mov 48(%rsp),$arg6 # pull direction flag -___ -$code.=<<___; - cmp \$0,$arg6 - jne aes_nohw_cbc_encrypt - cmp \$128,$arg3 - jb aes_nohw_cbc_encrypt - + # In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for + # short inputs or if enc is one. We patch this out, using bsaes for all + # input sizes. The caller is required to ensure enc is zero. mov %rsp, %rax .Lcbc_dec_prologue: push %rbp @@ -1683,6 +1672,8 @@ $code.=<<___; movdqu (%rbx), @XMM[15] # load IV sub \$8,$len + jc .Lcbc_dec_loop_done + .Lcbc_dec_loop: movdqu 0x00($inp), @XMM[0] # load input movdqu 0x10($inp), @XMM[1] @@ -1727,6 +1718,7 @@ $code.=<<___; sub \$8,$len jnc .Lcbc_dec_loop +.Lcbc_dec_loop_done: add \$8,$len jz .Lcbc_dec_done @@ -1859,13 +1851,12 @@ $code.=<<___; jmp .Lcbc_dec_done .align 16 .Lcbc_dec_one: - lea ($inp), $arg1 - lea 0x20(%rbp), $arg2 # buffer output - lea ($key), $arg3 - call aes_nohw_decrypt # doesn't touch %xmm - pxor 0x20(%rbp), @XMM[15] # ^= IV - movdqu @XMM[15], ($out) # write output - movdqa @XMM[0], @XMM[15] # IV + movdqa @XMM[15], 0x20(%rbp) # put aside IV + call _bsaes_decrypt8 + pxor 0x20(%rbp), @XMM[0] # ^= IV + movdqu 0x00($inp), @XMM[15] # IV + movdqu @XMM[0], 0x00($out) # write output + jmp .Lcbc_dec_done .Lcbc_dec_done: movdqu @XMM[15], (%rbx) # return IV @@ -1968,8 +1959,8 @@ $code.=<<___; mov $arg3, $len mov $arg4, $key movdqa %xmm0, 0x20(%rbp) # copy counter - cmp \$8, $arg3 - jb .Lctr_enc_short + # In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + # out to retain a constant-time implementation. mov %eax, %ebx # rounds shl \$7, %rax # 128 bytes per inner round key @@ -2103,27 +2094,9 @@ $code.=<<___; movdqu 0x60($inp), @XMM[14] pxor @XMM[14], @XMM[2] movdqu @XMM[2], 0x60($out) - jmp .Lctr_enc_done - -.align 16 -.Lctr_enc_short: - lea 0x20(%rbp), $arg1 - lea 0x30(%rbp), $arg2 - lea ($key), $arg3 - call aes_nohw_encrypt - movdqu ($inp), @XMM[1] - lea 16($inp), $inp - mov 0x2c(%rbp), %eax # load 32-bit counter - bswap %eax - pxor 0x30(%rbp), @XMM[1] - inc %eax # increment - movdqu @XMM[1], ($out) - bswap %eax - lea 16($out), $out - mov %eax, 0x2c(%rsp) # save 32-bit counter - dec $len - jnz .Lctr_enc_short + # OpenSSL contains aes_nohw_* fallback code here. We patch this + # out to retain a constant-time implementation. .Lctr_enc_done: lea (%rsp), %rax pxor %xmm0, %xmm0 diff --git a/src/crypto/fipsmodule/aes/internal.h b/src/crypto/fipsmodule/aes/internal.h index a05abcbf..63070bc6 100644 --- a/src/crypto/fipsmodule/aes/internal.h +++ b/src/crypto/fipsmodule/aes/internal.h @@ -133,7 +133,7 @@ void aes_hw_ecb_encrypt(const uint8_t *in, uint8_t *out, size_t length, #if defined(BSAES) // On platforms where BSAES gets defined (just above), then these functions are -// provided by asm. +// provided by asm. Note |bsaes_cbc_encrypt| requires |enc| to be zero. 
void bsaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length, const AES_KEY *key, uint8_t ivec[16], int enc); void bsaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, diff --git a/src/crypto/fipsmodule/cipher/e_aes.c b/src/crypto/fipsmodule/cipher/e_aes.c index 51a1fb1c..a1859d74 100644 --- a/src/crypto/fipsmodule/cipher/e_aes.c +++ b/src/crypto/fipsmodule/cipher/e_aes.c @@ -111,7 +111,8 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, } } else if (bsaes_capable() && mode == EVP_CIPH_CBC_MODE) { ret = aes_nohw_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks); - dat->block = aes_nohw_decrypt; + // If |dat->stream.cbc| is provided, |dat->block| is never used. + dat->block = NULL; dat->stream.cbc = bsaes_cbc_encrypt; } else if (vpaes_capable()) { ret = vpaes_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks); @@ -138,7 +139,8 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, } } else if (bsaes_capable() && mode == EVP_CIPH_CTR_MODE) { ret = aes_nohw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks); - dat->block = aes_nohw_encrypt; + // If |dat->stream.ctr| is provided, |dat->block| is never used. + dat->block = NULL; dat->stream.ctr = bsaes_ctr32_encrypt_blocks; } else if (vpaes_capable()) { ret = vpaes_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks); diff --git a/src/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl b/src/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl new file mode 100644 index 00000000..972be419 --- /dev/null +++ b/src/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl @@ -0,0 +1,287 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It +# implements the multiplication algorithm described in: +# +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software +# Polynomial Multiplication on ARM Processors using the NEON Engine. +# +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf +# +# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is +# AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit +# NEON, the low and high halves of the 128-bit register q0 are accessible as +# 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of +# vN. Where the 32-bit version would use the upper half, this file must keep +# halves in separate registers. +# +# The other distinction is in syntax. 32-bit NEON embeds lane information in the +# instruction name, while AArch64 uses suffixes on the registers. For instance, +# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written: +# +# vshl.i64 q0, q0, #1 +# +# in 64-bit, it would be written: +# +# shl v0.2d, v0.2d, #1 +# +# See Programmer's Guide for ARMv8-A, section 7 for details. 
+# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf +# +# Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ +# only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials +# and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit +# polynomial and is conditioned on the PMULL extension. This file emulates the +# latter with the former. + +use strict; + +my $flavour = shift; +my $output; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; + my $dir = $1; + my $xlate; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3)); # argument block +my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4)); +my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7)); +# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers +# to spare. +my ($t0, $t1, $t2, $t3) = map("v$_", (16..19)); +my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23)); +my ($k48_k32, $k16_k0) = map("v$_", (24..25)); + +my $code = ""; + +# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b +# must be distinct from $t* and $k*. $t* are clobbered by the emitted code. +sub clmul64x64 { +my ($r, $a, $b) = @_; +$code .= <<___; + ext $t0.8b, $a.8b, $a.8b, #1 // A1 + pmull $t0.8h, $t0.8b, $b.8b // F = A1*B + ext $r.8b, $b.8b, $b.8b, #1 // B1 + pmull $r.8h, $a.8b, $r.8b // E = A*B1 + ext $t1.8b, $a.8b, $a.8b, #2 // A2 + pmull $t1.8h, $t1.8b, $b.8b // H = A2*B + ext $t3.8b, $b.8b, $b.8b, #2 // B2 + pmull $t3.8h, $a.8b, $t3.8b // G = A*B2 + ext $t2.8b, $a.8b, $a.8b, #3 // A3 + eor $t0.16b, $t0.16b, $r.16b // L = E + F + pmull $t2.8h, $t2.8b, $b.8b // J = A3*B + ext $r.8b, $b.8b, $b.8b, #3 // B3 + eor $t1.16b, $t1.16b, $t3.16b // M = G + H + pmull $r.8h, $a.8b, $r.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor \$t0#lo, \$t0#lo, \$t0#hi @ t0 = P0 + P1 (L) + // vand \$t0#hi, \$t0#hi, \$k48 + // veor \$t0#lo, \$t0#lo, \$t0#hi + // + // veor \$t1#lo, \$t1#lo, \$t1#hi @ t1 = P2 + P3 (M) + // vand \$t1#hi, \$t1#hi, \$k32 + // veor \$t1#lo, \$t1#lo, \$t1#hi + // + // veor \$t2#lo, \$t2#lo, \$t2#hi @ t2 = P4 + P5 (N) + // vand \$t2#hi, \$t2#hi, \$k16 + // veor \$t2#lo, \$t2#lo, \$t2#hi + // + // veor \$t3#lo, \$t3#lo, \$t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 \$t3#hi, #0 + // + // \$kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext $t3.8b, $b.8b, $b.8b, #4 // B4 + eor $t2.16b, $t2.16b, $r.16b // N = I + J + pmull $t3.8h, $a.8b, $t3.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 $t0l_t1l.2d, $t0.2d, $t1.2d + zip1 $t2l_t3l.2d, $t2.2d, $t3.2d + zip2 $t0h_t1h.2d, $t0.2d, $t1.2d + zip2 $t2h_t3h.2d, $t2.2d, $t3.2d + eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b + eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b + and $t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b + and $t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b + eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b + eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b + zip1 $t0.2d, $t0l_t1l.2d, $t0h_t1h.2d + zip1 $t2.2d, $t2l_t3l.2d, $t2h_t3h.2d + zip2 $t1.2d, $t0l_t1l.2d, $t0h_t1h.2d + zip2 $t3.2d, $t2l_t3l.2d, $t2h_t3h.2d + + ext $t0.16b, $t0.16b, $t0.16b, #15 // t0 = t0 << 8 + ext $t1.16b, $t1.16b, $t1.16b, #14 // t1 = t1 << 16 + pmull $r.8h, $a.8b, $b.8b // D = A*B + ext $t3.16b, $t3.16b, $t3.16b, #12 // t3 = t3 << 32 + ext $t2.16b, $t2.16b, $t2.16b, #13 // t2 = t2 << 24 + eor $t0.16b, $t0.16b, $t1.16b + eor $t2.16b, $t2.16b, $t3.16b + eor $r.16b, $r.16b, $t0.16b + eor $r.16b, $r.16b, $t2.16b +___ +} + +$code .= <<___; +.text + +.global gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + // This function is adapted from gcm_init_v8. xC2 is t3. + ld1 {$t1.2d}, [x1] // load H + movi $t3.16b, #0xe1 + shl $t3.2d, $t3.2d, #57 // 0xc2.0 + ext $INlo.16b, $t1.16b, $t1.16b, #8 + ushr $t2.2d, $t3.2d, #63 + dup $t1.4s, $t1.s[1] + ext $t0.16b, $t2.16b, $t3.16b, #8 // t0=0xc2....01 + ushr $t2.2d, $INlo.2d, #63 + sshr $t1.4s, $t1.4s, #31 // broadcast carry bit + and $t2.16b, $t2.16b, $t0.16b + shl $INlo.2d, $INlo.2d, #1 + ext $t2.16b, $t2.16b, $t2.16b, #8 + and $t0.16b, $t0.16b, $t1.16b + orr $INlo.16b, $INlo.16b, $t2.16b // H<<<=1 + eor $Hlo.16b, $INlo.16b, $t0.16b // twisted H + st1 {$Hlo.2d}, [x0] // store Htable[0] + ret +.size gcm_init_neon,.-gcm_init_neon + +.global gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + ld1 {$INlo.16b}, [$Xi] // load Xi + ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H + ld1 {$Hhi.1d}, [$Htbl] + adrp x9, :pg_hi21:.Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {$k48_k32.2d, $k16_k0.2d}, [x9] + rev64 $INlo.16b, $INlo.16b // byteswap Xi + ext $INlo.16b, $INlo.16b, $INlo.16b, #8 + eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing + + mov $len, #16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.global gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + ld1 {$Xl.16b}, [$Xi] // load Xi + ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H + ld1 {$Hhi.1d}, [$Htbl] + adrp x9, :pg_hi21:.Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {$k48_k32.2d, $k16_k0.2d}, [x9] + rev64 $Xl.16b, $Xl.16b // byteswap Xi + ext $Xl.16b, $Xl.16b, $Xl.16b, #8 + eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing + +.Loop_neon: + ld1 {$INlo.16b}, [$inp], #16 // load inp + rev64 $INlo.16b, $INlo.16b // byteswap inp + ext $INlo.16b, $INlo.16b, $INlo.16b, #8 + eor $INlo.16b, $INlo.16b, $Xl.16b // inp ^= Xi + +.Lgmult_neon: + // Split the input into $INlo and $INhi. (The upper halves are unused, + // so it is okay to leave them alone.) 
+ ins $INhi.d[0], $INlo.d[1] +___ +&clmul64x64 ($Xl, $Hlo, $INlo); # H.lo·Xi.lo +$code .= <<___; + eor $INlo.8b, $INlo.8b, $INhi.8b // Karatsuba pre-processing +___ +&clmul64x64 ($Xm, $Hhl, $INlo); # (H.lo+H.hi)·(Xi.lo+Xi.hi) +&clmul64x64 ($Xh, $Hhi, $INhi); # H.hi·Xi.hi +$code .= <<___; + ext $t0.16b, $Xl.16b, $Xh.16b, #8 + eor $Xm.16b, $Xm.16b, $Xl.16b // Karatsuba post-processing + eor $Xm.16b, $Xm.16b, $Xh.16b + eor $Xm.16b, $Xm.16b, $t0.16b // Xm overlaps Xh.lo and Xl.hi + ins $Xl.d[1], $Xm.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. + // ins $Xh.d[0], $Xm.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl $t1.2d, $Xl.2d, #57 // 1st phase + shl $t2.2d, $Xl.2d, #62 + eor $t2.16b, $t2.16b, $t1.16b // + shl $t1.2d, $Xl.2d, #63 + eor $t2.16b, $t2.16b, $t1.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor $t2.16b, $t2.16b, $Xm.16b + ins $Xl.d[1], $t2.d[0] // Xl.d[1] ^= t2.d[0] + ins $Xh.d[0], $t2.d[1] // Xh.d[0] ^= t2.d[1] + + ushr $t2.2d, $Xl.2d, #1 // 2nd phase + eor $Xh.16b, $Xh.16b,$Xl.16b + eor $Xl.16b, $Xl.16b,$t2.16b // + ushr $t2.2d, $t2.2d, #6 + ushr $Xl.2d, $Xl.2d, #1 // + eor $Xl.16b, $Xl.16b, $Xh.16b // + eor $Xl.16b, $Xl.16b, $t2.16b // + + subs $len, $len, #16 + bne .Loop_neon + + rev64 $Xl.16b, $Xl.16b // byteswap Xi and write + ext $Xl.16b, $Xl.16b, $Xl.16b, #8 + st1 {$Xl.16b}, [$Xi] + + ret +.size gcm_ghash_neon,.-gcm_ghash_neon + +.section .rodata +.align 4 +.Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.asciz "GHASH for ARMv8, derived from ARMv4 version by <appro\@openssl.org>" +.align 2 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + print $_,"\n"; +} +close STDOUT; # enforce flush diff --git a/src/crypto/fipsmodule/modes/internal.h b/src/crypto/fipsmodule/modes/internal.h index 9a081ebd..dec1e56c 100644 --- a/src/crypto/fipsmodule/modes/internal.h +++ b/src/crypto/fipsmodule/modes/internal.h @@ -327,28 +327,12 @@ void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]); void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, size_t len); -#if defined(OPENSSL_ARM) -// 32-bit ARM also has support for doing GCM with NEON instructions. OPENSSL_INLINE int gcm_neon_capable(void) { return CRYPTO_is_NEON_capable(); } void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]); void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]); void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, size_t len); -#else -// AArch64 only has the ARMv8 versions of functions. 
-OPENSSL_INLINE int gcm_neon_capable(void) { return 0; } -OPENSSL_INLINE void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]) { - abort(); -} -OPENSSL_INLINE void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]) { - abort(); -} -OPENSSL_INLINE void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], - const uint8_t *inp, size_t len) { - abort(); -} -#endif // OPENSSL_ARM #elif defined(OPENSSL_PPC64LE) #define GHASH_ASM_PPC64LE diff --git a/src/crypto/stack/stack.c b/src/crypto/stack/stack.c index ec557c02..599bd7b1 100644 --- a/src/crypto/stack/stack.c +++ b/src/crypto/stack/stack.c @@ -56,6 +56,7 @@ #include <openssl/stack.h> +#include <assert.h> #include <string.h> #include <openssl/mem.h> @@ -272,36 +273,39 @@ int sk_find(const _STACK *sk, size_t *out_index, const void *p, return 0; } - // sk->comp is a function that takes pointers to pointers to elements, but - // qsort and bsearch take a comparison function that just takes pointers to - // elements. However, since we're passing an array of pointers to - // qsort/bsearch, we can just cast the comparison function and everything - // works. + // The stack is sorted, so binary search to find the element. // - // TODO(davidben): This is undefined behavior, but the call is in libc so, - // e.g., CFI does not notice. Unfortunately, |bsearch| is missing a void* - // parameter in its callback and |bsearch_s| is a mess of incompatibility. - const void *const *r = bsearch(&p, sk->data, sk->num, sizeof(void *), - (int (*)(const void *, const void *))sk->comp); - if (r == NULL) { - return 0; - } - size_t idx = ((void **)r) - sk->data; - // This function always returns the first result. Note this logic is, in the - // worst case, O(N) rather than O(log(N)). If this ever becomes a problem, - // restore https://boringssl-review.googlesource.com/c/boringssl/+/32115/ - // which integrates the preference into the binary search. - while (idx > 0) { - const void *elem = sk->data[idx - 1]; - if (call_cmp_func(sk->comp, &p, &elem) != 0) { - break; + // |lo| and |hi| maintain a half-open interval of where the answer may be. All + // indices such that |lo <= idx < hi| are candidates. + size_t lo = 0, hi = sk->num; + while (lo < hi) { + // Bias |mid| towards |lo|. See the |r == 0| case below. + size_t mid = lo + (hi - lo - 1) / 2; + assert(lo <= mid && mid < hi); + const void *elem = sk->data[mid]; + int r = call_cmp_func(sk->comp, &p, &elem); + if (r > 0) { + lo = mid + 1; // |mid| is too low. + } else if (r < 0) { + hi = mid; // |mid| is too high. + } else { + // |mid| matches. However, this function returns the earliest match, so we + // can only return if the range has size one. + if (hi - lo == 1) { + if (out_index != NULL) { + *out_index = mid; + } + return 1; + } + // The sample is biased towards |lo|. |mid| can only be |hi - 1| if + // |hi - lo| was one, so this makes forward progress. + assert(mid + 1 < hi); + hi = mid + 1; } - idx--; - } - if (out_index) { - *out_index = idx; } - return 1; + + assert(lo == hi); + return 0; // Not found. } void *sk_shift(_STACK *sk) { @@ -362,7 +366,10 @@ void sk_sort(_STACK *sk) { return; } - // See the comment in sk_find about this cast. + // sk->comp is a function that takes pointers to pointers to elements, but + // qsort take a comparison function that just takes pointers to elements. + // However, since we're passing an array of pointers to qsort, we can just + // cast the comparison function and everything works. 
// // TODO(davidben): This is undefined behavior, but the call is in libc so, // e.g., CFI does not notice. Unfortunately, |qsort| is missing a void* diff --git a/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm b/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm index bb5e4c09..5fa4053e 100644 --- a/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm +++ b/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm @@ -12,9 +12,6 @@ default rel section .text code align=64 -EXTERN aes_nohw_encrypt -EXTERN aes_nohw_decrypt - ALIGN 64 _bsaes_encrypt8: @@ -1080,17 +1077,13 @@ DB 102,15,56,0,244 DB 0F3h,0C3h ;repret -EXTERN aes_nohw_cbc_encrypt global bsaes_cbc_encrypt ALIGN 16 bsaes_cbc_encrypt: - mov r11d,DWORD[48+rsp] - cmp r11d,0 - jne NEAR aes_nohw_cbc_encrypt - cmp r8,128 - jb NEAR aes_nohw_cbc_encrypt + + mov rax,rsp $L$cbc_dec_prologue: @@ -1146,6 +1139,8 @@ $L$cbc_dec_body: movdqu xmm14,XMMWORD[rbx] sub r14,8 + jc NEAR $L$cbc_dec_loop_done + $L$cbc_dec_loop: movdqu xmm15,XMMWORD[r12] movdqu xmm0,XMMWORD[16+r12] @@ -1190,6 +1185,7 @@ $L$cbc_dec_loop: sub r14,8 jnc NEAR $L$cbc_dec_loop +$L$cbc_dec_loop_done: add r14,8 jz NEAR $L$cbc_dec_done @@ -1322,13 +1318,12 @@ $L$cbc_dec_two: jmp NEAR $L$cbc_dec_done ALIGN 16 $L$cbc_dec_one: - lea rcx,[r12] - lea rdx,[32+rbp] - lea r8,[r15] - call aes_nohw_decrypt - pxor xmm14,XMMWORD[32+rbp] - movdqu XMMWORD[r13],xmm14 - movdqa xmm14,xmm15 + movdqa XMMWORD[32+rbp],xmm14 + call _bsaes_decrypt8 + pxor xmm15,XMMWORD[32+rbp] + movdqu xmm14,XMMWORD[r12] + movdqu XMMWORD[r13],xmm15 + jmp NEAR $L$cbc_dec_done $L$cbc_dec_done: movdqu XMMWORD[rbx],xmm14 @@ -1423,8 +1418,8 @@ $L$ctr_enc_body: mov r14,r8 mov r15,r9 movdqa XMMWORD[32+rbp],xmm0 - cmp r8,8 - jb NEAR $L$ctr_enc_short + + mov ebx,eax shl rax,7 @@ -1558,26 +1553,8 @@ $L$ctr_enc_loop_done: movdqu xmm13,XMMWORD[96+r12] pxor xmm1,xmm13 movdqu XMMWORD[96+r13],xmm1 - jmp NEAR $L$ctr_enc_done -ALIGN 16 -$L$ctr_enc_short: - lea rcx,[32+rbp] - lea rdx,[48+rbp] - lea r8,[r15] - call aes_nohw_encrypt - movdqu xmm0,XMMWORD[r12] - lea r12,[16+r12] - mov eax,DWORD[44+rbp] - bswap eax - pxor xmm0,XMMWORD[48+rbp] - inc eax - movdqu XMMWORD[r13],xmm0 - bswap eax - lea r13,[16+r13] - mov DWORD[44+rsp],eax - dec r14 - jnz NEAR $L$ctr_enc_short + $L$ctr_enc_done: lea rax,[rsp] |
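A note on the new ghash-neon-armv8 code above: the .Lgmult_neon path assembles a 128x128-bit carry-less product out of three 64x64-bit products using Karatsuba — H.lo*Xi.lo (Xl), H.hi*Xi.hi (Xh) and (H.lo^H.hi)*(Xi.lo^Xi.hi) (Xm) — followed by the "Karatsuba post-processing" XORs. The assembly builds each 64x64 multiply from 8x8-bit pmull lanes plus the k48/k32/k16/k0 masking; the C sketch below replaces that with a schoolbook clmul64() and shows only the multiply-and-combine algebra, not the twisted-H encoding or the final reduction. Helper names are made up for the example.

    #include <stdint.h>

    // Carry-less 64x64 -> 128-bit multiply (schoolbook; stands in for the
    // pmull/mask/shift sequence in the assembly).
    static void clmul64(uint64_t a, uint64_t b, uint64_t out[2]) {
      uint64_t lo = 0, hi = 0;
      for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
          lo ^= a << i;
          if (i != 0) {
            hi ^= a >> (64 - i);
          }
        }
      }
      out[0] = lo;  // low 64 bits of the product
      out[1] = hi;  // high 64 bits of the product
    }

    // Karatsuba: a 128x128 carry-less multiply from three 64x64 multiplies,
    // mirroring the Xl, Xh and Xm products and post-processing above.
    static void clmul128(const uint64_t a[2], const uint64_t b[2],
                         uint64_t out[4]) {
      uint64_t lo[2], hi[2], mid[2];
      clmul64(a[0], b[0], lo);                 // Xl = a.lo * b.lo
      clmul64(a[1], b[1], hi);                 // Xh = a.hi * b.hi
      clmul64(a[0] ^ a[1], b[0] ^ b[1], mid);  // Xm = (a.lo^a.hi)*(b.lo^b.hi)
      mid[0] ^= lo[0] ^ hi[0];                 // Karatsuba post-processing
      mid[1] ^= lo[1] ^ hi[1];
      out[0] = lo[0];
      out[1] = lo[1] ^ mid[0];
      out[2] = hi[0] ^ mid[1];
      out[3] = hi[1];
    }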
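The modes/internal.h hunk above removes the AArch64 abort() stubs because ghash-neon-armv8.S is now generated for AArch64 as well, so the gcm_*_neon routines are available on both 32-bit and 64-bit ARM. A minimal sketch of the selection order this enables follows; it assumes the gcm_pmull_capable() helper and gcm_init_v8() declaration that accompany these functions in modes/internal.h, and the function-pointer shapes below. It paraphrases the dispatch rather than quoting gcm.c.

    // Illustrative only; not code from gcm.c. Assumes the declarations and
    // types in modes/internal.h.
    typedef void (*gmult_func)(uint64_t Xi[2], const u128 Htable[16]);
    typedef void (*ghash_func)(uint64_t Xi[2], const u128 Htable[16],
                               const uint8_t *inp, size_t len);

    static void pick_arm_ghash(gmult_func *out_mult, ghash_func *out_hash,
                               u128 Htable[16], const uint64_t H[2]) {
      if (gcm_pmull_capable()) {
        // ARMv8 PMULL path (gcm_*_v8), unchanged by this sync.
        gcm_init_v8(Htable, H);
        *out_mult = gcm_gmult_v8;
        *out_hash = gcm_ghash_v8;
      } else if (gcm_neon_capable()) {
        // Plain-NEON path: previously 32-bit ARM only, now also AArch64
        // instead of the removed abort() stubs.
        gcm_init_neon(Htable, H);
        *out_mult = gcm_gmult_neon;
        *out_hash = gcm_ghash_neon;
      }
    }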
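The rewritten sk_find above replaces the bsearch call (and its potentially O(N) walk back through duplicates) with a single binary search whose midpoint is biased towards |lo|, so it converges directly on the earliest matching index. Below is a standalone illustration of the same idea over a sorted int array; find_first and its exact behaviour are an example only, not BoringSSL API.

    #include <stddef.h>

    // Return the first index of |key| in sorted |a| of length |n|, or -1 if
    // absent. Same shape as the sk_find loop above: the midpoint is biased
    // towards |lo|, so shrinking the range to |hi = mid + 1| still makes
    // progress and the loop ends on the earliest match.
    static int find_first(const int *a, size_t n, int key) {
      size_t lo = 0, hi = n;
      while (lo < hi) {
        size_t mid = lo + (hi - lo - 1) / 2;  // biased towards |lo|
        if (a[mid] < key) {
          lo = mid + 1;  // |mid| is too low.
        } else if (a[mid] > key) {
          hi = mid;  // |mid| is too high.
        } else if (hi - lo == 1) {
          return (int)lo;  // Range of size one: earliest match.
        } else {
          hi = mid + 1;  // Keep the match, drop everything above it.
        }
      }
      return -1;  // Not found.
    }

On {1, 2, 2, 2, 3} with key 2 this returns index 1, matching sk_find's first-result contract, and the candidate range roughly halves every iteration, so the worst case stays O(log N) even with many duplicate elements.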
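On the sk_sort comment above: the cast reinterprets a comparator that takes pointers to element pointers as the pointer-to-element comparator qsort expects, which the TODO notes is formally undefined behavior even though the callee receives the same addresses either way. When the comparator is known at compile time, the standards-clean alternative is a thin wrapper with qsort's exact signature, as in the sketch below; sk_sort cannot do this generically because sk->comp is only known at run time and portable qsort has no context parameter. The names here are illustrative, not BoringSSL's.

    #include <stdlib.h>
    #include <string.h>

    // Comparator in the stack's style: it receives pointers to the array
    // slots, i.e. pointers to the element pointers.
    static int cmp_str_ptrs(const char *const *a, const char *const *b) {
      return strcmp(*a, *b);
    }

    // qsort-compatible wrapper with the standard signature, forwarding to the
    // typed comparator instead of casting the function pointer.
    static int cmp_str_void(const void *a, const void *b) {
      return cmp_str_ptrs(a, b);
    }

    int main(void) {
      const char *names[] = {"ghash", "aes", "stack"};
      qsort(names, sizeof(names) / sizeof(names[0]), sizeof(names[0]),
            cmp_str_void);
      return 0;
    }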