external/boringssl: Sync to 45210dd4e21ace9d28cb76b3f83303fcdd2efcce.

This includes the following changes:

https://boringssl.googlesource.com/boringssl/+log/a62dbf88d8a3c04446db833a1eb80a620cb1514d..45210dd4e21ace9d28cb76b3f83303fcdd2efcce

Test: BoringSSL CTS Presubmits.
Change-Id: I2f3cc22fb906078f64bc2af020fa14c3d0875c81
author: Robert Sloan <varomodt@google.com> 2018-02-05 09:07:34 -0800
committer: Robert Sloan <varomodt@google.com> 2018-02-05 09:07:39 -0800
commit: 8542c08a00c332af2ebca2a0c64b8d4d5fbd4cd2 (patch)
tree: 65345a0acda3104c65b39662f207fbc9239e9ad5 /src/crypto
parent: 309a31e32558286a3b92c754bd3051b962527c25 (diff)
download: boringssl-8542c08a00c332af2ebca2a0c64b8d4d5fbd4cd2.tar.gz
22 files changed, 689 insertions, 2510 deletions
diff --git a/src/crypto/curve25519/CMakeLists.txt b/src/crypto/curve25519/CMakeLists.txt
index 6f51d54f..4894fa8e 100644
--- a/src/crypto/curve25519/CMakeLists.txt
+++ b/src/crypto/curve25519/CMakeLists.txt
@@ -8,21 +8,12 @@ if (${ARCH} STREQUAL "arm")
   )
 endif()
 
-if (${ARCH} STREQUAL "x86_64")
-  set(
-    CURVE25519_ARCH_SOURCES
-
-    asm/x25519-asm-x86_64.S
-  )
-endif()
-
 add_library(
   curve25519
 
   OBJECT
 
   spake25519.c
-  x25519-x86_64.c
 
   ${CURVE25519_ARCH_SOURCES}
 )
diff --git a/src/crypto/curve25519/asm/x25519-asm-x86_64.S b/src/crypto/curve25519/asm/x25519-asm-x86_64.S
deleted file mode 100644
index 6cff53ee..00000000
--- a/src/crypto/curve25519/asm/x25519-asm-x86_64.S
+++ /dev/null
@@ -1,1894 +0,0 @@
-/* Copyright (c) 2015, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-/* This file is adapted from crypto_scalarmult/curve25519/amd64-51/ in
- * SUPERCOP 20141124 (http://bench.cr.yp.to/supercop.html). That code is public
- * domain licensed but the standard ISC license is included above to keep
- * licensing simple. */
-
-#if !defined(OPENSSL_NO_ASM)
-#if defined(__x86_64__)
-
-.data
-.p2align 4
-
-#if defined(__APPLE__)
-/* OS X's C ABI prefixes functions with underscore. */
-#define C_ABI(x) _ ## x
-#define HIDDEN .private_extern
-#else
-#define C_ABI(x) x
-#define HIDDEN .hidden
-#endif
-
-x25519_x86_64_REDMASK51:   .quad 0x0007FFFFFFFFFFFF
-x25519_x86_64_121666_213:  .quad 996687872
-x25519_x86_64_2P0:         .quad 0xFFFFFFFFFFFDA
-x25519_x86_64_2P1234:      .quad 0xFFFFFFFFFFFFE
-x25519_x86_64_4P0:         .quad 0x1FFFFFFFFFFFB4
-x25519_x86_64_4P1234:      .quad 0x1FFFFFFFFFFFFC
-x25519_x86_64_MU0:         .quad 0xED9CE5A30A2C131B
-x25519_x86_64_MU1:         .quad 0x2106215D086329A7
-x25519_x86_64_MU2:         .quad 0xFFFFFFFFFFFFFFEB
-x25519_x86_64_MU3:         .quad 0xFFFFFFFFFFFFFFFF
-x25519_x86_64_MU4:         .quad 0x000000000000000F
-x25519_x86_64_ORDER0:      .quad 0x5812631A5CF5D3ED
-x25519_x86_64_ORDER1:      .quad 0x14DEF9DEA2F79CD6
-x25519_x86_64_ORDER2:      .quad 0x0000000000000000
-x25519_x86_64_ORDER3:      .quad 0x1000000000000000
-x25519_x86_64_EC2D0:       .quad 1859910466990425
-x25519_x86_64_EC2D1:       .quad 932731440258426
-x25519_x86_64_EC2D2:       .quad 1072319116312658
-x25519_x86_64_EC2D3:       .quad 1815898335770999
-x25519_x86_64_EC2D4:       .quad 633789495995903
-x25519_x86_64__38:         .quad 38
-
-.text
-.p2align 5
-
-.globl C_ABI(x25519_x86_64_freeze)
-HIDDEN C_ABI(x25519_x86_64_freeze)
-C_ABI(x25519_x86_64_freeze):
-.cfi_startproc
-/* This is a leaf function and uses the redzone for saving registers. */
-movq %r12,-8(%rsp)
-.cfi_rel_offset r12, -8
-movq   0(%rdi),%rsi
-movq   8(%rdi),%rdx
-movq   16(%rdi),%rcx
-movq   24(%rdi),%r8
-movq   32(%rdi),%r9
-movq x25519_x86_64_REDMASK51(%rip),%rax
-mov  %rax,%r10
-sub  $18,%r10
-mov  $3,%r11
-._reduceloop:
-mov  %rsi,%r12
-shr  $51,%r12
-and  %rax,%rsi
-add  %r12,%rdx
-mov  %rdx,%r12
-shr  $51,%r12
-and  %rax,%rdx
-add  %r12,%rcx
-mov  %rcx,%r12
-shr  $51,%r12
-and  %rax,%rcx
-add  %r12,%r8
-mov  %r8,%r12
-shr  $51,%r12
-and  %rax,%r8
-add  %r12,%r9
-mov  %r9,%r12
-shr  $51,%r12
-and  %rax,%r9
-imulq  $19,%r12,%r12
-add  %r12,%rsi
-sub  $1,%r11
-ja ._reduceloop
-mov  $1,%r12
-cmp  %r10,%rsi
-cmovl %r11,%r12
-cmp  %rax,%rdx
-cmovne %r11,%r12
-cmp  %rax,%rcx
-cmovne %r11,%r12
-cmp  %rax,%r8
-cmovne %r11,%r12
-cmp  %rax,%r9
-cmovne %r11,%r12
-neg  %r12
-and  %r12,%rax
-and  %r12,%r10
-sub  %r10,%rsi
-sub  %rax,%rdx
-sub  %rax,%rcx
-sub  %rax,%r8
-sub  %rax,%r9
-movq   %rsi,0(%rdi)
-movq   %rdx,8(%rdi)
-movq   %rcx,16(%rdi)
-movq   %r8,24(%rdi)
-movq   %r9,32(%rdi)
-movq -8(%rsp),%r12
-ret
-.cfi_endproc
-
-.p2align 5
-.globl C_ABI(x25519_x86_64_mul)
-HIDDEN C_ABI(x25519_x86_64_mul)
-C_ABI(x25519_x86_64_mul):
-.cfi_startproc
-/* This is a leaf function and uses the redzone for saving registers. */
-movq %r12,-8(%rsp)
-.cfi_rel_offset r12, -8
-movq %r13,-16(%rsp)
-.cfi_rel_offset r13, -16
-movq %r14,-24(%rsp)
-.cfi_rel_offset r14, -24
-movq %r15,-32(%rsp)
-.cfi_rel_offset r15, -32
-movq %rbx,-40(%rsp)
-.cfi_rel_offset rbx, -40
-movq %rbp,-48(%rsp)
-.cfi_rel_offset rbp, -48
-mov  %rdx,%rcx
-movq   24(%rsi),%rdx
-imulq  $19,%rdx,%rax
-movq %rax,-64(%rsp)
-mulq  16(%rcx)
-mov  %rax,%r8
-mov  %rdx,%r9
-movq   32(%rsi),%rdx
-imulq  $19,%rdx,%rax
-movq %rax,-72(%rsp)
-mulq  8(%rcx)
-add  %rax,%r8
-adc %rdx,%r9
-movq   0(%rsi),%rax
-mulq  0(%rcx)
-add  %rax,%r8
-adc %rdx,%r9
-movq   0(%rsi),%rax
-mulq  8(%rcx)
-mov  %rax,%r10
-mov  %rdx,%r11
-movq   0(%rsi),%rax
-mulq  16(%rcx)
-mov  %rax,%r12
-mov  %rdx,%r13
-movq   0(%rsi),%rax
-mulq  24(%rcx)
-mov  %rax,%r14
-mov  %rdx,%r15
-movq   0(%rsi),%rax
-mulq  32(%rcx)
-mov  %rax,%rbx
-mov  %rdx,%rbp
-movq   8(%rsi),%rax
-mulq  0(%rcx)
-add  %rax,%r10
-adc %rdx,%r11
-movq   8(%rsi),%rax
-mulq  8(%rcx)
-add  %rax,%r12
-adc %rdx,%r13
-movq   8(%rsi),%rax
-mulq  16(%rcx)
-add  %rax,%r14
-adc %rdx,%r15
-movq   8(%rsi),%rax
-mulq  24(%rcx)
-add  %rax,%rbx
-adc %rdx,%rbp
-movq   8(%rsi),%rdx
-imulq  $19,%rdx,%rax
-mulq  32(%rcx)
-add  %rax,%r8
-adc %rdx,%r9
-movq   16(%rsi),%rax
-mulq  0(%rcx)
-add  %rax,%r12
-adc %rdx,%r13
-movq   16(%rsi),%rax
-mulq  8(%rcx)
-add  %rax,%r14
-adc %rdx,%r15
-movq   16(%rsi),%rax
-mulq  16(%rcx)
-add  %rax,%rbx
-adc %rdx,%rbp
-movq   16(%rsi),%rdx
-imulq  $19,%rdx,%rax
-mulq  24(%rcx)
-add  %rax,%r8
-adc %rdx,%r9
-movq   16(%rsi),%rdx
-imulq  $19,%rdx,%rax
-mulq  32(%rcx)
-add  %rax,%r10
-adc %rdx,%r11
-movq   24(%rsi),%rax
-mulq  0(%rcx)
-add  %rax,%r14
-adc %rdx,%r15
-movq   24(%rsi),%rax
-mulq  8(%rcx)
-add  %rax,%rbx
-adc %rdx,%rbp
-movq -64(%rsp),%rax
-mulq  24(%rcx)
-add  %rax,%r10
-adc %rdx,%r11
-movq -64(%rsp),%rax
-mulq  32(%rcx)
-add  %rax,%r12
-adc %rdx,%r13
-movq   32(%rsi),%rax
-mulq  0(%rcx)
-add  %rax,%rbx
-adc %rdx,%rbp
-movq -72(%rsp),%rax
-mulq  16(%rcx)
-add  %rax,%r10
-adc %rdx,%r11
-movq -72(%rsp),%rax
-mulq  24(%rcx)
-add  %rax,%r12
-adc %rdx,%r13
-movq -72(%rsp),%rax
-mulq  32(%rcx)
-add  %rax,%r14
-adc %rdx,%r15
-movq x25519_x86_64_REDMASK51(%rip),%rsi
-shld $13,%r8,%r9
-and  %rsi,%r8
-shld $13,%r10,%r11
-and  %rsi,%r10
-add  %r9,%r10
-shld $13,%r12,%r13
-and  %rsi,%r12
-add  %r11,%r12
-shld $13,%r14,%r15
-and  %rsi,%r14
-add  %r13,%r14
-shld $13,%rbx,%rbp
-and  %rsi,%rbx
-add  %r15,%rbx
-imulq  $19,%rbp,%rdx
-add  %rdx,%r8
-mov  %r8,%rdx
-shr  $51,%rdx
-add  %r10,%rdx
-mov  %rdx,%rcx
-shr  $51,%rdx
-and  %rsi,%r8
-add  %r12,%rdx
-mov  %rdx,%r9
-shr  $51,%rdx
-and  %rsi,%rcx
-add  %r14,%rdx
-mov  %rdx,%rax
-shr  $51,%rdx
-and  %rsi,%r9
-add  %rbx,%rdx
-mov  %rdx,%r10
-shr  $51,%rdx
-and  %rsi,%rax
-imulq  $19,%rdx,%rdx
-add  %rdx,%r8
-and  %rsi,%r10
-movq   %r8,0(%rdi)
-movq   %rcx,8(%rdi)
-movq   %r9,16(%rdi)
-movq   %rax,24(%rdi)
-movq   %r10,32(%rdi)
-movq -8(%rsp),%r12
-movq -16(%rsp),%r13
-movq -24(%rsp),%r14
-movq -32(%rsp),%r15
-movq -40(%rsp),%rbx
-movq -48(%rsp),%rbp
-ret
-.cfi_endproc
-
-.p2align 5
-.globl C_ABI(x25519_x86_64_square)
-HIDDEN C_ABI(x25519_x86_64_square)
-C_ABI(x25519_x86_64_square):
-.cfi_startproc
-/* This is a leaf function and uses the redzone for saving registers. */
-movq %r12,-8(%rsp)
-.cfi_rel_offset r12, -8
-movq %r13,-16(%rsp)
-.cfi_rel_offset r13, -16
-movq %r14,-24(%rsp)
-.cfi_rel_offset r14, -24
-movq %r15,-32(%rsp)
-.cfi_rel_offset r15, -32
-movq %rbx,-40(%rsp)
-.cfi_rel_offset rbx, -40
-movq   0(%rsi),%rax
-mulq  0(%rsi)
-mov  %rax,%rcx
-mov  %rdx,%r8
-movq   0(%rsi),%rax
-shl  $1,%rax
-mulq  8(%rsi)
-mov  %rax,%r9
-mov  %rdx,%r10
-movq   0(%rsi),%rax
-shl  $1,%rax
-mulq  16(%rsi)
-mov  %rax,%r11
-mov  %rdx,%r12
-movq   0(%rsi),%rax
-shl  $1,%rax
-mulq  24(%rsi)
-mov  %rax,%r13
-mov  %rdx,%r14
-movq   0(%rsi),%rax
-shl  $1,%rax
-mulq  32(%rsi)
-mov  %rax,%r15
-mov  %rdx,%rbx
-movq   8(%rsi),%rax
-mulq  8(%rsi)
-add  %rax,%r11
-adc %rdx,%r12
-movq   8(%rsi),%rax
-shl  $1,%rax
-mulq  16(%rsi)
-add  %rax,%r13
-adc %rdx,%r14
-movq   8(%rsi),%rax
-shl  $1,%rax
-mulq  24(%rsi)
-add  %rax,%r15
-adc %rdx,%rbx
-movq   8(%rsi),%rdx
-imulq  $38,%rdx,%rax
-mulq  32(%rsi)
-add  %rax,%rcx
-adc %rdx,%r8
-movq   16(%rsi),%rax
-mulq  16(%rsi)
-add  %rax,%r15
-adc %rdx,%rbx
-movq   16(%rsi),%rdx
-imulq  $38,%rdx,%rax
-mulq  24(%rsi)
-add  %rax,%rcx
-adc %rdx,%r8
-movq   16(%rsi),%rdx
-imulq  $38,%rdx,%rax
-mulq  32(%rsi)
-add  %rax,%r9
-adc %rdx,%r10
-movq   24(%rsi),%rdx
-imulq  $19,%rdx,%rax
-mulq  24(%rsi)
-add  %rax,%r9
-adc %rdx,%r10
-movq   24(%rsi),%rdx
-imulq  $38,%rdx,%rax
-mulq  32(%rsi)
-add  %rax,%r11
-adc %rdx,%r12
-movq   32(%rsi),%rdx
-imulq  $19,%rdx,%rax
-mulq  32(%rsi)
-add  %rax,%r13
-adc %rdx,%r14
-movq x25519_x86_64_REDMASK51(%rip),%rsi
-shld $13,%rcx,%r8
-and  %rsi,%rcx
-shld $13,%r9,%r10
-and  %rsi,%r9
-add  %r8,%r9
-shld $13,%r11,%r12
-and  %rsi,%r11
-add  %r10,%r11
-shld $13,%r13,%r14
-and  %rsi,%r13
-add  %r12,%r13
-shld $13,%r15,%rbx
-and  %rsi,%r15
-add  %r14,%r15
-imulq  $19,%rbx,%rdx
-add  %rdx,%rcx
-mov  %rcx,%rdx
-shr  $51,%rdx
-add  %r9,%rdx
-and  %rsi,%rcx
-mov  %rdx,%r8
-shr  $51,%rdx
-add  %r11,%rdx
-and  %rsi,%r8
-mov  %rdx,%r9
-shr  $51,%rdx
-add  %r13,%rdx
-and  %rsi,%r9
-mov  %rdx,%rax
-shr  $51,%rdx
-add  %r15,%rdx
-and  %rsi,%rax
-mov  %rdx,%r10
-shr  $51,%rdx
-imulq  $19,%rdx,%rdx
-add  %rdx,%rcx
-and  %rsi,%r10
-movq   %rcx,0(%rdi)
-movq   %r8,8(%rdi)
-movq   %r9,16(%rdi)
-movq   %rax,24(%rdi)
-movq   %r10,32(%rdi)
-movq -8(%rsp),%r12
-movq -16(%rsp),%r13
-movq -24(%rsp),%r14
-movq -32(%rsp),%r15
-movq -40(%rsp),%rbx
-ret
-.cfi_endproc
-
-.p2align 5
-.globl C_ABI(x25519_x86_64_ladderstep)
-HIDDEN C_ABI(x25519_x86_64_ladderstep)
-C_ABI(x25519_x86_64_ladderstep):
-.cfi_startproc
-sub $344,%rsp
-.cfi_adjust_cfa_offset 344
-movq %r12,296(%rsp)
-.cfi_rel_offset r12, 296
-movq %r13,304(%rsp)
-.cfi_rel_offset r13, 304
-movq %r14,312(%rsp)
-.cfi_rel_offset r14, 312
-movq %r15,320(%rsp)
-.cfi_rel_offset r15, 320
-movq %rbx,328(%rsp)
-.cfi_rel_offset rbx, 328
-movq %rbp,336(%rsp)
-.cfi_rel_offset rbp, 336
-movq   40(%rdi),%rsi
-movq   48(%rdi),%rdx
-movq   56(%rdi),%rcx
-movq   64(%rdi),%r8
-movq   72(%rdi),%r9
-mov  %rsi,%rax
-mov  %rdx,%r10
-mov  %rcx,%r11
-mov  %r8,%r12
-mov  %r9,%r13
-add  x25519_x86_64_2P0(%rip),%rax
-add  x25519_x86_64_2P1234(%rip),%r10
-add  x25519_x86_64_2P1234(%rip),%r11
-add  x25519_x86_64_2P1234(%rip),%r12
-add  x25519_x86_64_2P1234(%rip),%r13
-addq 80(%rdi),%rsi
-addq 88(%rdi),%rdx
-addq 96(%rdi),%rcx
-addq 104(%rdi),%r8
-addq 112(%rdi),%r9
-subq 80(%rdi),%rax
-subq 88(%rdi),%r10
-subq 96(%rdi),%r11
-subq 104(%rdi),%r12
-subq 112(%rdi),%r13
-movq %rsi,0(%rsp)
-movq %rdx,8(%rsp)
-movq %rcx,16(%rsp)
-movq %r8,24(%rsp)
-movq %r9,32(%rsp)
-movq %rax,40(%rsp)
-movq %r10,48(%rsp)
-movq %r11,56(%rsp)
-movq %r12,64(%rsp)
-movq %r13,72(%rsp)
-movq 40(%rsp),%rax
-mulq  40(%rsp)
-mov  %rax,%rsi
-mov  %rdx,%rcx
-movq 40(%rsp),%rax
-shl  $1,%rax
-mulq  48(%rsp)
-mov  %rax,%r8
-mov  %rdx,%r9
-movq 40(%rsp),%rax
-shl  $1,%rax
-mulq  56(%rsp)
-mov  %rax,%r10
-mov  %rdx,%r11
-movq 40(%rsp),%rax
-shl  $1,%rax
-mulq  64(%rsp)
-mov  %rax,%r12
-mov  %rdx,%r13
-movq 40(%rsp),%rax
-shl  $1,%rax
-mulq  72(%rsp)
-mov  %rax,%r14
-mov  %rdx,%r15
-movq 48(%rsp),%rax
-mulq  48(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 48(%rsp),%rax
-shl  $1,%rax
-mulq  56(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq 48(%rsp),%rax
-shl  $1,%rax
-mulq  64(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 48(%rsp),%rdx
-imulq  $38,%rdx,%rax
-mulq  72(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 56(%rsp),%rax
-mulq  56(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 56(%rsp),%rdx
-imulq  $38,%rdx,%rax
-mulq  64(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 56(%rsp),%rdx
-imulq  $38,%rdx,%rax
-mulq  72(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 64(%rsp),%rdx
-imulq  $19,%rdx,%rax
-mulq  64(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 64(%rsp),%rdx
-imulq  $38,%rdx,%rax
-mulq  72(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 72(%rsp),%rdx
-imulq  $19,%rdx,%rax
-mulq  72(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq x25519_x86_64_REDMASK51(%rip),%rdx
-shld $13,%rsi,%rcx
-and  %rdx,%rsi
-shld $13,%r8,%r9
-and  %rdx,%r8
-add  %rcx,%r8
-shld $13,%r10,%r11
-and  %rdx,%r10
-add  %r9,%r10
-shld $13,%r12,%r13
-and  %rdx,%r12
-add  %r11,%r12
-shld $13,%r14,%r15
-and  %rdx,%r14
-add  %r13,%r14
-imulq  $19,%r15,%rcx
-add  %rcx,%rsi
-mov  %rsi,%rcx
-shr  $51,%rcx
-add  %r8,%rcx
-and  %rdx,%rsi
-mov  %rcx,%r8
-shr  $51,%rcx
-add  %r10,%rcx
-and  %rdx,%r8
-mov  %rcx,%r9
-shr  $51,%rcx
-add  %r12,%rcx
-and  %rdx,%r9
-mov  %rcx,%rax
-shr  $51,%rcx
-add  %r14,%rcx
-and  %rdx,%rax
-mov  %rcx,%r10
-shr  $51,%rcx
-imulq  $19,%rcx,%rcx
-add  %rcx,%rsi
-and  %rdx,%r10
-movq %rsi,80(%rsp)
-movq %r8,88(%rsp)
-movq %r9,96(%rsp)
-movq %rax,104(%rsp)
-movq %r10,112(%rsp)
-movq 0(%rsp),%rax
-mulq  0(%rsp)
-mov  %rax,%rsi
-mov  %rdx,%rcx
-movq 0(%rsp),%rax
-shl  $1,%rax
-mulq  8(%rsp)
-mov  %rax,%r8
-mov  %rdx,%r9
-movq 0(%rsp),%rax
-shl  $1,%rax
-mulq  16(%rsp)
-mov  %rax,%r10
-mov  %rdx,%r11
-movq 0(%rsp),%rax
-shl  $1,%rax
-mulq  24(%rsp)
-mov  %rax,%r12
-mov  %rdx,%r13
-movq 0(%rsp),%rax
-shl  $1,%rax
-mulq  32(%rsp)
-mov  %rax,%r14
-mov  %rdx,%r15
-movq 8(%rsp),%rax
-mulq  8(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 8(%rsp),%rax
-shl  $1,%rax
-mulq  16(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq 8(%rsp),%rax
-shl  $1,%rax
-mulq  24(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 8(%rsp),%rdx
-imulq  $38,%rdx,%rax
-mulq  32(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 16(%rsp),%rax
-mulq  16(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 16(%rsp),%rdx
-imulq  $38,%rdx,%rax
-mulq  24(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 16(%rsp),%rdx
-imulq  $38,%rdx,%rax
-mulq  32(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 24(%rsp),%rdx
-imulq  $19,%rdx,%rax
-mulq  24(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 24(%rsp),%rdx
-imulq  $38,%rdx,%rax
-mulq  32(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 32(%rsp),%rdx
-imulq  $19,%rdx,%rax
-mulq  32(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq x25519_x86_64_REDMASK51(%rip),%rdx
-shld $13,%rsi,%rcx
-and  %rdx,%rsi
-shld $13,%r8,%r9
-and  %rdx,%r8
-add  %rcx,%r8
-shld $13,%r10,%r11
-and  %rdx,%r10
-add  %r9,%r10
-shld $13,%r12,%r13
-and  %rdx,%r12
-add  %r11,%r12
-shld $13,%r14,%r15
-and  %rdx,%r14
-add  %r13,%r14
-imulq  $19,%r15,%rcx
-add  %rcx,%rsi
-mov  %rsi,%rcx
-shr  $51,%rcx
-add  %r8,%rcx
-and  %rdx,%rsi
-mov  %rcx,%r8
-shr  $51,%rcx
-add  %r10,%rcx
-and  %rdx,%r8
-mov  %rcx,%r9
-shr  $51,%rcx
-add  %r12,%rcx
-and  %rdx,%r9
-mov  %rcx,%rax
-shr  $51,%rcx
-add  %r14,%rcx
-and  %rdx,%rax
-mov  %rcx,%r10
-shr  $51,%rcx
-imulq  $19,%rcx,%rcx
-add  %rcx,%rsi
-and  %rdx,%r10
-movq %rsi,120(%rsp)
-movq %r8,128(%rsp)
-movq %r9,136(%rsp)
-movq %rax,144(%rsp)
-movq %r10,152(%rsp)
-mov  %rsi,%rsi
-mov  %r8,%rdx
-mov  %r9,%rcx
-mov  %rax,%r8
-mov  %r10,%r9
-add  x25519_x86_64_2P0(%rip),%rsi
-add  x25519_x86_64_2P1234(%rip),%rdx
-add  x25519_x86_64_2P1234(%rip),%rcx
-add  x25519_x86_64_2P1234(%rip),%r8
-add  x25519_x86_64_2P1234(%rip),%r9
-subq 80(%rsp),%rsi
-subq 88(%rsp),%rdx
-subq 96(%rsp),%rcx
-subq 104(%rsp),%r8
-subq 112(%rsp),%r9
-movq %rsi,160(%rsp)
-movq %rdx,168(%rsp)
-movq %rcx,176(%rsp)
-movq %r8,184(%rsp)
-movq %r9,192(%rsp)
-movq   120(%rdi),%rsi
-movq   128(%rdi),%rdx
-movq   136(%rdi),%rcx
-movq   144(%rdi),%r8
-movq   152(%rdi),%r9
-mov  %rsi,%rax
-mov  %rdx,%r10
-mov  %rcx,%r11
-mov  %r8,%r12
-mov  %r9,%r13
-add  x25519_x86_64_2P0(%rip),%rax
-add  x25519_x86_64_2P1234(%rip),%r10
-add  x25519_x86_64_2P1234(%rip),%r11
-add  x25519_x86_64_2P1234(%rip),%r12
-add  x25519_x86_64_2P1234(%rip),%r13
-addq 160(%rdi),%rsi
-addq 168(%rdi),%rdx
-addq 176(%rdi),%rcx
-addq 184(%rdi),%r8
-addq 192(%rdi),%r9
-subq 160(%rdi),%rax
-subq 168(%rdi),%r10
-subq 176(%rdi),%r11
-subq 184(%rdi),%r12
-subq 192(%rdi),%r13
-movq %rsi,200(%rsp)
-movq %rdx,208(%rsp)
-movq %rcx,216(%rsp)
-movq %r8,224(%rsp)
-movq %r9,232(%rsp)
-movq %rax,240(%rsp)
-movq %r10,248(%rsp)
-movq %r11,256(%rsp)
-movq %r12,264(%rsp)
-movq %r13,272(%rsp)
-movq 224(%rsp),%rsi
-imulq  $19,%rsi,%rax
-movq %rax,280(%rsp)
-mulq  56(%rsp)
-mov  %rax,%rsi
-mov  %rdx,%rcx
-movq 232(%rsp),%rdx
-imulq  $19,%rdx,%rax
-movq %rax,288(%rsp)
-mulq  48(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 200(%rsp),%rax
-mulq  40(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 200(%rsp),%rax
-mulq  48(%rsp)
-mov  %rax,%r8
-mov  %rdx,%r9
-movq 200(%rsp),%rax
-mulq  56(%rsp)
-mov  %rax,%r10
-mov  %rdx,%r11
-movq 200(%rsp),%rax
-mulq  64(%rsp)
-mov  %rax,%r12
-mov  %rdx,%r13
-movq 200(%rsp),%rax
-mulq  72(%rsp)
-mov  %rax,%r14
-mov  %rdx,%r15
-movq 208(%rsp),%rax
-mulq  40(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 208(%rsp),%rax
-mulq  48(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 208(%rsp),%rax
-mulq  56(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq 208(%rsp),%rax
-mulq  64(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 208(%rsp),%rdx
-imulq  $19,%rdx,%rax
-mulq  72(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 216(%rsp),%rax
-mulq  40(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 216(%rsp),%rax
-mulq  48(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq 216(%rsp),%rax
-mulq  56(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 216(%rsp),%rdx
-imulq  $19,%rdx,%rax
-mulq  64(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 216(%rsp),%rdx
-imulq  $19,%rdx,%rax
-mulq  72(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 224(%rsp),%rax
-mulq  40(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq 224(%rsp),%rax
-mulq  48(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 280(%rsp),%rax
-mulq  64(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 280(%rsp),%rax
-mulq  72(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 232(%rsp),%rax
-mulq  40(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 288(%rsp),%rax
-mulq  56(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 288(%rsp),%rax
-mulq  64(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 288(%rsp),%rax
-mulq  72(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq x25519_x86_64_REDMASK51(%rip),%rdx
-shld $13,%rsi,%rcx
-and  %rdx,%rsi
-shld $13,%r8,%r9
-and  %rdx,%r8
-add  %rcx,%r8
-shld $13,%r10,%r11
-and  %rdx,%r10
-add  %r9,%r10
-shld $13,%r12,%r13
-and  %rdx,%r12
-add  %r11,%r12
-shld $13,%r14,%r15
-and  %rdx,%r14
-add  %r13,%r14
-imulq  $19,%r15,%rcx
-add  %rcx,%rsi
-mov  %rsi,%rcx
-shr  $51,%rcx
-add  %r8,%rcx
-mov  %rcx,%r8
-shr  $51,%rcx
-and  %rdx,%rsi
-add  %r10,%rcx
-mov  %rcx,%r9
-shr  $51,%rcx
-and  %rdx,%r8
-add  %r12,%rcx
-mov  %rcx,%rax
-shr  $51,%rcx
-and  %rdx,%r9
-add  %r14,%rcx
-mov  %rcx,%r10
-shr  $51,%rcx
-and  %rdx,%rax
-imulq  $19,%rcx,%rcx
-add  %rcx,%rsi
-and  %rdx,%r10
-movq %rsi,40(%rsp)
-movq %r8,48(%rsp)
-movq %r9,56(%rsp)
-movq %rax,64(%rsp)
-movq %r10,72(%rsp)
-movq 264(%rsp),%rsi
-imulq  $19,%rsi,%rax
-movq %rax,200(%rsp)
-mulq  16(%rsp)
-mov  %rax,%rsi
-mov  %rdx,%rcx
-movq 272(%rsp),%rdx
-imulq  $19,%rdx,%rax
-movq %rax,208(%rsp)
-mulq  8(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 240(%rsp),%rax
-mulq  0(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 240(%rsp),%rax
-mulq  8(%rsp)
-mov  %rax,%r8
-mov  %rdx,%r9
-movq 240(%rsp),%rax
-mulq  16(%rsp)
-mov  %rax,%r10
-mov  %rdx,%r11
-movq 240(%rsp),%rax
-mulq  24(%rsp)
-mov  %rax,%r12
-mov  %rdx,%r13
-movq 240(%rsp),%rax
-mulq  32(%rsp)
-mov  %rax,%r14
-mov  %rdx,%r15
-movq 248(%rsp),%rax
-mulq  0(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 248(%rsp),%rax
-mulq  8(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 248(%rsp),%rax
-mulq  16(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq 248(%rsp),%rax
-mulq  24(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 248(%rsp),%rdx
-imulq  $19,%rdx,%rax
-mulq  32(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 256(%rsp),%rax
-mulq  0(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 256(%rsp),%rax
-mulq  8(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq 256(%rsp),%rax
-mulq  16(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 256(%rsp),%rdx
-imulq  $19,%rdx,%rax
-mulq  24(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 256(%rsp),%rdx
-imulq  $19,%rdx,%rax
-mulq  32(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 264(%rsp),%rax
-mulq  0(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq 264(%rsp),%rax
-mulq  8(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 200(%rsp),%rax
-mulq  24(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 200(%rsp),%rax
-mulq  32(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 272(%rsp),%rax
-mulq  0(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 208(%rsp),%rax
-mulq  16(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 208(%rsp),%rax
-mulq  24(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 208(%rsp),%rax
-mulq  32(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq x25519_x86_64_REDMASK51(%rip),%rdx
-shld $13,%rsi,%rcx
-and  %rdx,%rsi
-shld $13,%r8,%r9
-and  %rdx,%r8
-add  %rcx,%r8
-shld $13,%r10,%r11
-and  %rdx,%r10
-add  %r9,%r10
-shld $13,%r12,%r13
-and  %rdx,%r12
-add  %r11,%r12
-shld $13,%r14,%r15
-and  %rdx,%r14
-add  %r13,%r14
-imulq  $19,%r15,%rcx
-add  %rcx,%rsi
-mov  %rsi,%rcx
-shr  $51,%rcx
-add  %r8,%rcx
-mov  %rcx,%r8
-shr  $51,%rcx
-and  %rdx,%rsi
-add  %r10,%rcx
-mov  %rcx,%r9
-shr  $51,%rcx
-and  %rdx,%r8
-add  %r12,%rcx
-mov  %rcx,%rax
-shr  $51,%rcx
-and  %rdx,%r9
-add  %r14,%rcx
-mov  %rcx,%r10
-shr  $51,%rcx
-and  %rdx,%rax
-imulq  $19,%rcx,%rcx
-add  %rcx,%rsi
-and  %rdx,%r10
-mov  %rsi,%rdx
-mov  %r8,%rcx
-mov  %r9,%r11
-mov  %rax,%r12
-mov  %r10,%r13
-add  x25519_x86_64_2P0(%rip),%rdx
-add  x25519_x86_64_2P1234(%rip),%rcx
-add  x25519_x86_64_2P1234(%rip),%r11
-add  x25519_x86_64_2P1234(%rip),%r12
-add  x25519_x86_64_2P1234(%rip),%r13
-addq 40(%rsp),%rsi
-addq 48(%rsp),%r8
-addq 56(%rsp),%r9
-addq 64(%rsp),%rax
-addq 72(%rsp),%r10
-subq 40(%rsp),%rdx
-subq 48(%rsp),%rcx
-subq 56(%rsp),%r11
-subq 64(%rsp),%r12
-subq 72(%rsp),%r13
-movq   %rsi,120(%rdi)
-movq   %r8,128(%rdi)
-movq   %r9,136(%rdi)
-movq   %rax,144(%rdi)
-movq   %r10,152(%rdi)
-movq   %rdx,160(%rdi)
-movq   %rcx,168(%rdi)
-movq   %r11,176(%rdi)
-movq   %r12,184(%rdi)
-movq   %r13,192(%rdi)
-movq   120(%rdi),%rax
-mulq  120(%rdi)
-mov  %rax,%rsi
-mov  %rdx,%rcx
-movq   120(%rdi),%rax
-shl  $1,%rax
-mulq  128(%rdi)
-mov  %rax,%r8
-mov  %rdx,%r9
-movq   120(%rdi),%rax
-shl  $1,%rax
-mulq  136(%rdi)
-mov  %rax,%r10
-mov  %rdx,%r11
-movq   120(%rdi),%rax
-shl  $1,%rax
-mulq  144(%rdi)
-mov  %rax,%r12
-mov  %rdx,%r13
-movq   120(%rdi),%rax
-shl  $1,%rax
-mulq  152(%rdi)
-mov  %rax,%r14
-mov  %rdx,%r15
-movq   128(%rdi),%rax
-mulq  128(%rdi)
-add  %rax,%r10
-adc %rdx,%r11
-movq   128(%rdi),%rax
-shl  $1,%rax
-mulq  136(%rdi)
-add  %rax,%r12
-adc %rdx,%r13
-movq   128(%rdi),%rax
-shl  $1,%rax
-mulq  144(%rdi)
-add  %rax,%r14
-adc %rdx,%r15
-movq   128(%rdi),%rdx
-imulq  $38,%rdx,%rax
-mulq  152(%rdi)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq   136(%rdi),%rax
-mulq  136(%rdi)
-add  %rax,%r14
-adc %rdx,%r15
-movq   136(%rdi),%rdx
-imulq  $38,%rdx,%rax
-mulq  144(%rdi)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq   136(%rdi),%rdx
-imulq  $38,%rdx,%rax
-mulq  152(%rdi)
-add  %rax,%r8
-adc %rdx,%r9
-movq   144(%rdi),%rdx
-imulq  $19,%rdx,%rax
-mulq  144(%rdi)
-add  %rax,%r8
-adc %rdx,%r9
-movq   144(%rdi),%rdx
-imulq  $38,%rdx,%rax
-mulq  152(%rdi)
-add  %rax,%r10
-adc %rdx,%r11
-movq   152(%rdi),%rdx
-imulq  $19,%rdx,%rax
-mulq  152(%rdi)
-add  %rax,%r12
-adc %rdx,%r13
-movq x25519_x86_64_REDMASK51(%rip),%rdx
-shld $13,%rsi,%rcx
-and  %rdx,%rsi
-shld $13,%r8,%r9
-and  %rdx,%r8
-add  %rcx,%r8
-shld $13,%r10,%r11
-and  %rdx,%r10
-add  %r9,%r10
-shld $13,%r12,%r13
-and  %rdx,%r12
-add  %r11,%r12
-shld $13,%r14,%r15
-and  %rdx,%r14
-add  %r13,%r14
-imulq  $19,%r15,%rcx
-add  %rcx,%rsi
-mov  %rsi,%rcx
-shr  $51,%rcx
-add  %r8,%rcx
-and  %rdx,%rsi
-mov  %rcx,%r8
-shr  $51,%rcx
-add  %r10,%rcx
-and  %rdx,%r8
-mov  %rcx,%r9
-shr  $51,%rcx
-add  %r12,%rcx
-and  %rdx,%r9
-mov  %rcx,%rax
-shr  $51,%rcx
-add  %r14,%rcx
-and  %rdx,%rax
-mov  %rcx,%r10
-shr  $51,%rcx
-imulq  $19,%rcx,%rcx
-add  %rcx,%rsi
-and  %rdx,%r10
-movq   %rsi,120(%rdi)
-movq   %r8,128(%rdi)
-movq   %r9,136(%rdi)
-movq   %rax,144(%rdi)
-movq   %r10,152(%rdi)
-movq   160(%rdi),%rax
-mulq  160(%rdi)
-mov  %rax,%rsi
-mov  %rdx,%rcx
-movq   160(%rdi),%rax
-shl  $1,%rax
-mulq  168(%rdi)
-mov  %rax,%r8
-mov  %rdx,%r9
-movq   160(%rdi),%rax
-shl  $1,%rax
-mulq  176(%rdi)
-mov  %rax,%r10
-mov  %rdx,%r11
-movq   160(%rdi),%rax
-shl  $1,%rax
-mulq  184(%rdi)
-mov  %rax,%r12
-mov  %rdx,%r13
-movq   160(%rdi),%rax
-shl  $1,%rax
-mulq  192(%rdi)
-mov  %rax,%r14
-mov  %rdx,%r15
-movq   168(%rdi),%rax
-mulq  168(%rdi)
-add  %rax,%r10
-adc %rdx,%r11
-movq   168(%rdi),%rax
-shl  $1,%rax
-mulq  176(%rdi)
-add  %rax,%r12
-adc %rdx,%r13
-movq   168(%rdi),%rax
-shl  $1,%rax
-mulq  184(%rdi)
-add  %rax,%r14
-adc %rdx,%r15
-movq   168(%rdi),%rdx
-imulq  $38,%rdx,%rax
-mulq  192(%rdi)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq   176(%rdi),%rax
-mulq  176(%rdi)
-add  %rax,%r14
-adc %rdx,%r15
-movq   176(%rdi),%rdx
-imulq  $38,%rdx,%rax
-mulq  184(%rdi)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq   176(%rdi),%rdx
-imulq  $38,%rdx,%rax
-mulq  192(%rdi)
-add  %rax,%r8
-adc %rdx,%r9
-movq   184(%rdi),%rdx
-imulq  $19,%rdx,%rax
-mulq  184(%rdi)
-add  %rax,%r8
-adc %rdx,%r9
-movq   184(%rdi),%rdx
-imulq  $38,%rdx,%rax
-mulq  192(%rdi)
-add  %rax,%r10
-adc %rdx,%r11
-movq   192(%rdi),%rdx
-imulq  $19,%rdx,%rax
-mulq  192(%rdi)
-add  %rax,%r12
-adc %rdx,%r13
-movq x25519_x86_64_REDMASK51(%rip),%rdx
-shld $13,%rsi,%rcx
-and  %rdx,%rsi
-shld $13,%r8,%r9
-and  %rdx,%r8
-add  %rcx,%r8
-shld $13,%r10,%r11
-and  %rdx,%r10
-add  %r9,%r10
-shld $13,%r12,%r13
-and  %rdx,%r12
-add  %r11,%r12
-shld $13,%r14,%r15
-and  %rdx,%r14
-add  %r13,%r14
-imulq  $19,%r15,%rcx
-add  %rcx,%rsi
-mov  %rsi,%rcx
-shr  $51,%rcx
-add  %r8,%rcx
-and  %rdx,%rsi
-mov  %rcx,%r8
-shr  $51,%rcx
-add  %r10,%rcx
-and  %rdx,%r8
-mov  %rcx,%r9
-shr  $51,%rcx
-add  %r12,%rcx
-and  %rdx,%r9
-mov  %rcx,%rax
-shr  $51,%rcx
-add  %r14,%rcx
-and  %rdx,%rax
-mov  %rcx,%r10
-shr  $51,%rcx
-imulq  $19,%rcx,%rcx
-add  %rcx,%rsi
-and  %rdx,%r10
-movq   %rsi,160(%rdi)
-movq   %r8,168(%rdi)
-movq   %r9,176(%rdi)
-movq   %rax,184(%rdi)
-movq   %r10,192(%rdi)
-movq   184(%rdi),%rsi
-imulq  $19,%rsi,%rax
-movq %rax,0(%rsp)
-mulq  16(%rdi)
-mov  %rax,%rsi
-mov  %rdx,%rcx
-movq   192(%rdi),%rdx
-imulq  $19,%rdx,%rax
-movq %rax,8(%rsp)
-mulq  8(%rdi)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq   160(%rdi),%rax
-mulq  0(%rdi)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq   160(%rdi),%rax
-mulq  8(%rdi)
-mov  %rax,%r8
-mov  %rdx,%r9
-movq   160(%rdi),%rax
-mulq  16(%rdi)
-mov  %rax,%r10
-mov  %rdx,%r11
-movq   160(%rdi),%rax
-mulq  24(%rdi)
-mov  %rax,%r12
-mov  %rdx,%r13
-movq   160(%rdi),%rax
-mulq  32(%rdi)
-mov  %rax,%r14
-mov  %rdx,%r15
-movq   168(%rdi),%rax
-mulq  0(%rdi)
-add  %rax,%r8
-adc %rdx,%r9
-movq   168(%rdi),%rax
-mulq  8(%rdi)
-add  %rax,%r10
-adc %rdx,%r11
-movq   168(%rdi),%rax
-mulq  16(%rdi)
-add  %rax,%r12
-adc %rdx,%r13
-movq   168(%rdi),%rax
-mulq  24(%rdi)
-add  %rax,%r14
-adc %rdx,%r15
-movq   168(%rdi),%rdx
-imulq  $19,%rdx,%rax
-mulq  32(%rdi)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq   176(%rdi),%rax
-mulq  0(%rdi)
-add  %rax,%r10
-adc %rdx,%r11
-movq   176(%rdi),%rax
-mulq  8(%rdi)
-add  %rax,%r12
-adc %rdx,%r13
-movq   176(%rdi),%rax
-mulq  16(%rdi)
-add  %rax,%r14
-adc %rdx,%r15
-movq   176(%rdi),%rdx
-imulq  $19,%rdx,%rax
-mulq  24(%rdi)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq   176(%rdi),%rdx
-imulq  $19,%rdx,%rax
-mulq  32(%rdi)
-add  %rax,%r8
-adc %rdx,%r9
-movq   184(%rdi),%rax
-mulq  0(%rdi)
-add  %rax,%r12
-adc %rdx,%r13
-movq   184(%rdi),%rax
-mulq  8(%rdi)
-add  %rax,%r14
-adc %rdx,%r15
-movq 0(%rsp),%rax
-mulq  24(%rdi)
-add  %rax,%r8
-adc %rdx,%r9
-movq 0(%rsp),%rax
-mulq  32(%rdi)
-add  %rax,%r10
-adc %rdx,%r11
-movq   192(%rdi),%rax
-mulq  0(%rdi)
-add  %rax,%r14
-adc %rdx,%r15
-movq 8(%rsp),%rax
-mulq  16(%rdi)
-add  %rax,%r8
-adc %rdx,%r9
-movq 8(%rsp),%rax
-mulq  24(%rdi)
-add  %rax,%r10
-adc %rdx,%r11
-movq 8(%rsp),%rax
-mulq  32(%rdi)
-add  %rax,%r12
-adc %rdx,%r13
-movq x25519_x86_64_REDMASK51(%rip),%rdx
-shld $13,%rsi,%rcx
-and  %rdx,%rsi
-shld $13,%r8,%r9
-and  %rdx,%r8
-add  %rcx,%r8
-shld $13,%r10,%r11
-and  %rdx,%r10
-add  %r9,%r10
-shld $13,%r12,%r13
-and  %rdx,%r12
-add  %r11,%r12
-shld $13,%r14,%r15
-and  %rdx,%r14
-add  %r13,%r14
-imulq  $19,%r15,%rcx
-add  %rcx,%rsi
-mov  %rsi,%rcx
-shr  $51,%rcx
-add  %r8,%rcx
-mov  %rcx,%r8
-shr  $51,%rcx
-and  %rdx,%rsi
-add  %r10,%rcx
-mov  %rcx,%r9
-shr  $51,%rcx
-and  %rdx,%r8
-add  %r12,%rcx
-mov  %rcx,%rax
-shr  $51,%rcx
-and  %rdx,%r9
-add  %r14,%rcx
-mov  %rcx,%r10
-shr  $51,%rcx
-and  %rdx,%rax
-imulq  $19,%rcx,%rcx
-add  %rcx,%rsi
-and  %rdx,%r10
-movq   %rsi,160(%rdi)
-movq   %r8,168(%rdi)
-movq   %r9,176(%rdi)
-movq   %rax,184(%rdi)
-movq   %r10,192(%rdi)
-movq 144(%rsp),%rsi
-imulq  $19,%rsi,%rax
-movq %rax,0(%rsp)
-mulq  96(%rsp)
-mov  %rax,%rsi
-mov  %rdx,%rcx
-movq 152(%rsp),%rdx
-imulq  $19,%rdx,%rax
-movq %rax,8(%rsp)
-mulq  88(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 120(%rsp),%rax
-mulq  80(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 120(%rsp),%rax
-mulq  88(%rsp)
-mov  %rax,%r8
-mov  %rdx,%r9
-movq 120(%rsp),%rax
-mulq  96(%rsp)
-mov  %rax,%r10
-mov  %rdx,%r11
-movq 120(%rsp),%rax
-mulq  104(%rsp)
-mov  %rax,%r12
-mov  %rdx,%r13
-movq 120(%rsp),%rax
-mulq  112(%rsp)
-mov  %rax,%r14
-mov  %rdx,%r15
-movq 128(%rsp),%rax
-mulq  80(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 128(%rsp),%rax
-mulq  88(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 128(%rsp),%rax
-mulq  96(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq 128(%rsp),%rax
-mulq  104(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 128(%rsp),%rdx
-imulq  $19,%rdx,%rax
-mulq  112(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 136(%rsp),%rax
-mulq  80(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 136(%rsp),%rax
-mulq  88(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq 136(%rsp),%rax
-mulq  96(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 136(%rsp),%rdx
-imulq  $19,%rdx,%rax
-mulq  104(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 136(%rsp),%rdx
-imulq  $19,%rdx,%rax
-mulq  112(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 144(%rsp),%rax
-mulq  80(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq 144(%rsp),%rax
-mulq  88(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 0(%rsp),%rax
-mulq  104(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 0(%rsp),%rax
-mulq  112(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 152(%rsp),%rax
-mulq  80(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 8(%rsp),%rax
-mulq  96(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 8(%rsp),%rax
-mulq  104(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 8(%rsp),%rax
-mulq  112(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq x25519_x86_64_REDMASK51(%rip),%rdx
-shld $13,%rsi,%rcx
-and  %rdx,%rsi
-shld $13,%r8,%r9
-and  %rdx,%r8
-add  %rcx,%r8
-shld $13,%r10,%r11
-and  %rdx,%r10
-add  %r9,%r10
-shld $13,%r12,%r13
-and  %rdx,%r12
-add  %r11,%r12
-shld $13,%r14,%r15
-and  %rdx,%r14
-add  %r13,%r14
-imulq  $19,%r15,%rcx
-add  %rcx,%rsi
-mov  %rsi,%rcx
-shr  $51,%rcx
-add  %r8,%rcx
-mov  %rcx,%r8
-shr  $51,%rcx
-and  %rdx,%rsi
-add  %r10,%rcx
-mov  %rcx,%r9
-shr  $51,%rcx
-and  %rdx,%r8
-add  %r12,%rcx
-mov  %rcx,%rax
-shr  $51,%rcx
-and  %rdx,%r9
-add  %r14,%rcx
-mov  %rcx,%r10
-shr  $51,%rcx
-and  %rdx,%rax
-imulq  $19,%rcx,%rcx
-add  %rcx,%rsi
-and  %rdx,%r10
-movq   %rsi,40(%rdi)
-movq   %r8,48(%rdi)
-movq   %r9,56(%rdi)
-movq   %rax,64(%rdi)
-movq   %r10,72(%rdi)
-movq 160(%rsp),%rax
-mulq  x25519_x86_64_121666_213(%rip)
-shr  $13,%rax
-mov  %rax,%rsi
-mov  %rdx,%rcx
-movq 168(%rsp),%rax
-mulq  x25519_x86_64_121666_213(%rip)
-shr  $13,%rax
-add  %rax,%rcx
-mov  %rdx,%r8
-movq 176(%rsp),%rax
-mulq  x25519_x86_64_121666_213(%rip)
-shr  $13,%rax
-add  %rax,%r8
-mov  %rdx,%r9
-movq 184(%rsp),%rax
-mulq  x25519_x86_64_121666_213(%rip)
-shr  $13,%rax
-add  %rax,%r9
-mov  %rdx,%r10
-movq 192(%rsp),%rax
-mulq  x25519_x86_64_121666_213(%rip)
-shr  $13,%rax
-add  %rax,%r10
-imulq  $19,%rdx,%rdx
-add  %rdx,%rsi
-addq 80(%rsp),%rsi
-addq 88(%rsp),%rcx
-addq 96(%rsp),%r8
-addq 104(%rsp),%r9
-addq 112(%rsp),%r10
-movq   %rsi,80(%rdi)
-movq   %rcx,88(%rdi)
-movq   %r8,96(%rdi)
-movq   %r9,104(%rdi)
-movq   %r10,112(%rdi)
-movq   104(%rdi),%rsi
-imulq  $19,%rsi,%rax
-movq %rax,0(%rsp)
-mulq  176(%rsp)
-mov  %rax,%rsi
-mov  %rdx,%rcx
-movq   112(%rdi),%rdx
-imulq  $19,%rdx,%rax
-movq %rax,8(%rsp)
-mulq  168(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq   80(%rdi),%rax
-mulq  160(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq   80(%rdi),%rax
-mulq  168(%rsp)
-mov  %rax,%r8
-mov  %rdx,%r9
-movq   80(%rdi),%rax
-mulq  176(%rsp)
-mov  %rax,%r10
-mov  %rdx,%r11
-movq   80(%rdi),%rax
-mulq  184(%rsp)
-mov  %rax,%r12
-mov  %rdx,%r13
-movq   80(%rdi),%rax
-mulq  192(%rsp)
-mov  %rax,%r14
-mov  %rdx,%r15
-movq   88(%rdi),%rax
-mulq  160(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq   88(%rdi),%rax
-mulq  168(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq   88(%rdi),%rax
-mulq  176(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq   88(%rdi),%rax
-mulq  184(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq   88(%rdi),%rdx
-imulq  $19,%rdx,%rax
-mulq  192(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq   96(%rdi),%rax
-mulq  160(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq   96(%rdi),%rax
-mulq  168(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq   96(%rdi),%rax
-mulq  176(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq   96(%rdi),%rdx
-imulq  $19,%rdx,%rax
-mulq  184(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq   96(%rdi),%rdx
-imulq  $19,%rdx,%rax
-mulq  192(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq   104(%rdi),%rax
-mulq  160(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq   104(%rdi),%rax
-mulq  168(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 0(%rsp),%rax
-mulq  184(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 0(%rsp),%rax
-mulq  192(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq   112(%rdi),%rax
-mulq  160(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 8(%rsp),%rax
-mulq  176(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 8(%rsp),%rax
-mulq  184(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 8(%rsp),%rax
-mulq  192(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq x25519_x86_64_REDMASK51(%rip),%rdx
-shld $13,%rsi,%rcx
-and  %rdx,%rsi
-shld $13,%r8,%r9
-and  %rdx,%r8
-add  %rcx,%r8
-shld $13,%r10,%r11
-and  %rdx,%r10
-add  %r9,%r10
-shld $13,%r12,%r13
-and  %rdx,%r12
-add  %r11,%r12
-shld $13,%r14,%r15
-and  %rdx,%r14
-add  %r13,%r14
-imulq  $19,%r15,%rcx
-add  %rcx,%rsi
-mov  %rsi,%rcx
-shr  $51,%rcx
-add  %r8,%rcx
-mov  %rcx,%r8
-shr  $51,%rcx
-and  %rdx,%rsi
-add  %r10,%rcx
-mov  %rcx,%r9
-shr  $51,%rcx
-and  %rdx,%r8
-add  %r12,%rcx
-mov  %rcx,%rax
-shr  $51,%rcx
-and  %rdx,%r9
-add  %r14,%rcx
-mov  %rcx,%r10
-shr  $51,%rcx
-and  %rdx,%rax
-imulq  $19,%rcx,%rcx
-add  %rcx,%rsi
-and  %rdx,%r10
-movq   %rsi,80(%rdi)
-movq   %r8,88(%rdi)
-movq   %r9,96(%rdi)
-movq   %rax,104(%rdi)
-movq   %r10,112(%rdi)
-movq 296(%rsp),%r12
-movq 304(%rsp),%r13
-movq 312(%rsp),%r14
-movq 320(%rsp),%r15
-movq 328(%rsp),%rbx
-movq 336(%rsp),%rbp
-add $344,%rsp
-.cfi_adjust_cfa_offset -344
-ret
-.cfi_endproc
-
-.p2align 5
-.globl C_ABI(x25519_x86_64_work_cswap)
-HIDDEN C_ABI(x25519_x86_64_work_cswap)
-C_ABI(x25519_x86_64_work_cswap):
-.cfi_startproc
-subq $1,%rsi
-notq %rsi
-movq %rsi,%xmm15
-pshufd $0x44,%xmm15,%xmm15
-movdqu 0(%rdi),%xmm0
-movdqu 16(%rdi),%xmm2
-movdqu 32(%rdi),%xmm4
-movdqu 48(%rdi),%xmm6
-movdqu 64(%rdi),%xmm8
-movdqu 80(%rdi),%xmm1
-movdqu 96(%rdi),%xmm3
-movdqu 112(%rdi),%xmm5
-movdqu 128(%rdi),%xmm7
-movdqu 144(%rdi),%xmm9
-movdqa %xmm1,%xmm10
-movdqa %xmm3,%xmm11
-movdqa %xmm5,%xmm12
-movdqa %xmm7,%xmm13
-movdqa %xmm9,%xmm14
-pxor %xmm0,%xmm10
-pxor %xmm2,%xmm11
-pxor %xmm4,%xmm12
-pxor %xmm6,%xmm13
-pxor %xmm8,%xmm14
-pand %xmm15,%xmm10
-pand %xmm15,%xmm11
-pand %xmm15,%xmm12
-pand %xmm15,%xmm13
-pand %xmm15,%xmm14
-pxor %xmm10,%xmm0
-pxor %xmm10,%xmm1
-pxor %xmm11,%xmm2
-pxor %xmm11,%xmm3
-pxor %xmm12,%xmm4
-pxor %xmm12,%xmm5
-pxor %xmm13,%xmm6
-pxor %xmm13,%xmm7
-pxor %xmm14,%xmm8
-pxor %xmm14,%xmm9
-movdqu %xmm0,0(%rdi)
-movdqu %xmm2,16(%rdi)
-movdqu %xmm4,32(%rdi)
-movdqu %xmm6,48(%rdi)
-movdqu %xmm8,64(%rdi)
-movdqu %xmm1,80(%rdi)
-movdqu %xmm3,96(%rdi)
-movdqu %xmm5,112(%rdi)
-movdqu %xmm7,128(%rdi)
-movdqu %xmm9,144(%rdi)
-ret
-.cfi_endproc
-
-#endif  /* __x86_64__ */
-#endif  /* !OPENSSL_NO_ASM */
diff --git a/src/crypto/curve25519/ed25519_test.cc b/src/crypto/curve25519/ed25519_test.cc
index 31216f1b..4f34675b 100644
--- a/src/crypto/curve25519/ed25519_test.cc
+++ b/src/crypto/curve25519/ed25519_test.cc
@@ -44,6 +44,28 @@ TEST(Ed25519Test, TestVectors) {
   });
 }
 
+TEST(Ed25519Test, Malleability) {
+  // https://tools.ietf.org/html/rfc8032#section-5.1.7 adds an additional test
+  // that s be in [0, order). This prevents someone from adding a multiple of
+  // order to s and obtaining a second valid signature for the same message.
+  static const uint8_t kMsg[] = {0x54, 0x65, 0x73, 0x74};
+  static const uint8_t kSig[] = {
+      0x7c, 0x38, 0xe0, 0x26, 0xf2, 0x9e, 0x14, 0xaa, 0xbd, 0x05, 0x9a,
+      0x0f, 0x2d, 0xb8, 0xb0, 0xcd, 0x78, 0x30, 0x40, 0x60, 0x9a, 0x8b,
+      0xe6, 0x84, 0xdb, 0x12, 0xf8, 0x2a, 0x27, 0x77, 0x4a, 0xb0, 0x67,
+      0x65, 0x4b, 0xce, 0x38, 0x32, 0xc2, 0xd7, 0x6f, 0x8f, 0x6f, 0x5d,
+      0xaf, 0xc0, 0x8d, 0x93, 0x39, 0xd4, 0xee, 0xf6, 0x76, 0x57, 0x33,
+      0x36, 0xa5, 0xc5, 0x1e, 0xb6, 0xf9, 0x46, 0xb3, 0x1d,
+  };
+  static const uint8_t kPub[] = {
+      0x7d, 0x4d, 0x0e, 0x7f, 0x61, 0x53, 0xa6, 0x9b, 0x62, 0x42, 0xb5,
+      0x22, 0xab, 0xbe, 0xe6, 0x85, 0xfd, 0xa4, 0x42, 0x0f, 0x88, 0x34,
+      0xb1, 0x08, 0xc3, 0xbd, 0xae, 0x36, 0x9e, 0xf5, 0x49, 0xfa,
+  };
+
+  EXPECT_FALSE(ED25519_verify(kMsg, sizeof(kMsg), kSig, kPub));
+}
+
 TEST(Ed25519Test, KeypairFromSeed) {
   uint8_t public_key1[32], private_key1[64];
   ED25519_keypair(public_key1, private_key1);
diff --git a/src/crypto/curve25519/x25519-x86_64.c b/src/crypto/curve25519/x25519-x86_64.c
deleted file mode 100644
index 41db0bdd..00000000
--- a/src/crypto/curve25519/x25519-x86_64.c
+++ /dev/null
@@ -1,247 +0,0 @@
-/* Copyright (c) 2015, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// This code is mostly taken from the ref10 version of Ed25519 in SUPERCOP
-// 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as
-// public domain but this file has the ISC license just to keep licensing
-// simple.
-//
-// The field functions are shared by Ed25519 and X25519 where possible.
-
-#include <openssl/curve25519.h>
-
-#include <string.h>
-
-#include "../internal.h"
-#include "../../third_party/fiat/internal.h"
-
-
-#if defined(BORINGSSL_X25519_X86_64)
-
-typedef struct { uint64_t v[5]; } fe25519;
-
-// These functions are defined in asm/x25519-asm-x86_64.S
-void x25519_x86_64_work_cswap(fe25519 *, uint64_t);
-void x25519_x86_64_mul(fe25519 *out, const fe25519 *a, const fe25519 *b);
-void x25519_x86_64_square(fe25519 *out, const fe25519 *a);
-void x25519_x86_64_freeze(fe25519 *);
-void x25519_x86_64_ladderstep(fe25519 *work);
-
-static void fe25519_setint(fe25519 *r, unsigned v) {
-  r->v[0] = v;
-  r->v[1] = 0;
-  r->v[2] = 0;
-  r->v[3] = 0;
-  r->v[4] = 0;
-}
-
-// Assumes the input x has already been reduced below 2^255.
-static void fe25519_pack(unsigned char r[32], const fe25519 *x) {
-  fe25519 t;
-  t = *x;
-  x25519_x86_64_freeze(&t);
-
-  r[0] = (uint8_t)(t.v[0] & 0xff);
-  r[1] = (uint8_t)((t.v[0] >> 8) & 0xff);
-  r[2] = (uint8_t)((t.v[0] >> 16) & 0xff);
-  r[3] = (uint8_t)((t.v[0] >> 24) & 0xff);
-  r[4] = (uint8_t)((t.v[0] >> 32) & 0xff);
-  r[5] = (uint8_t)((t.v[0] >> 40) & 0xff);
-  r[6] = (uint8_t)((t.v[0] >> 48));
-
-  r[6] ^= (uint8_t)((t.v[1] << 3) & 0xf8);
-  r[7] = (uint8_t)((t.v[1] >> 5) & 0xff);
-  r[8] = (uint8_t)((t.v[1] >> 13) & 0xff);
-  r[9] = (uint8_t)((t.v[1] >> 21) & 0xff);
-  r[10] = (uint8_t)((t.v[1] >> 29) & 0xff);
-  r[11] = (uint8_t)((t.v[1] >> 37) & 0xff);
-  r[12] = (uint8_t)((t.v[1] >> 45));
-
-  r[12] ^= (uint8_t)((t.v[2] << 6) & 0xc0);
-  r[13] = (uint8_t)((t.v[2] >> 2) & 0xff);
-  r[14] = (uint8_t)((t.v[2] >> 10) & 0xff);
-  r[15] = (uint8_t)((t.v[2] >> 18) & 0xff);
-  r[16] = (uint8_t)((t.v[2] >> 26) & 0xff);
-  r[17] = (uint8_t)((t.v[2] >> 34) & 0xff);
-  r[18] = (uint8_t)((t.v[2] >> 42) & 0xff);
-  r[19] = (uint8_t)((t.v[2] >> 50));
-
-  r[19] ^= (uint8_t)((t.v[3] << 1) & 0xfe);
-  r[20] = (uint8_t)((t.v[3] >> 7) & 0xff);
-  r[21] = (uint8_t)((t.v[3] >> 15) & 0xff);
-  r[22] = (uint8_t)((t.v[3] >> 23) & 0xff);
-  r[23] = (uint8_t)((t.v[3] >> 31) & 0xff);
-  r[24] = (uint8_t)((t.v[3] >> 39) & 0xff);
-  r[25] = (uint8_t)((t.v[3] >> 47));
-
-  r[25] ^= (uint8_t)((t.v[4] << 4) & 0xf0);
-  r[26] = (uint8_t)((t.v[4] >> 4) & 0xff);
-  r[27] = (uint8_t)((t.v[4] >> 12) & 0xff);
-  r[28] = (uint8_t)((t.v[4] >> 20) & 0xff);
-  r[29] = (uint8_t)((t.v[4] >> 28) & 0xff);
-  r[30] = (uint8_t)((t.v[4] >> 36) & 0xff);
-  r[31] = (uint8_t)((t.v[4] >> 44));
-}
-
-static void fe25519_unpack(fe25519 *r, const uint8_t x[32]) {
-  r->v[0] = x[0];
-  r->v[0] += (uint64_t)x[1] << 8;
-  r->v[0] += (uint64_t)x[2] << 16;
-  r->v[0] += (uint64_t)x[3] << 24;
-  r->v[0] += (uint64_t)x[4] << 32;
-  r->v[0] += (uint64_t)x[5] << 40;
-  r->v[0] += ((uint64_t)x[6] & 7) << 48;
-
-  r->v[1] = x[6] >> 3;
-  r->v[1] += (uint64_t)x[7] << 5;
-  r->v[1] += (uint64_t)x[8] << 13;
-  r->v[1] += (uint64_t)x[9] << 21;
-  r->v[1] += (uint64_t)x[10] << 29;
-  r->v[1] += (uint64_t)x[11] << 37;
-  r->v[1] += ((uint64_t)x[12] & 63) << 45;
-
-  r->v[2] = x[12] >> 6;
-  r->v[2] += (uint64_t)x[13] << 2;
-  r->v[2] += (uint64_t)x[14] << 10;
-  r->v[2] += (uint64_t)x[15] << 18;
-  r->v[2] += (uint64_t)x[16] << 26;
-  r->v[2] += (uint64_t)x[17] << 34;
-  r->v[2] += (uint64_t)x[18] << 42;
-  r->v[2] += ((uint64_t)x[19] & 1) << 50;
-
-  r->v[3] = x[19] >> 1;
-  r->v[3] += (uint64_t)x[20] << 7;
-  r->v[3] += (uint64_t)x[21] << 15;
-  r->v[3] += (uint64_t)x[22] << 23;
-  r->v[3] += (uint64_t)x[23] << 31;
-  r->v[3] += (uint64_t)x[24] << 39;
-  r->v[3] += ((uint64_t)x[25] & 15) << 47;
-
-  r->v[4] = x[25] >> 4;
-  r->v[4] += (uint64_t)x[26] << 4;
-  r->v[4] += (uint64_t)x[27] << 12;
-  r->v[4] += (uint64_t)x[28] << 20;
-  r->v[4] += (uint64_t)x[29] << 28;
-  r->v[4] += (uint64_t)x[30] << 36;
-  r->v[4] += ((uint64_t)x[31] & 127) << 44;
-}
-
-static void fe25519_invert(fe25519 *r, const fe25519 *x) {
-  fe25519 z2;
-  fe25519 z9;
-  fe25519 z11;
-  fe25519 z2_5_0;
-  fe25519 z2_10_0;
-  fe25519 z2_20_0;
-  fe25519 z2_50_0;
-  fe25519 z2_100_0;
-  fe25519 t;
-  int i;
-
-  /* 2 */ x25519_x86_64_square(&z2, x);
-  /* 4 */ x25519_x86_64_square(&t, &z2);
-  /* 8 */ x25519_x86_64_square(&t, &t);
-  /* 9 */ x25519_x86_64_mul(&z9, &t, x);
-  /* 11 */ x25519_x86_64_mul(&z11, &z9, &z2);
-  /* 22 */ x25519_x86_64_square(&t, &z11);
-  /* 2^5 - 2^0 = 31 */ x25519_x86_64_mul(&z2_5_0, &t, &z9);
-
-  /* 2^6 - 2^1 */ x25519_x86_64_square(&t, &z2_5_0);
-  /* 2^10 - 2^5 */ for (i = 1; i < 5; i++) { x25519_x86_64_square(&t, &t); }
-  /* 2^10 - 2^0 */ x25519_x86_64_mul(&z2_10_0, &t, &z2_5_0);
-
-  /* 2^11 - 2^1 */ x25519_x86_64_square(&t, &z2_10_0);
-  /* 2^20 - 2^10 */ for (i = 1; i < 10; i++) { x25519_x86_64_square(&t, &t); }
-  /* 2^20 - 2^0 */ x25519_x86_64_mul(&z2_20_0, &t, &z2_10_0);
-
-  /* 2^21 - 2^1 */ x25519_x86_64_square(&t, &z2_20_0);
-  /* 2^40 - 2^20 */ for (i = 1; i < 20; i++) { x25519_x86_64_square(&t, &t); }
-  /* 2^40 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_20_0);
-
-  /* 2^41 - 2^1 */ x25519_x86_64_square(&t, &t);
-  /* 2^50 - 2^10 */ for (i = 1; i < 10; i++) { x25519_x86_64_square(&t, &t); }
-  /* 2^50 - 2^0 */ x25519_x86_64_mul(&z2_50_0, &t, &z2_10_0);
-
-  /* 2^51 - 2^1 */ x25519_x86_64_square(&t, &z2_50_0);
-  /* 2^100 - 2^50 */ for (i = 1; i < 50; i++) { x25519_x86_64_square(&t, &t); }
-  /* 2^100 - 2^0 */ x25519_x86_64_mul(&z2_100_0, &t, &z2_50_0);
-
-  /* 2^101 - 2^1 */ x25519_x86_64_square(&t, &z2_100_0);
-  /* 2^200 - 2^100 */ for (i = 1; i < 100; i++) {
-    x25519_x86_64_square(&t, &t);
-  }
-  /* 2^200 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_100_0);
-
-  /* 2^201 - 2^1 */ x25519_x86_64_square(&t, &t);
-  /* 2^250 - 2^50 */ for (i = 1; i < 50; i++) { x25519_x86_64_square(&t, &t); }
-  /* 2^250 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_50_0);
-
-  /* 2^251 - 2^1 */ x25519_x86_64_square(&t, &t);
-  /* 2^252 - 2^2 */ x25519_x86_64_square(&t, &t);
-  /* 2^253 - 2^3 */ x25519_x86_64_square(&t, &t);
-
-  /* 2^254 - 2^4 */ x25519_x86_64_square(&t, &t);
-
-  /* 2^255 - 2^5 */ x25519_x86_64_square(&t, &t);
-  /* 2^255 - 21 */ x25519_x86_64_mul(r, &t, &z11);
-}
-
-static void mladder(fe25519 *xr, fe25519 *zr, const uint8_t s[32]) {
-  fe25519 work[5];
-
-  work[0] = *xr;
-  fe25519_setint(work + 1, 1);
-  fe25519_setint(work + 2, 0);
-  work[3] = *xr;
-  fe25519_setint(work + 4, 1);
-
-  int i, j;
-  uint8_t prevbit = 0;
-
-  j = 6;
-  for (i = 31; i >= 0; i--) {
-    while (j >= 0) {
-      const uint8_t bit = 1 & (s[i] >> j);
-      const uint64_t swap = bit ^ prevbit;
-      prevbit = bit;
-      x25519_x86_64_work_cswap(work + 1, swap);
-      x25519_x86_64_ladderstep(work);
-      j -= 1;
-    }
-    j = 7;
-  }
-
-  *xr = work[1];
-  *zr = work[2];
-}
-
-void x25519_x86_64(uint8_t out[32], const uint8_t scalar[32],
-                  const uint8_t point[32]) {
-  uint8_t e[32];
-  OPENSSL_memcpy(e, scalar, sizeof(e));
-
-  e[0] &= 248;
-  e[31] &= 127;
-  e[31] |= 64;
-
-  fe25519 t;
-  fe25519 z;
-  fe25519_unpack(&t, point);
-  mladder(&t, &z, e);
-  fe25519_invert(&z, &z);
-  x25519_x86_64_mul(&t, &t, &z);
-  fe25519_pack(out, &t);
-}
-
-#endif  // BORINGSSL_X25519_X86_64
diff --git a/src/crypto/dsa/dsa.c b/src/crypto/dsa/dsa.c
index f3d4f859..532ffec9 100644
--- a/src/crypto/dsa/dsa.c
+++ b/src/crypto/dsa/dsa.c
@@ -239,11 +239,6 @@ int DSA_generate_parameters_ex(DSA *dsa, unsigned bits, const uint8_t *seed_in,
   }
   BN_CTX_start(ctx);
 
-  mont = BN_MONT_CTX_new();
-  if (mont == NULL) {
-    goto err;
-  }
-
   r0 = BN_CTX_get(ctx);
   g = BN_CTX_get(ctx);
   W = BN_CTX_get(ctx);
@@ -401,8 +396,9 @@ end:
     goto err;
   }
 
-  if (!BN_set_word(test, h) ||
-      !BN_MONT_CTX_set(mont, p, ctx)) {
+  mont = BN_MONT_CTX_new_for_modulus(p, ctx);
+  if (mont == NULL ||
+      !BN_set_word(test, h)) {
     goto err;
   }
 
diff --git a/src/crypto/err/ssl.errordata b/src/crypto/err/ssl.errordata
index 44509584..7b63bc8e 100644
--- a/src/crypto/err/ssl.errordata
+++ b/src/crypto/err/ssl.errordata
@@ -117,6 +117,7 @@ SSL,191,PATH_TOO_LONG
 SSL,192,PEER_DID_NOT_RETURN_A_CERTIFICATE
 SSL,193,PEER_ERROR_UNSUPPORTED_CERTIFICATE_TYPE
 SSL,267,PRE_SHARED_KEY_MUST_BE_LAST
+SSL,287,PRIVATE_KEY_OPERATION_FAILED
 SSL,194,PROTOCOL_IS_SHUTDOWN
 SSL,271,PSK_IDENTITY_BINDER_COUNT_MISMATCH
 SSL,195,PSK_IDENTITY_NOT_FOUND
diff --git a/src/crypto/fipsmodule/bn/add.c b/src/crypto/fipsmodule/bn/add.c
index 201c526d..645e647d 100644
--- a/src/crypto/fipsmodule/bn/add.c
+++ b/src/crypto/fipsmodule/bn/add.c
@@ -199,7 +199,6 @@ int BN_add_word(BIGNUM *a, BN_ULONG w) {
 }
 
 int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
-  int max;
   int add = 0, neg = 0;
   const BIGNUM *tmp;
 
@@ -232,13 +231,6 @@ int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
     return 1;
   }
 
-  // We are actually doing a - b :-)
-
-  max = (a->top > b->top) ? a->top : b->top;
-  if (!bn_wexpand(r, max)) {
-    return 0;
-  }
-
   if (BN_ucmp(a, b) < 0) {
     if (!BN_usub(r, b, a)) {
       return 0;
diff --git a/src/crypto/fipsmodule/bn/bn.c b/src/crypto/fipsmodule/bn/bn.c
index 4be4f21c..520ca27d 100644
--- a/src/crypto/fipsmodule/bn/bn.c
+++ b/src/crypto/fipsmodule/bn/bn.c
@@ -227,13 +227,12 @@ unsigned BN_num_bits_word(BN_ULONG l) {
 }
 
 unsigned BN_num_bits(const BIGNUM *bn) {
-  const int max = bn->top - 1;
-
-  if (BN_is_zero(bn)) {
+  const int width = bn_minimal_width(bn);
+  if (width == 0) {
     return 0;
   }
 
-  return max*BN_BITS2 + BN_num_bits_word(bn->d[max]);
+  return (width - 1) * BN_BITS2 + BN_num_bits_word(bn->d[width - 1]);
 }
 
 unsigned BN_num_bytes(const BIGNUM *bn) {
@@ -298,6 +297,35 @@ int bn_set_words(BIGNUM *bn, const BN_ULONG *words, size_t num) {
   return 1;
 }
 
+int bn_fits_in_words(const BIGNUM *bn, size_t num) {
+  // Every word at index |num| or above must be zero; OR them into |mask|.
+  BN_ULONG mask = 0;
+  for (size_t i = num; i < (size_t)bn->top; i++) {
+    mask |= bn->d[i];
+  }
+  return mask == 0;
+}
+
+int bn_copy_words(BN_ULONG *out, size_t num, const BIGNUM *bn) {
+  if (bn->neg) {
+    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
+    return 0;
+  }
+
+  size_t width = (size_t)bn->top;
+  if (width > num) {
+    if (!bn_fits_in_words(bn, num)) {
+      OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
+      return 0;
+    }
+    width = num;
+  }
+
+  OPENSSL_memset(out, 0, sizeof(BN_ULONG) * num);
+  OPENSSL_memcpy(out, bn->d, sizeof(BN_ULONG) * width);
+  return 1;
+}
+
 int BN_is_negative(const BIGNUM *bn) {
   return bn->neg != 0;
 }
@@ -350,19 +378,35 @@ int bn_expand(BIGNUM *bn, size_t bits) {
   return bn_wexpand(bn, (bits+BN_BITS2-1)/BN_BITS2);
 }
 
-void bn_correct_top(BIGNUM *bn) {
-  BN_ULONG *ftl;
-  int tmp_top = bn->top;
-
-  if (tmp_top > 0) {
-    for (ftl = &(bn->d[tmp_top - 1]); tmp_top > 0; tmp_top--) {
-      if (*(ftl--)) {
-        break;
-      }
+int bn_resize_words(BIGNUM *bn, size_t words) {
+  if ((size_t)bn->top <= words) {
+    if (!bn_wexpand(bn, words)) {
+      return 0;
     }
-    bn->top = tmp_top;
+    OPENSSL_memset(bn->d + bn->top, 0, (words - bn->top) * sizeof(BN_ULONG));
+    bn->top = words;
+    return 1;
+  }
+
+  // All words beyond the new width must be zero.
+  if (!bn_fits_in_words(bn, words)) {
+    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
+    return 0;
   }
+  bn->top = words;
+  return 1;
+}
+
+int bn_minimal_width(const BIGNUM *bn) {
+  int ret = bn->top;
+  while (ret > 0 && bn->d[ret - 1] == 0) {
+    ret--;
+  }
+  return ret;
+}
 
+void bn_correct_top(BIGNUM *bn) {
+  bn->top = bn_minimal_width(bn);
   if (bn->top == 0) {
     bn->neg = 0;
   }
diff --git a/src/crypto/fipsmodule/bn/bn_test.cc b/src/crypto/fipsmodule/bn/bn_test.cc
index ca5f978d..f36656f6 100644
--- a/src/crypto/fipsmodule/bn/bn_test.cc
+++ b/src/crypto/fipsmodule/bn/bn_test.cc
@@ -387,15 +387,15 @@ static void TestSquare(FileTest *t, BN_CTX *ctx) {
   }
 
 #if !defined(BORINGSSL_SHARED_LIBRARY)
-  if (static_cast<size_t>(a->top) <= BN_SMALL_MAX_WORDS) {
-    for (size_t num_a = a->top; num_a <= BN_SMALL_MAX_WORDS; num_a++) {
+  int a_width = bn_minimal_width(a.get());
+  if (a_width <= BN_SMALL_MAX_WORDS) {
+    for (size_t num_a = a_width; num_a <= BN_SMALL_MAX_WORDS; num_a++) {
       SCOPED_TRACE(num_a);
       size_t num_r = 2 * num_a;
       // Use newly-allocated buffers so ASan will catch out-of-bounds writes.
       std::unique_ptr<BN_ULONG[]> a_words(new BN_ULONG[num_a]),
           r_words(new BN_ULONG[num_r]);
-      OPENSSL_memset(a_words.get(), 0, num_a * sizeof(BN_ULONG));
-      OPENSSL_memcpy(a_words.get(), a->d, a->top * sizeof(BN_ULONG));
+      ASSERT_TRUE(bn_copy_words(a_words.get(), num_a, a.get()));
 
       ASSERT_TRUE(bn_mul_small(r_words.get(), num_r, a_words.get(), num_a,
                                a_words.get(), num_a));
@@ -445,22 +445,25 @@ static void TestProduct(FileTest *t, BN_CTX *ctx) {
   }
 
 #if !defined(BORINGSSL_SHARED_LIBRARY)
-  if (!BN_is_negative(product.get()) &&
-      static_cast<size_t>(a->top) <= BN_SMALL_MAX_WORDS &&
-      static_cast<size_t>(b->top) <= BN_SMALL_MAX_WORDS) {
-    for (size_t num_a = a->top; num_a <= BN_SMALL_MAX_WORDS; num_a++) {
+  BN_set_negative(a.get(), 0);
+  BN_set_negative(b.get(), 0);
+  BN_set_negative(product.get(), 0);
+
+  int a_width = bn_minimal_width(a.get());
+  int b_width = bn_minimal_width(b.get());
+  if (a_width <= BN_SMALL_MAX_WORDS && b_width <= BN_SMALL_MAX_WORDS) {
+    for (size_t num_a = static_cast<size_t>(a_width);
+         num_a <= BN_SMALL_MAX_WORDS; num_a++) {
       SCOPED_TRACE(num_a);
-      for (size_t num_b = b->top; num_b <= BN_SMALL_MAX_WORDS; num_b++) {
+      for (size_t num_b = static_cast<size_t>(b_width);
+           num_b <= BN_SMALL_MAX_WORDS; num_b++) {
         SCOPED_TRACE(num_b);
         size_t num_r = num_a + num_b;
         // Use newly-allocated buffers so ASan will catch out-of-bounds writes.
         std::unique_ptr<BN_ULONG[]> a_words(new BN_ULONG[num_a]),
             b_words(new BN_ULONG[num_b]), r_words(new BN_ULONG[num_r]);
-        OPENSSL_memset(a_words.get(), 0, num_a * sizeof(BN_ULONG));
-        OPENSSL_memcpy(a_words.get(), a->d, a->top * sizeof(BN_ULONG));
-
-        OPENSSL_memset(b_words.get(), 0, num_b * sizeof(BN_ULONG));
-        OPENSSL_memcpy(b_words.get(), b->d, b->top * sizeof(BN_ULONG));
+        ASSERT_TRUE(bn_copy_words(a_words.get(), num_a, a.get()));
+        ASSERT_TRUE(bn_copy_words(b_words.get(), num_b, b.get()));
 
         ASSERT_TRUE(bn_mul_small(r_words.get(), num_r, a_words.get(), num_a,
                                  b_words.get(), num_b));
@@ -537,12 +540,12 @@ static void TestModMul(FileTest *t, BN_CTX *ctx) {
 
   if (BN_is_odd(m.get())) {
     // Reduce |a| and |b| and test the Montgomery version.
-    bssl::UniquePtr<BN_MONT_CTX> mont(BN_MONT_CTX_new());
+    bssl::UniquePtr<BN_MONT_CTX> mont(
+        BN_MONT_CTX_new_for_modulus(m.get(), ctx));
     bssl::UniquePtr<BIGNUM> a_tmp(BN_new()), b_tmp(BN_new());
     ASSERT_TRUE(mont);
     ASSERT_TRUE(a_tmp);
     ASSERT_TRUE(b_tmp);
-    ASSERT_TRUE(BN_MONT_CTX_set(mont.get(), m.get(), ctx));
     ASSERT_TRUE(BN_nnmod(a.get(), a.get(), m.get(), ctx));
     ASSERT_TRUE(BN_nnmod(b.get(), b.get(), m.get(), ctx));
     ASSERT_TRUE(BN_to_montgomery(a_tmp.get(), a.get(), mont.get(), ctx));
@@ -554,24 +557,23 @@ static void TestModMul(FileTest *t, BN_CTX *ctx) {
                          ret.get());
 
 #if !defined(BORINGSSL_SHARED_LIBRARY)
-    if (m->top <= BN_SMALL_MAX_WORDS) {
-      std::unique_ptr<BN_ULONG[]> a_words(new BN_ULONG[m->top]),
-          b_words(new BN_ULONG[m->top]), r_words(new BN_ULONG[m->top]);
-      OPENSSL_memset(a_words.get(), 0, m->top * sizeof(BN_ULONG));
-      OPENSSL_memcpy(a_words.get(), a->d, a->top * sizeof(BN_ULONG));
-      OPENSSL_memset(b_words.get(), 0, m->top * sizeof(BN_ULONG));
-      OPENSSL_memcpy(b_words.get(), b->d, b->top * sizeof(BN_ULONG));
-      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m->top, a_words.get(),
-                                         m->top, mont.get()));
-      ASSERT_TRUE(bn_to_montgomery_small(b_words.get(), m->top, b_words.get(),
-                                         m->top, mont.get()));
+    size_t m_width = static_cast<size_t>(bn_minimal_width(m.get()));
+    if (m_width <= BN_SMALL_MAX_WORDS) {
+      std::unique_ptr<BN_ULONG[]> a_words(new BN_ULONG[m_width]),
+          b_words(new BN_ULONG[m_width]), r_words(new BN_ULONG[m_width]);
+      ASSERT_TRUE(bn_copy_words(a_words.get(), m_width, a.get()));
+      ASSERT_TRUE(bn_copy_words(b_words.get(), m_width, b.get()));
+      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m_width, a_words.get(),
+                                         m_width, mont.get()));
+      ASSERT_TRUE(bn_to_montgomery_small(b_words.get(), m_width, b_words.get(),
+                                         m_width, mont.get()));
       ASSERT_TRUE(bn_mod_mul_montgomery_small(
-          r_words.get(), m->top, a_words.get(), m->top, b_words.get(), m->top,
+          r_words.get(), m_width, a_words.get(), m_width, b_words.get(), m_width,
           mont.get()));
       // Use the second half of |tmp| so ASan will catch out-of-bounds writes.
-      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m->top, r_words.get(),
-                                           m->top, mont.get()));
-      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m->top));
+      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m_width, r_words.get(),
+                                           m_width, mont.get()));
+      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m_width));
       EXPECT_BIGNUMS_EQUAL("A * B (mod M) (Montgomery, words)", mod_mul.get(),
                            ret.get());
     }
@@ -601,11 +603,11 @@ static void TestModSquare(FileTest *t, BN_CTX *ctx) {
 
   if (BN_is_odd(m.get())) {
     // Reduce |a| and test the Montgomery version.
-    bssl::UniquePtr<BN_MONT_CTX> mont(BN_MONT_CTX_new());
+    bssl::UniquePtr<BN_MONT_CTX> mont(
+        BN_MONT_CTX_new_for_modulus(m.get(), ctx));
     bssl::UniquePtr<BIGNUM> a_tmp(BN_new());
     ASSERT_TRUE(mont);
     ASSERT_TRUE(a_tmp);
-    ASSERT_TRUE(BN_MONT_CTX_set(mont.get(), m.get(), ctx));
     ASSERT_TRUE(BN_nnmod(a.get(), a.get(), m.get(), ctx));
     ASSERT_TRUE(BN_to_montgomery(a_tmp.get(), a.get(), mont.get(), ctx));
     ASSERT_TRUE(BN_mod_mul_montgomery(ret.get(), a_tmp.get(), a_tmp.get(),
@@ -623,32 +625,32 @@ static void TestModSquare(FileTest *t, BN_CTX *ctx) {
                          ret.get());
 
 #if !defined(BORINGSSL_SHARED_LIBRARY)
-    if (m->top <= BN_SMALL_MAX_WORDS) {
-      std::unique_ptr<BN_ULONG[]> a_words(new BN_ULONG[m->top]),
-          a_copy_words(new BN_ULONG[m->top]), r_words(new BN_ULONG[m->top]);
-      OPENSSL_memset(a_words.get(), 0, m->top * sizeof(BN_ULONG));
-      OPENSSL_memcpy(a_words.get(), a->d, a->top * sizeof(BN_ULONG));
-      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m->top, a_words.get(),
-                                         m->top, mont.get()));
+    size_t m_width = static_cast<size_t>(bn_minimal_width(m.get()));
+    if (m_width <= BN_SMALL_MAX_WORDS) {
+      std::unique_ptr<BN_ULONG[]> a_words(new BN_ULONG[m_width]),
+          a_copy_words(new BN_ULONG[m_width]), r_words(new BN_ULONG[m_width]);
+      ASSERT_TRUE(bn_copy_words(a_words.get(), m_width, a.get()));
+      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m_width, a_words.get(),
+                                         m_width, mont.get()));
       ASSERT_TRUE(bn_mod_mul_montgomery_small(
-          r_words.get(), m->top, a_words.get(), m->top, a_words.get(), m->top,
-          mont.get()));
-      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m->top, r_words.get(),
-                                           m->top, mont.get()));
-      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m->top));
+          r_words.get(), m_width, a_words.get(), m_width, a_words.get(),
+          m_width, mont.get()));
+      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m_width,
+                                           r_words.get(), m_width, mont.get()));
+      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m_width));
       EXPECT_BIGNUMS_EQUAL("A * A (mod M) (Montgomery, words)",
                            mod_square.get(), ret.get());
 
       // Repeat the operation with |a_copy_words|.
       OPENSSL_memcpy(a_copy_words.get(), a_words.get(),
-                     m->top * sizeof(BN_ULONG));
+                     m_width * sizeof(BN_ULONG));
       ASSERT_TRUE(bn_mod_mul_montgomery_small(
-          r_words.get(), m->top, a_words.get(), m->top, a_copy_words.get(),
-          m->top, mont.get()));
+          r_words.get(), m_width, a_words.get(), m_width, a_copy_words.get(),
+          m_width, mont.get()));
       // Use the second half of |tmp| so ASan will catch out-of-bounds writes.
-      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m->top, r_words.get(),
-                                           m->top, mont.get()));
-      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m->top));
+      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m_width,
+                                           r_words.get(), m_width, mont.get()));
+      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m_width));
       EXPECT_BIGNUMS_EQUAL("A * A_copy (mod M) (Montgomery, words)",
                            mod_square.get(), ret.get());
     }
@@ -683,22 +685,22 @@ static void TestModExp(FileTest *t, BN_CTX *ctx) {
                          ret.get());
 
 #if !defined(BORINGSSL_SHARED_LIBRARY)
-    if (m->top <= BN_SMALL_MAX_WORDS) {
-      bssl::UniquePtr<BN_MONT_CTX> mont(BN_MONT_CTX_new());
+    size_t m_width = static_cast<size_t>(bn_minimal_width(m.get()));
+    if (m_width <= BN_SMALL_MAX_WORDS) {
+      bssl::UniquePtr<BN_MONT_CTX> mont(
+          BN_MONT_CTX_new_for_modulus(m.get(), ctx));
       ASSERT_TRUE(mont.get());
-      ASSERT_TRUE(BN_MONT_CTX_set(mont.get(), m.get(), ctx));
       ASSERT_TRUE(BN_nnmod(a.get(), a.get(), m.get(), ctx));
-      std::unique_ptr<BN_ULONG[]> r_words(new BN_ULONG[m->top]),
-          a_words(new BN_ULONG[m->top]);
-      OPENSSL_memset(a_words.get(), 0, m->top * sizeof(BN_ULONG));
-      OPENSSL_memcpy(a_words.get(), a->d, a->top * sizeof(BN_ULONG));
-      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m->top, a_words.get(),
-                                         m->top, mont.get()));
-      ASSERT_TRUE(bn_mod_exp_mont_small(r_words.get(), m->top, a_words.get(),
-                                        m->top, e->d, e->top, mont.get()));
-      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m->top, r_words.get(),
-                                           m->top, mont.get()));
-      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m->top));
+      std::unique_ptr<BN_ULONG[]> r_words(new BN_ULONG[m_width]),
+          a_words(new BN_ULONG[m_width]);
+      ASSERT_TRUE(bn_copy_words(a_words.get(), m_width, a.get()));
+      ASSERT_TRUE(bn_to_montgomery_small(a_words.get(), m_width, a_words.get(),
+                                         m_width, mont.get()));
+      ASSERT_TRUE(bn_mod_exp_mont_small(r_words.get(), m_width, a_words.get(),
+                                        m_width, e->d, e->top, mont.get()));
+      ASSERT_TRUE(bn_from_montgomery_small(r_words.get(), m_width,
+                                           r_words.get(), m_width, mont.get()));
+      ASSERT_TRUE(bn_set_words(ret.get(), r_words.get(), m_width));
       EXPECT_BIGNUMS_EQUAL("A ^ E (mod M) (Montgomery, words)", mod_exp.get(),
                            ret.get());
     }
@@ -862,6 +864,17 @@ TEST_F(BNTest, BN2BinPadded) {
     EXPECT_EQ(Bytes(zeros, sizeof(out) - bytes),
               Bytes(out, sizeof(out) - bytes));
     EXPECT_EQ(Bytes(reference, bytes), Bytes(out + sizeof(out) - bytes, bytes));
+
+#if !defined(BORINGSSL_SHARED_LIBRARY)
+    // Repeat some tests with a non-minimal |BIGNUM|.
+    EXPECT_TRUE(bn_resize_words(n.get(), 32));
+
+    EXPECT_FALSE(BN_bn2bin_padded(out, bytes - 1, n.get()));
+
+    ASSERT_TRUE(BN_bn2bin_padded(out, bytes + 1, n.get()));
+    EXPECT_EQ(0u, out[0]);
+    EXPECT_EQ(Bytes(reference, bytes), Bytes(out + 1, bytes));
+#endif
   }
 }
 
@@ -1267,11 +1280,9 @@ TEST_F(BNTest, BadModulus) {
   bssl::UniquePtr<BIGNUM> a(BN_new());
   bssl::UniquePtr<BIGNUM> b(BN_new());
   bssl::UniquePtr<BIGNUM> zero(BN_new());
-  bssl::UniquePtr<BN_MONT_CTX> mont(BN_MONT_CTX_new());
   ASSERT_TRUE(a);
   ASSERT_TRUE(b);
   ASSERT_TRUE(zero);
-  ASSERT_TRUE(mont);
 
   BN_zero(zero.get());
 
@@ -1294,13 +1305,16 @@ TEST_F(BNTest, BadModulus) {
       a.get(), BN_value_one(), BN_value_one(), zero.get(), ctx(), nullptr));
   ERR_clear_error();
 
-  EXPECT_FALSE(BN_MONT_CTX_set(mont.get(), zero.get(), ctx()));
+  bssl::UniquePtr<BN_MONT_CTX> mont(
+      BN_MONT_CTX_new_for_modulus(zero.get(), ctx()));
+  EXPECT_FALSE(mont);
   ERR_clear_error();
 
   // Some operations also may not be used with an even modulus.
   ASSERT_TRUE(BN_set_word(b.get(), 16));
 
-  EXPECT_FALSE(BN_MONT_CTX_set(mont.get(), b.get(), ctx()));
+  mont.reset(BN_MONT_CTX_new_for_modulus(b.get(), ctx()));
+  EXPECT_FALSE(mont);
   ERR_clear_error();
 
   EXPECT_FALSE(BN_mod_exp_mont(a.get(), BN_value_one(), BN_value_one(), b.get(),
@@ -1883,4 +1897,100 @@ TEST_F(BNTest, LessThanWords) {
   EXPECT_EQ(0, bn_less_than_words(NULL, NULL, 0));
   EXPECT_EQ(0, bn_in_range_words(NULL, 0, NULL, 0));
 }
+
+// NonMinimal checks that functions which inspect a |BIGNUM|'s value give the
+// same answers when the |BIGNUM| has been widened with |bn_resize_words|, and
+// that a |BN_MONT_CTX| built from a widened modulus matches one built from the
+// original.
+TEST_F(BNTest, NonMinimal) {
+  bssl::UniquePtr<BIGNUM> ten(BN_new());
+  ASSERT_TRUE(ten);
+  ASSERT_TRUE(BN_set_word(ten.get(), 10));
+
+  // |ten_copy| is never resized and serves as the reference value below.
+  bssl::UniquePtr<BIGNUM> ten_copy(BN_dup(ten.get()));
+  ASSERT_TRUE(ten_copy);
+
+  bssl::UniquePtr<BIGNUM> eight(BN_new());
+  ASSERT_TRUE(eight);
+  ASSERT_TRUE(BN_set_word(eight.get(), 8));
+
+  bssl::UniquePtr<BIGNUM> forty_two(BN_new());
+  ASSERT_TRUE(forty_two);
+  ASSERT_TRUE(BN_set_word(forty_two.get(), 42));
+
+  // 2^256 does not fit in a single word, so comparisons against it involve
+  // operands of differing widths.
+  bssl::UniquePtr<BIGNUM> two_exp_256(BN_new());
+  ASSERT_TRUE(two_exp_256);
+  ASSERT_TRUE(BN_lshift(two_exp_256.get(), BN_value_one(), 256));
+
+  // Check some comparison functions on |ten| before and after expanding.
+  for (size_t width = 1; width < 10; width++) {
+    SCOPED_TRACE(width);
+    // Make a wider version of |ten|.
+    EXPECT_TRUE(bn_resize_words(ten.get(), width));
+    EXPECT_EQ(static_cast<int>(width), ten->top);
+
+    EXPECT_TRUE(BN_abs_is_word(ten.get(), 10));
+    EXPECT_TRUE(BN_is_word(ten.get(), 10));
+    EXPECT_EQ(10u, BN_get_word(ten.get()));
+    uint64_t v;
+    ASSERT_TRUE(BN_get_u64(ten.get(), &v));
+    EXPECT_EQ(10u, v);
+
+    // Equal values: both argument orders are checked.
+    EXPECT_TRUE(BN_equal_consttime(ten.get(), ten_copy.get()));
+    EXPECT_TRUE(BN_equal_consttime(ten_copy.get(), ten.get()));
+    EXPECT_FALSE(BN_less_than_consttime(ten.get(), ten_copy.get()));
+    EXPECT_FALSE(BN_less_than_consttime(ten_copy.get(), ten.get()));
+    EXPECT_EQ(BN_cmp(ten.get(), ten_copy.get()), 0);
+
+    // A smaller value.
+    EXPECT_FALSE(BN_equal_consttime(ten.get(), eight.get()));
+    EXPECT_FALSE(BN_less_than_consttime(ten.get(), eight.get()));
+    EXPECT_TRUE(BN_less_than_consttime(eight.get(), ten.get()));
+    EXPECT_LT(BN_cmp(eight.get(), ten.get()), 0);
+
+    // A larger single-word value.
+    EXPECT_FALSE(BN_equal_consttime(ten.get(), forty_two.get()));
+    EXPECT_TRUE(BN_less_than_consttime(ten.get(), forty_two.get()));
+    EXPECT_FALSE(BN_less_than_consttime(forty_two.get(), ten.get()));
+    EXPECT_GT(BN_cmp(forty_two.get(), ten.get()), 0);
+
+    // A larger multi-word value.
+    EXPECT_FALSE(BN_equal_consttime(ten.get(), two_exp_256.get()));
+    EXPECT_TRUE(BN_less_than_consttime(ten.get(), two_exp_256.get()));
+    EXPECT_FALSE(BN_less_than_consttime(two_exp_256.get(), ten.get()));
+    EXPECT_GT(BN_cmp(two_exp_256.get(), ten.get()), 0);
+
+    // Size queries report the value's size, not the padded width.
+    EXPECT_EQ(4u, BN_num_bits(ten.get()));
+    EXPECT_EQ(1u, BN_num_bytes(ten.get()));
+    EXPECT_FALSE(BN_is_pow2(ten.get()));
+  }
+
+  // |ten| may be resized back down to one word.
+  EXPECT_TRUE(bn_resize_words(ten.get(), 1));
+  EXPECT_EQ(1, ten->top);
+
+  // But not to zero words, which it does not fit.
+  EXPECT_FALSE(bn_resize_words(ten.get(), 0));
+
+  // |BN_is_pow2| must give the same answer before and after widening.
+  EXPECT_TRUE(BN_is_pow2(eight.get()));
+  EXPECT_TRUE(bn_resize_words(eight.get(), 4));
+  EXPECT_EQ(4, eight->top);
+  EXPECT_TRUE(BN_is_pow2(eight.get()));
+
+  // |BN_MONT_CTX| is always stored minimally and uses the same R independent of
+  // input width.
+  // |kP| is the 256-bit NIST P-256 field prime in big-endian byte order.
+  static const uint8_t kP[] = {
+      0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  };
+  bssl::UniquePtr<BIGNUM> p(BN_bin2bn(kP, sizeof(kP), nullptr));
+  ASSERT_TRUE(p);
+
+  // |mont| is built from the modulus as parsed; |mont2| from the same modulus
+  // after it has been widened to 32 words.
+  bssl::UniquePtr<BN_MONT_CTX> mont(
+      BN_MONT_CTX_new_for_modulus(p.get(), ctx()));
+  ASSERT_TRUE(mont);
+
+  ASSERT_TRUE(bn_resize_words(p.get(), 32));
+  bssl::UniquePtr<BN_MONT_CTX> mont2(
+      BN_MONT_CTX_new_for_modulus(p.get(), ctx()));
+  ASSERT_TRUE(mont2);
+
+  EXPECT_EQ(mont->N.top, mont2->N.top);
+  EXPECT_EQ(0, BN_cmp(&mont->RR, &mont2->RR));
+}
+
 #endif  // !BORINGSSL_SHARED_LIBRARY
diff --git a/src/crypto/fipsmodule/bn/bytes.c b/src/crypto/fipsmodule/bn/bytes.c
index 328d56e7..aa654835 100644
--- a/src/crypto/fipsmodule/bn/bytes.c
+++ b/src/crypto/fipsmodule/bn/bytes.c
@@ -159,22 +159,9 @@ size_t BN_bn2bin(const BIGNUM *in, uint8_t *out) {
   return n;
 }
 
-int BN_bn2le_padded(uint8_t *out, size_t len, const BIGNUM *in) {
-  // If we don't have enough space, fail out.
-  size_t num_bytes = BN_num_bytes(in);
-  if (len < num_bytes) {
-    return 0;
-  }
-
-  // We only support little-endian platforms, so we can simply memcpy into the
-  // internal representation.
-  OPENSSL_memcpy(out, in->d, num_bytes);
-
-  // Pad out the rest of the buffer with zeroes.
-  OPENSSL_memset(out + num_bytes, 0, len - num_bytes);
-
-  return 1;
-}
+// TODO(davidben): This does not need to be quite so complex once the |BIGNUM|s
+// we care about are fixed-width. |read_word_padded| is a hack to paper over
+// parts of the |bn_correct_top| leak. Fix that, and this can be simpler.
 
 // constant_time_select_ulong returns |x| if |v| is 1 and |y| if |v| is 0. Its
 // behavior is undefined if |v| takes any other value.
@@ -197,6 +184,10 @@ static int constant_time_le_size_t(size_t x, size_t y) {
 // the access would be out of bounds, it reads the last word of |in|. |in| must
 // not be zero.
 static BN_ULONG read_word_padded(const BIGNUM *in, size_t i) {
+  if (in->dmax == 0) {
+    return 0;
+  }
+
   // Read |in->d[i]| if valid. Otherwise, read the last word.
   BN_ULONG l = in->d[constant_time_select_ulong(
       constant_time_le_size_t(in->dmax, i), in->dmax - 1, i)];
@@ -205,24 +196,45 @@ static BN_ULONG read_word_padded(const BIGNUM *in, size_t i) {
   return constant_time_select_ulong(constant_time_le_size_t(in->top, i), 0, l);
 }
 
-int BN_bn2bin_padded(uint8_t *out, size_t len, const BIGNUM *in) {
-  // Special case for |in| = 0. Just branch as the probability is negligible.
-  if (BN_is_zero(in)) {
-    OPENSSL_memset(out, 0, len);
-    return 1;
+// fits_in_bytes returns one if the value held in |in|'s words can be
+// represented in |len| bytes and zero otherwise. Every word of |in| at index
+// ceil(len / BN_BYTES) or above must be zero and, when |len| is not a multiple
+// of |BN_BYTES|, so must the bits of word |len / BN_BYTES| above its low
+// |len % BN_BYTES| bytes. The sign of |in| is not consulted.
+static int fits_in_bytes(const BIGNUM *in, size_t len) {
+  // OR the excess words together rather than returning at the first non-zero
+  // one.
+  BN_ULONG mask = 0;
+  for (size_t i = (len + (BN_BYTES - 1)) / BN_BYTES; i < (size_t)in->top; i++) {
+    mask |= in->d[i];
   }
+  if ((len % BN_BYTES) != 0) {
+    // |len| ends partway through a word: the bytes of that word that lie
+    // beyond |len| must be zero too.
+    BN_ULONG l = read_word_padded(in, len / BN_BYTES);
+    mask |= l >> (8 * (len % BN_BYTES));
+  }
+  return mask == 0;
+}
 
+int BN_bn2le_padded(uint8_t *out, size_t len, const BIGNUM *in) {
+  // Fail if the value of |in| cannot be represented in |len| bytes.
+  if (!fits_in_bytes(in, len)) {
+    return 0;
+  }
+
+  // Only little-endian platforms are supported, so the words of |in| already
+  // hold the little-endian byte encoding and may be copied out directly. |in|
+  // may be wider than |len| bytes (with zero upper words), so the copy is
+  // clamped to |len|.
+  const size_t width_bytes = in->top * BN_BYTES;
+  const size_t copy_len = width_bytes < len ? width_bytes : len;
+  OPENSSL_memcpy(out, in->d, copy_len);
+
+  // Zero whatever remains of the output buffer.
+  OPENSSL_memset(out + copy_len, 0, len - copy_len);
+  return 1;
+}
+
+int BN_bn2bin_padded(uint8_t *out, size_t len, const BIGNUM *in) {
   // Check if the integer is too big. This case can exit early in non-constant
   // time.
-  if ((size_t)in->top > (len + (BN_BYTES - 1)) / BN_BYTES) {
+  if (!fits_in_bytes(in, len)) {
     return 0;
   }
-  if ((len % BN_BYTES) != 0) {
-    BN_ULONG l = read_word_padded(in, len / BN_BYTES);
-    if (l >> (8 * (len % BN_BYTES)) != 0) {
-      return 0;
-    }
-  }
 
   // Write the bytes out one by one. Serialization is done without branching on
   // the bits of |in| or on |in->top|, but if the routine would otherwise read
@@ -240,7 +252,7 @@ int BN_bn2bin_padded(uint8_t *out, size_t len, const BIGNUM *in) {
 }
 
 BN_ULONG BN_get_word(const BIGNUM *bn) {
-  switch (bn->top) {
+  switch (bn_minimal_width(bn)) {
     case 0:
       return 0;
     case 1:
@@ -251,7 +263,7 @@ BN_ULONG BN_get_word(const BIGNUM *bn) {
 }
 
 int BN_get_u64(const BIGNUM *bn, uint64_t *out) {
-  switch (bn->top) {
+  switch (bn_minimal_width(bn)) {
     case 0:
       *out = 0;
       return 1;
diff --git a/src/crypto/fipsmodule/bn/cmp.c b/src/crypto/fipsmodule/bn/cmp.c
index acc017ff..265c8526 100644
--- a/src/crypto/fipsmodule/bn/cmp.c
+++ b/src/crypto/fipsmodule/bn/cmp.c
@@ -64,19 +64,18 @@
 
 
 int BN_ucmp(const BIGNUM *a, const BIGNUM *b) {
-  int i;
-  BN_ULONG t1, t2, *ap, *bp;
-
-  i = a->top - b->top;
+  int a_width = bn_minimal_width(a);
+  int b_width = bn_minimal_width(b);
+  int i = a_width - b_width;
   if (i != 0) {
     return i;
   }
 
-  ap = a->d;
-  bp = b->d;
-  for (i = a->top - 1; i >= 0; i--) {
-    t1 = ap[i];
-    t2 = bp[i];
+  const BN_ULONG *ap = a->d;
+  const BN_ULONG *bp = b->d;
+  for (i = a_width - 1; i >= 0; i--) {
+    BN_ULONG t1 = ap[i];
+    BN_ULONG t2 = bp[i];
     if (t1 != t2) {
       return (t1 > t2) ? 1 : -1;
     }
@@ -114,14 +113,16 @@ int BN_cmp(const BIGNUM *a, const BIGNUM *b) {
     lt = 1;
   }
 
-  if (a->top > b->top) {
+  int a_width = bn_minimal_width(a);
+  int b_width = bn_minimal_width(b);
+  if (a_width > b_width) {
     return gt;
   }
-  if (a->top < b->top) {
+  if (a_width < b_width) {
     return lt;
   }
 
-  for (i = a->top - 1; i >= 0; i--) {
+  for (i = a_width - 1; i >= 0; i--) {
     t1 = a->d[i];
     t2 = b->d[i];
     if (t1 > t2) {
@@ -176,21 +177,43 @@ int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, int cl, int dl) {
   return bn_cmp_words(a, b, cl);
 }
 
-int bn_less_than_words(const BN_ULONG *a, const BN_ULONG *b, size_t len) {
+// bn_less_than_words_impl returns one if |a| is less than |b| and zero
+// otherwise, where |a| and |b| are little-endian word arrays of |a_len| and
+// |b_len| words respectively. The lengths may differ. Word values are combined
+// only with OR and the constant_time_* helpers; the branches and loop bounds
+// depend on the lengths alone.
+static int bn_less_than_words_impl(const BN_ULONG *a, size_t a_len,
+                                   const BN_ULONG *b, size_t b_len) {
   OPENSSL_COMPILE_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
                          crypto_word_t_too_small);
   int ret = 0;
-  // Process the words in little-endian order.
-  for (size_t i = 0; i < len; i++) {
+  // Process the common words in little-endian order.
+  size_t min = a_len < b_len ? a_len : b_len;
+  for (size_t i = 0; i < min; i++) {
+    // A differing word overrides the verdict from the less significant words;
+    // an equal word keeps it.
     crypto_word_t eq = constant_time_eq_w(a[i], b[i]);
     crypto_word_t lt = constant_time_lt_w(a[i], b[i]);
     ret = constant_time_select_int(eq, ret, constant_time_select_int(lt, 1, 0));
   }
+
+  // If |a| or |b| has non-zero words beyond |min|, they take precedence.
+  if (a_len < b_len) {
+    crypto_word_t mask = 0;
+    for (size_t i = a_len; i < b_len; i++) {
+      mask |= b[i];
+    }
+    ret = constant_time_select_int(constant_time_is_zero_w(mask), ret, 1);
+  } else if (b_len < a_len) {
+    crypto_word_t mask = 0;
+    for (size_t i = b_len; i < a_len; i++) {
+      mask |= a[i];
+    }
+    ret = constant_time_select_int(constant_time_is_zero_w(mask), ret, 0);
+  }
+
   return ret;
 }
 
+// bn_less_than_words is the equal-length case of |bn_less_than_words_impl|:
+// both |a| and |b| are |len| words long.
+int bn_less_than_words(const BN_ULONG *a, const BN_ULONG *b, size_t len) {
+  return bn_less_than_words_impl(a, len, b, len);
+}
+
 int BN_abs_is_word(const BIGNUM *bn, BN_ULONG w) {
-  switch (bn->top) {
+  switch (bn_minimal_width(bn)) {
     case 1:
       return bn->d[0] == w;
     case 0:
@@ -212,7 +235,7 @@ int BN_cmp_word(const BIGNUM *a, BN_ULONG b) {
 }
 
 int BN_is_zero(const BIGNUM *bn) {
-  return bn->top == 0;
+  // |bn->top| may be larger than the value requires, so zero is detected from
+  // |bn_minimal_width| rather than from |top| alone.
+  return bn_minimal_width(bn) == 0;
 }
 
 int BN_is_one(const BIGNUM *bn) {
@@ -228,27 +251,52 @@ int BN_is_odd(const BIGNUM *bn) {
 }
 
 int BN_is_pow2(const BIGNUM *bn) {
-  if (bn->top == 0 || bn->neg) {
+  // Work from the minimal width so that zero words padding |bn| above its
+  // value are not treated as its most significant word.
+  int width = bn_minimal_width(bn);
+  if (width == 0 || bn->neg) {
     return 0;
   }
 
-  for (int i = 0; i < bn->top - 1; i++) {
+  // Every word below the most significant one must be zero.
+  for (int i = 0; i < width - 1; i++) {
     if (bn->d[i] != 0) {
       return 0;
     }
   }
 
-  return 0 == (bn->d[bn->top-1] & (bn->d[bn->top-1] - 1));
+  // Clearing the lowest set bit of the top word, x & (x - 1), leaves zero only
+  // if at most one bit was set.
+  return 0 == (bn->d[width-1] & (bn->d[width-1] - 1));
 }
 
 int BN_equal_consttime(const BIGNUM *a, const BIGNUM *b) {
-  if (a->top != b->top) {
-    return 0;
+  // Accumulate every difference into |mask| instead of returning at the first
+  // mismatch. |a| and |b| may have different |top| values and still be equal.
+  BN_ULONG mask = 0;
+  // If |a| or |b| has more words than the other, all those words must be zero.
+  // At most one of the next two loops executes any iterations.
+  for (int i = a->top; i < b->top; i++) {
+    mask |= b->d[i];
   }
+  for (int i = b->top; i < a->top; i++) {
+    mask |= a->d[i];
+  }
+  // Common words must match.
+  int min = a->top < b->top ? a->top : b->top;
+  for (int i = 0; i < min; i++) {
+    mask |= (a->d[i] ^ b->d[i]);
+  }
+  // The sign bit must match.
+  mask |= (a->neg ^ b->neg);
+  return mask == 0;
+}
 
-  int limbs_are_equal =
-    CRYPTO_memcmp(a->d, b->d, (size_t)a->top * sizeof(a->d[0])) == 0;
-
-  return constant_time_select_int(constant_time_eq_int(a->neg, b->neg),
-                                  limbs_are_equal, 0);
+int BN_less_than_consttime(const BIGNUM *a, const BIGNUM *b) {
+  // The signs are handled with ordinary branches and are not constant time.
+  // Negative |BIGNUM|s should never occur in crypto, only calculators.
+  const int a_negative = a->neg != 0;
+  const int b_negative = b->neg != 0;
+  if (a_negative != b_negative) {
+    // Exactly one operand is negative, and that operand is the smaller one.
+    return a_negative;
+  }
+  if (a_negative) {
+    // Both are negative, so the comparison of magnitudes is reversed.
+    return bn_less_than_words_impl(b->d, b->top, a->d, a->top);
+  }
+  return bn_less_than_words_impl(a->d, a->top, b->d, b->top);
 }
diff --git a/src/crypto/fipsmodule/bn/exponentiation.c b/src/crypto/fipsmodule/bn/exponentiation.c
index 63c1c050..9e0ddfbb 100644
--- a/src/crypto/fipsmodule/bn/exponentiation.c
+++ b/src/crypto/fipsmodule/bn/exponentiation.c
@@ -622,8 +622,8 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 
   // Allocate a montgomery context if it was not supplied by the caller.
   if (mont == NULL) {
-    new_mont = BN_MONT_CTX_new();
-    if (new_mont == NULL || !BN_MONT_CTX_set(new_mont, m, ctx)) {
+    new_mont = BN_MONT_CTX_new_for_modulus(m, ctx);
+    if (new_mont == NULL) {
       goto err;
     }
     mont = new_mont;
@@ -666,22 +666,7 @@ int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
     }
   }
 
-  // Set |r| to one in Montgomery form. If the high bit of |m| is set, |m| is
-  // close to R and we subtract rather than perform Montgomery reduction.
-  if (m->d[m->top - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
-    if (!bn_wexpand(r, m->top)) {
-      goto err;
-    }
-    // r = 2^(top*BN_BITS2) - m
-    r->d[0] = 0 - m->d[0];
-    for (int i = 1; i < m->top; i++) {
-      r->d[i] = ~m->d[i];
-    }
-    r->top = m->top;
-    // The upper words will be zero if the corresponding words of |m| were
-    // 0xfff[...], so call |bn_correct_top|.
-    bn_correct_top(r);
-  } else if (!BN_to_montgomery(r, BN_value_one(), mont, ctx)) {
+  if (!bn_one_to_montgomery(r, mont, ctx)) {
     goto err;
   }
 
@@ -746,7 +731,6 @@ err:
 int bn_mod_exp_mont_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
                           size_t num_a, const BN_ULONG *p, size_t num_p,
                           const BN_MONT_CTX *mont) {
-  const BN_ULONG *n = mont->N.d;
   size_t num_n = mont->N.top;
   if (num_n != num_a || num_n != num_r || num_n > BN_SMALL_MAX_WORDS) {
     OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
@@ -793,16 +777,7 @@ int bn_mod_exp_mont_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
     }
   }
 
-  // Set |r| to one in Montgomery form. If the high bit of |m| is set, |m| is
-  // close to R and we subtract rather than perform Montgomery reduction.
-  if (n[num_n - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
-    // r = 2^(top*BN_BITS2) - m
-    r[0] = 0 - n[0];
-    for (size_t i = 1; i < num_n; i++) {
-      r[i] = ~n[i];
-    }
-  } else if (!bn_from_montgomery_small(r, num_r, mont->RR.d, mont->RR.top,
-                                       mont)) {
+  if (!bn_one_to_montgomery_small(r, num_r, mont)) {
     goto err;
   }
 
@@ -1039,8 +1014,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 
   // Allocate a montgomery context if it was not supplied by the caller.
   if (mont == NULL) {
-    new_mont = BN_MONT_CTX_new();
-    if (new_mont == NULL || !BN_MONT_CTX_set(new_mont, m, ctx)) {
+    new_mont = BN_MONT_CTX_new_for_modulus(m, ctx);
+    if (new_mont == NULL) {
       goto err;
     }
     mont = new_mont;
@@ -1118,16 +1093,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
   tmp.neg = am.neg = 0;
   tmp.flags = am.flags = BN_FLG_STATIC_DATA;
 
-// prepare a^0 in Montgomery domain
-// by Shay Gueron's suggestion
-  if (m->d[top - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
-    // 2^(top*BN_BITS2) - m
-    tmp.d[0] = 0 - m->d[0];
-    for (i = 1; i < top; i++) {
-      tmp.d[i] = ~m->d[i];
-    }
-    tmp.top = top;
-  } else if (!BN_to_montgomery(&tmp, BN_value_one(), mont, ctx)) {
+  if (!bn_one_to_montgomery(&tmp, mont, ctx)) {
     goto err;
   }
 
@@ -1365,8 +1331,8 @@ int BN_mod_exp2_mont(BIGNUM *rr, const BIGNUM *a1, const BIGNUM *p1,
 
   // Allocate a montgomery context if it was not supplied by the caller.
   if (mont == NULL) {
-    new_mont = BN_MONT_CTX_new();
-    if (new_mont == NULL || !BN_MONT_CTX_set(new_mont, m, ctx)) {
+    new_mont = BN_MONT_CTX_new_for_modulus(m, ctx);
+    if (new_mont == NULL) {
       goto err;
     }
     mont = new_mont;
diff --git a/src/crypto/fipsmodule/bn/internal.h b/src/crypto/fipsmodule/bn/internal.h
index 706e544d..f3b8d8ad 100644
--- a/src/crypto/fipsmodule/bn/internal.h
+++ b/src/crypto/fipsmodule/bn/internal.h
@@ -197,8 +197,12 @@ extern "C" {
 #define Hw(t) ((BN_ULONG)((t) >> BN_BITS2))
 #endif
 
-// bn_correct_top decrements |bn->top| until |bn->d[top-1]| is non-zero or
-// until |top| is zero. If |bn| is zero, |bn->neg| is set to zero.
+// bn_minimal_width returns the minimal value of |bn->top| which fits the
+// value of |bn|.
+int bn_minimal_width(const BIGNUM *bn);
+
+// bn_correct_top decrements |bn->top| to |bn_minimal_width|. If |bn| is zero,
+// |bn->neg| is set to zero.
 void bn_correct_top(BIGNUM *bn);
 
 // bn_wexpand ensures that |bn| has at least |words| works of space without
@@ -210,10 +214,26 @@ int bn_wexpand(BIGNUM *bn, size_t words);
 // than a number of words.
 int bn_expand(BIGNUM *bn, size_t bits);
 
+// bn_resize_words adjusts |bn->top| to be |words|. It returns one on success
+// and zero on allocation error or if |bn|'s value is too large.
+//
+// Do not call this function outside of unit tests. Most functions currently
+// require |BIGNUM|s be minimal. This function breaks that invariant. It is
+// introduced early so the invariant may be relaxed incrementally.
+int bn_resize_words(BIGNUM *bn, size_t words);
+
 // bn_set_words sets |bn| to the value encoded in the |num| words in |words|,
 // least significant word first.
 int bn_set_words(BIGNUM *bn, const BN_ULONG *words, size_t num);
 
+// bn_fits_in_words returns one if |bn| may be represented in |num| words, plus
+// a sign bit, and zero otherwise.
+int bn_fits_in_words(const BIGNUM *bn, size_t num);
+
+// bn_copy_words copies the value of |bn| to |out| and returns one if the value
+// is representable in |num| words. Otherwise, it returns zero.
+int bn_copy_words(BN_ULONG *out, size_t num, const BIGNUM *bn);
+
 // bn_mul_add_words multiples |ap| by |w|, adds the result to |rp|, and places
 // the result in |rp|. |ap| and |rp| must both be |num| words long. It returns
 // the carry word of the operation. |ap| and |rp| may be equal but otherwise may
@@ -326,6 +346,15 @@ int bn_jacobi(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
 // otherwise.
 int bn_is_bit_set_words(const BN_ULONG *a, size_t num, unsigned bit);
 
+// bn_one_to_montgomery sets |r| to one in Montgomery form. It returns one on
+// success and zero on error. This function treats the bit width of the modulus
+// as public.
+int bn_one_to_montgomery(BIGNUM *r, const BN_MONT_CTX *mont, BN_CTX *ctx);
+
+// bn_less_than_montgomery_R returns one if |bn| is less than the Montgomery R
+// value for |mont| and zero otherwise.
+int bn_less_than_montgomery_R(const BIGNUM *bn, const BN_MONT_CTX *mont);
+
 
 // Low-level operations for small numbers.
 //
@@ -372,6 +401,13 @@ int bn_to_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
 int bn_from_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
                              size_t num_a, const BN_MONT_CTX *mont);
 
+// bn_one_to_montgomery_small sets |r| to one in Montgomery form. It returns one
+// on success and zero on error. |num_r| must be the length of the modulus,
+// which is |mont->N.top|. This function treats the bit width of the modulus as
+// public.
+int bn_one_to_montgomery_small(BN_ULONG *r, size_t num_r,
+                               const BN_MONT_CTX *mont);
+
 // bn_mod_mul_montgomery_small sets |r| to |a| * |b| mod |mont->N|. Both inputs
 // and outputs are in the Montgomery domain. |num_r| must be the length of the
 // modulus, which is |mont->N.top|. This function returns one on success and
diff --git a/src/crypto/fipsmodule/bn/montgomery.c b/src/crypto/fipsmodule/bn/montgomery.c
index e8505dae..a51725c7 100644
--- a/src/crypto/fipsmodule/bn/montgomery.c
+++ b/src/crypto/fipsmodule/bn/montgomery.c
@@ -126,10 +126,6 @@
 #define OPENSSL_BN_ASM_MONT
 #endif
 
-static int bn_mod_mul_montgomery_fallback(BIGNUM *r, const BIGNUM *a,
-                                          const BIGNUM *b,
-                                          const BN_MONT_CTX *mont, BN_CTX *ctx);
-
 
 BN_MONT_CTX *BN_MONT_CTX_new(void) {
   BN_MONT_CTX *ret = OPENSSL_malloc(sizeof(BN_MONT_CTX));
@@ -193,6 +189,10 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) {
     OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
     return 0;
   }
+  // |mont->N| is always stored minimally. Computing RR efficiently leaks the
+  // size of the modulus. While the modulus may be private in RSA (one of the
+  // primes), their sizes are public, so this is fine.
+  bn_correct_top(&mont->N);
 
   // Find n0 such that n0 * N == -1 (mod r).
   //
@@ -200,7 +200,7 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) {
   // others, we could use a shorter R value and use faster |BN_ULONG|-based
   // math instead of |uint64_t|-based math, which would be double-precision.
   // However, currently only the assembler files know which is which.
-  uint64_t n0 = bn_mont_n0(mod);
+  uint64_t n0 = bn_mont_n0(&mont->N);
   mont->n0[0] = (BN_ULONG)n0;
 #if BN_MONT_CTX_N0_LIMBS == 2
   mont->n0[1] = (BN_ULONG)(n0 >> BN_BITS2);
@@ -215,7 +215,7 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) {
   // as |BN_MONT_CTX_N0_LIMBS| is either one or two.
   //
   // XXX: This is not constant time with respect to |mont->N|, but it should be.
-  unsigned lgBigR = (BN_num_bits(mod) + (BN_BITS2 - 1)) / BN_BITS2 * BN_BITS2;
+  unsigned lgBigR = mont->N.top * BN_BITS2;
   if (!bn_mod_exp_base_2_vartime(&mont->RR, lgBigR * 2, &mont->N)) {
     return 0;
   }
@@ -223,6 +223,16 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) {
   return 1;
 }
 
+BN_MONT_CTX *BN_MONT_CTX_new_for_modulus(const BIGNUM *mod, BN_CTX *ctx) {
+  // Allocate a context and initialise it for |mod| in one step, so callers
+  // never observe a half-initialised |BN_MONT_CTX|.
+  BN_MONT_CTX *mont = BN_MONT_CTX_new();
+  if (mont != NULL && BN_MONT_CTX_set(mont, mod, ctx)) {
+    return mont;
+  }
+
+  // Either allocation or setup failed. |mont| is NULL in the former case.
+  BN_MONT_CTX_free(mont);
+  return NULL;
+}
+
 int BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, CRYPTO_MUTEX *lock,
                            const BIGNUM *mod, BN_CTX *bn_ctx) {
   CRYPTO_MUTEX_lock_read(lock);
@@ -234,25 +244,12 @@ int BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, CRYPTO_MUTEX *lock,
   }
 
   CRYPTO_MUTEX_lock_write(lock);
-  ctx = *pmont;
-  if (ctx) {
-    goto out;
-  }
-
-  ctx = BN_MONT_CTX_new();
-  if (ctx == NULL) {
-    goto out;
+  if (*pmont == NULL) {
+    *pmont = BN_MONT_CTX_new_for_modulus(mod, bn_ctx);
   }
-  if (!BN_MONT_CTX_set(ctx, mod, bn_ctx)) {
-    BN_MONT_CTX_free(ctx);
-    ctx = NULL;
-    goto out;
-  }
-  *pmont = ctx;
-
-out:
+  const int ok = *pmont != NULL;
   CRYPTO_MUTEX_unlock_write(lock);
-  return ctx != NULL;
+  return ok;
 }
 
 int BN_to_montgomery(BIGNUM *ret, const BIGNUM *a, const BN_MONT_CTX *mont,
@@ -304,6 +301,11 @@ static int bn_from_montgomery_in_place(BN_ULONG *r, size_t num_r, BN_ULONG *a,
 
 static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r,
                                    const BN_MONT_CTX *mont) {
+  if (r->neg) {
+    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
+    return 0;
+  }
+
   const BIGNUM *n = &mont->N;
   if (n->top == 0) {
     ret->top = 0;
@@ -311,21 +313,16 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r,
   }
 
   int max = (2 * n->top);  // carry is stored separately
-  if (!bn_wexpand(r, max) ||
+  if (!bn_resize_words(r, max) ||
       !bn_wexpand(ret, n->top)) {
     return 0;
   }
-  // Clear the top words of |r|.
-  if (max > r->top) {
-    OPENSSL_memset(r->d + r->top, 0, (max - r->top) * sizeof(BN_ULONG));
-  }
-  r->top = max;
   ret->top = n->top;
 
   if (!bn_from_montgomery_in_place(ret->d, ret->top, r->d, r->top, mont)) {
     return 0;
   }
-  ret->neg = r->neg;
+  ret->neg = 0;
 
   bn_correct_top(r);
   bn_correct_top(ret);
@@ -352,35 +349,27 @@ err:
   return ret;
 }
 
-int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
-                          const BN_MONT_CTX *mont, BN_CTX *ctx) {
-#if !defined(OPENSSL_BN_ASM_MONT)
-  return bn_mod_mul_montgomery_fallback(r, a, b, mont, ctx);
-#else
-  int num = mont->N.top;
-
-  // |bn_mul_mont| requires at least 128 bits of limbs, at least for x86.
-  if (num < (128 / BN_BITS2) ||
-      a->top != num ||
-      b->top != num) {
-    return bn_mod_mul_montgomery_fallback(r, a, b, mont, ctx);
-  }
-
-  if (!bn_wexpand(r, num)) {
-    return 0;
-  }
-  if (!bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) {
-    // The check above ensures this won't happen.
-    assert(0);
-    OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
-    return 0;
+int bn_one_to_montgomery(BIGNUM *r, const BN_MONT_CTX *mont, BN_CTX *ctx) {
+  // If the high bit of |n| is set, R = 2^(top*BN_BITS2) < 2 * |n|, so we
+  // compute R - |n| rather than perform Montgomery reduction.
+  const BIGNUM *n = &mont->N;
+  if (n->top > 0 && (n->d[n->top - 1] >> (BN_BITS2 - 1)) != 0) {
+    if (!bn_wexpand(r, n->top)) {
+      return 0;
+    }
+    // Negating word zero and inverting the remaining words yields R - |n| as
+    // long as word zero is non-zero, so that no carry propagates upward.
+    // NOTE(review): presumably guaranteed because Montgomery moduli are odd --
+    // confirm.
+    r->d[0] = 0 - n->d[0];
+    for (int i = 1; i < n->top; i++) {
+      r->d[i] = ~n->d[i];
+    }
+    r->top = n->top;
+    r->neg = 0;
+    // The upper words will be zero if the corresponding words of |n| were
+    // 0xfff[...], so call |bn_correct_top|.
+    bn_correct_top(r);
+    return 1;
   }
-  r->neg = a->neg ^ b->neg;
-  r->top = num;
-  bn_correct_top(r);
 
-  return 1;
-#endif
+  // Otherwise take |mont->RR| out of Montgomery form.
+  // NOTE(review): presumably RR is R^2 mod N, making the result R mod N --
+  // confirm against |BN_MONT_CTX_set|.
+  return BN_from_montgomery(r, &mont->RR, mont, ctx);
 }
 
 static int bn_mod_mul_montgomery_fallback(BIGNUM *r, const BIGNUM *a,
@@ -417,6 +406,44 @@ err:
   return ret;
 }
 
+int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                          const BN_MONT_CTX *mont, BN_CTX *ctx) {
+  // Negative inputs are rejected outright.
+  if (a->neg || b->neg) {
+    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
+    return 0;
+  }
+
+#if defined(OPENSSL_BN_ASM_MONT)
+  // Take the assembly path only when both inputs are exactly as wide as the
+  // modulus. |bn_mul_mont| requires at least 128 bits of limbs, at least for
+  // x86.
+  const int width = mont->N.top;
+  const int use_asm = width >= (128 / BN_BITS2) &&
+                      a->top == width &&
+                      b->top == width;
+  if (use_asm) {
+    if (!bn_wexpand(r, width)) {
+      return 0;
+    }
+    const int ok = bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, width);
+    if (!ok) {
+      // The width checks above ensure this won't happen.
+      assert(0);
+      OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
+      return 0;
+    }
+    r->top = width;
+    r->neg = 0;
+    bn_correct_top(r);
+    return 1;
+  }
+#endif
+
+  // All remaining cases go through the fallback implementation.
+  return bn_mod_mul_montgomery_fallback(r, a, b, mont, ctx);
+}
+
+int bn_less_than_montgomery_R(const BIGNUM *bn, const BN_MONT_CTX *mont) {
+  // A negative |bn| is never accepted.
+  if (BN_is_negative(bn)) {
+    return 0;
+  }
+  // Otherwise |bn| qualifies exactly when it fits in as many words as the
+  // modulus has.
+  return bn_fits_in_words(bn, mont->N.top) != 0;
+}
+
 int bn_to_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
                            size_t num_a, const BN_MONT_CTX *mont) {
   return bn_mod_mul_montgomery_small(r, num_r, a, num_a, mont->RR.d,
@@ -439,6 +466,28 @@ int bn_from_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
   return ret;
 }
 
+int bn_one_to_montgomery_small(BN_ULONG *r, size_t num_r,
+                               const BN_MONT_CTX *mont) {
+  // |r| must be exactly as wide as a non-empty modulus.
+  const size_t width = mont->N.top;
+  if (width == 0 || num_r != width) {
+    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+    return 0;
+  }
+
+  // |width| is non-zero here, so the top word may be read unconditionally.
+  const BN_ULONG *modulus = mont->N.d;
+  const BN_ULONG high_bit = modulus[width - 1] >> (BN_BITS2 - 1);
+  if (high_bit == 0) {
+    // General case: take |mont->RR| out of Montgomery form.
+    return bn_from_montgomery_small(r, num_r, mont->RR.d, mont->RR.top, mont);
+  }
+
+  // The high bit of the modulus is set, so R = 2^(width*BN_BITS2) is less than
+  // twice the modulus and R minus the modulus is computed directly instead of
+  // performing a Montgomery reduction.
+  r[0] = 0 - modulus[0];
+  for (size_t i = 1; i < width; i++) {
+    r[i] = ~modulus[i];
+  }
+  return 1;
+}
+
 int bn_mod_mul_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
                                 size_t num_a, const BN_ULONG *b, size_t num_b,
                                 const BN_MONT_CTX *mont) {
diff --git a/src/crypto/fipsmodule/bn/prime.c b/src/crypto/fipsmodule/bn/prime.c
index 691d0cba..a291f7a0 100644
--- a/src/crypto/fipsmodule/bn/prime.c
+++ b/src/crypto/fipsmodule/bn/prime.c
@@ -586,9 +586,8 @@ int BN_enhanced_miller_rabin_primality_test(
   }
 
   // Montgomery setup for computations mod A
-  mont = BN_MONT_CTX_new();
-  if (mont == NULL ||
-      !BN_MONT_CTX_set(mont, w, ctx)) {
+  mont = BN_MONT_CTX_new_for_modulus(w, ctx);
+  if (mont == NULL) {
     goto err;
   }
 
diff --git a/src/crypto/fipsmodule/ec/ec.c b/src/crypto/fipsmodule/ec/ec.c
index c9687a61..616df162 100644
--- a/src/crypto/fipsmodule/ec/ec.c
+++ b/src/crypto/fipsmodule/ec/ec.c
@@ -389,9 +389,8 @@ int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator,
   }
 
   BN_MONT_CTX_free(group->order_mont);
-  group->order_mont = BN_MONT_CTX_new();
-  if (group->order_mont == NULL ||
-      !BN_MONT_CTX_set(group->order_mont, &group->order, NULL)) {
+  group->order_mont = BN_MONT_CTX_new_for_modulus(&group->order, NULL);
+  if (group->order_mont == NULL) {
     return 0;
   }
 
@@ -448,9 +447,8 @@ static EC_GROUP *ec_group_new_from_data(const struct built_in_curve *curve) {
     goto err;
   }
 
-  group->order_mont = BN_MONT_CTX_new();
-  if (group->order_mont == NULL ||
-      !BN_MONT_CTX_set(group->order_mont, &group->order, ctx)) {
+  group->order_mont = BN_MONT_CTX_new_for_modulus(&group->order, ctx);
+  if (group->order_mont == NULL) {
     OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB);
     goto err;
   }
@@ -768,6 +766,15 @@ int EC_POINT_set_affine_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
   }
 
   if (!EC_POINT_is_on_curve(group, point, ctx)) {
+    // In the event of an error, defend against the caller not checking the
+    // return value by setting a known safe value: the base point.
+    const EC_POINT *generator = EC_GROUP_get0_generator(group);
+    // The generator can be missing if the caller is in the process of
+    // constructing an arbitrary group. In that case there is no safe value to
+    // substitute, so leave |point| alone and rely on the return value.
+    if (generator != NULL) {
+      EC_POINT_copy(point, generator);
+    }
     OPENSSL_PUT_ERROR(EC, EC_R_POINT_IS_NOT_ON_CURVE);
     return 0;
   }
@@ -952,12 +959,10 @@ int ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out,
 
 int ec_bignum_to_scalar_unchecked(const EC_GROUP *group, EC_SCALAR *out,
                                   const BIGNUM *in) {
-  if (BN_is_negative(in) || in->top > group->order.top) {
+  if (!bn_copy_words(out->words, group->order.top, in)) {
     OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR);
     return 0;
   }
-  OPENSSL_memset(out->words, 0, group->order.top * sizeof(BN_ULONG));
-  OPENSSL_memcpy(out->words, in->d, in->top * sizeof(BN_ULONG));
   return 1;
 }
 
diff --git a/src/crypto/fipsmodule/ec/ec_montgomery.c b/src/crypto/fipsmodule/ec/ec_montgomery.c
index 898cf07a..165c06f1 100644
--- a/src/crypto/fipsmodule/ec/ec_montgomery.c
+++ b/src/crypto/fipsmodule/ec/ec_montgomery.c
@@ -93,7 +93,6 @@ void ec_GFp_mont_group_finish(EC_GROUP *group) {
 int ec_GFp_mont_group_set_curve(EC_GROUP *group, const BIGNUM *p,
                                 const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
   BN_CTX *new_ctx = NULL;
-  BN_MONT_CTX *mont = NULL;
   int ret = 0;
 
   BN_MONT_CTX_free(group->mont);
@@ -106,18 +105,12 @@ int ec_GFp_mont_group_set_curve(EC_GROUP *group, const BIGNUM *p,
     }
   }
 
-  mont = BN_MONT_CTX_new();
-  if (mont == NULL) {
-    goto err;
-  }
-  if (!BN_MONT_CTX_set(mont, p, ctx)) {
+  group->mont = BN_MONT_CTX_new_for_modulus(p, ctx);
+  if (group->mont == NULL) {
     OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB);
     goto err;
   }
 
-  group->mont = mont;
-  mont = NULL;
-
   ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
 
   if (!ret) {
@@ -127,7 +120,6 @@ int ec_GFp_mont_group_set_curve(EC_GROUP *group, const BIGNUM *p,
 
 err:
   BN_CTX_free(new_ctx);
-  BN_MONT_CTX_free(mont);
   return ret;
 }
 
diff --git a/src/crypto/fipsmodule/ec/ec_test.cc b/src/crypto/fipsmodule/ec/ec_test.cc
index e69f8d72..8a215e91 100644
--- a/src/crypto/fipsmodule/ec/ec_test.cc
+++ b/src/crypto/fipsmodule/ec/ec_test.cc
@@ -28,6 +28,7 @@
 #include <openssl/nid.h>
 #include <openssl/obj.h>
 
+#include "../bn/internal.h"
 #include "../../test/test_util.h"
 
 
@@ -553,6 +554,32 @@ TEST_P(ECCurveTest, Mul) {
   EXPECT_EQ(0, EC_POINT_cmp(group.get(), result.get(), generator, nullptr));
 }
 
+#if !defined(BORINGSSL_SHARED_LIBRARY)
+TEST_P(ECCurveTest, MulNonMinimal) {
+  bssl::UniquePtr<EC_GROUP> group(EC_GROUP_new_by_curve_name(GetParam().nid));
+  ASSERT_TRUE(group);
+
+  bssl::UniquePtr<BIGNUM> forty_two(BN_new());
+  ASSERT_TRUE(forty_two);
+  ASSERT_TRUE(BN_set_word(forty_two.get(), 42));
+
+  // Compute g × 42.
+  bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group.get()));
+  ASSERT_TRUE(point);
+  ASSERT_TRUE(EC_POINT_mul(group.get(), point.get(), forty_two.get(), nullptr,
+                           nullptr, nullptr));
+
+  // Compute it again with a non-minimal 42, much larger than the scalar.
+  ASSERT_TRUE(bn_resize_words(forty_two.get(), 64));
+
+  bssl::UniquePtr<EC_POINT> point2(EC_POINT_new(group.get()));
+  ASSERT_TRUE(point2);
+  ASSERT_TRUE(EC_POINT_mul(group.get(), point2.get(), forty_two.get(), nullptr,
+                           nullptr, nullptr));
+  EXPECT_EQ(0, EC_POINT_cmp(group.get(), point.get(), point2.get(), nullptr));
+}
+#endif  // BORINGSSL_SHARED_LIBRARY
+
 // Test that EC_KEY_set_private_key rejects invalid values.
 TEST_P(ECCurveTest, SetInvalidPrivateKey) {
   bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(GetParam().nid));
@@ -572,6 +599,43 @@ TEST_P(ECCurveTest, SetInvalidPrivateKey) {
   ERR_clear_error();
 }
 
+TEST_P(ECCurveTest, IgnoreOct2PointReturnValue) {
+  bssl::UniquePtr<EC_GROUP> group(EC_GROUP_new_by_curve_name(GetParam().nid));
+  ASSERT_TRUE(group);
+
+  bssl::UniquePtr<BIGNUM> forty_two(BN_new());
+  ASSERT_TRUE(forty_two);
+  ASSERT_TRUE(BN_set_word(forty_two.get(), 42));
+
+  // Compute g × 42.
+  bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group.get()));
+  ASSERT_TRUE(point);
+  ASSERT_TRUE(EC_POINT_mul(group.get(), point.get(), forty_two.get(), nullptr,
+                           nullptr, nullptr));
+
+  // Serialize the point.
+  size_t serialized_len =
+      EC_POINT_point2oct(group.get(), point.get(),
+                         POINT_CONVERSION_UNCOMPRESSED, nullptr, 0, nullptr);
+  ASSERT_NE(0u, serialized_len);
+
+  std::vector<uint8_t> serialized(serialized_len);
+  ASSERT_EQ(serialized_len,
+            EC_POINT_point2oct(group.get(), point.get(),
+                               POINT_CONVERSION_UNCOMPRESSED, serialized.data(),
+                               serialized_len, nullptr));
+
+  // Create a serialized point that is not on the curve.
+  serialized[serialized_len - 1]++;
+
+  ASSERT_FALSE(EC_POINT_oct2point(group.get(), point.get(), serialized.data(),
+                                  serialized.size(), nullptr));
+  // After a failure, |point| should have been set to the generator to defend
+  // against code that doesn't check the return value.
+  ASSERT_EQ(0, EC_POINT_cmp(group.get(), point.get(),
+                            EC_GROUP_get0_generator(group.get()), nullptr));
+}
+
 static std::vector<EC_builtin_curve> AllCurves() {
   const size_t num_curves = EC_get_builtin_curves(nullptr, 0);
   std::vector<EC_builtin_curve> curves(num_curves);
diff --git a/src/crypto/fipsmodule/ec/oct.c b/src/crypto/fipsmodule/ec/oct.c
index 96c138a1..3a6b4dd3 100644
--- a/src/crypto/fipsmodule/ec/oct.c
+++ b/src/crypto/fipsmodule/ec/oct.c
@@ -77,11 +77,9 @@ static size_t ec_GFp_simple_point2oct(const EC_GROUP *group,
                                       const EC_POINT *point,
                                       point_conversion_form_t form,
                                       uint8_t *buf, size_t len, BN_CTX *ctx) {
-  size_t ret;
+  size_t ret = 0;
   BN_CTX *new_ctx = NULL;
   int used_ctx = 0;
-  BIGNUM *x, *y;
-  size_t field_len, i;
 
   if ((form != POINT_CONVERSION_COMPRESSED) &&
       (form != POINT_CONVERSION_UNCOMPRESSED)) {
@@ -94,14 +92,16 @@ static size_t ec_GFp_simple_point2oct(const EC_GROUP *group,
     goto err;
   }
 
-  // ret := required output buffer length
-  field_len = BN_num_bytes(&group->field);
-  ret =
-      (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2 * field_len;
+  const size_t field_len = BN_num_bytes(&group->field);
+  size_t output_len = 1 /* type byte */ + field_len;
+  if (form == POINT_CONVERSION_UNCOMPRESSED) {
+    // Uncompressed points have a second coordinate.
+    output_len += field_len;
+  }
 
   // if 'buf' is NULL, just return required length
   if (buf != NULL) {
-    if (len < ret) {
+    if (len < output_len) {
       OPENSSL_PUT_ERROR(EC, EC_R_BUFFER_TOO_SMALL);
       goto err;
     }
@@ -115,8 +115,8 @@ static size_t ec_GFp_simple_point2oct(const EC_GROUP *group,
 
     BN_CTX_start(ctx);
     used_ctx = 1;
-    x = BN_CTX_get(ctx);
-    y = BN_CTX_get(ctx);
+    BIGNUM *x = BN_CTX_get(ctx);
+    BIGNUM *y = BN_CTX_get(ctx);
     if (y == NULL) {
       goto err;
     }
@@ -131,7 +131,7 @@ static size_t ec_GFp_simple_point2oct(const EC_GROUP *group,
     } else {
       buf[0] = form;
     }
-    i = 1;
+    size_t i = 1;
 
     if (!BN_bn2bin_padded(buf + i, field_len, x)) {
       OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR);
@@ -147,70 +147,66 @@ static size_t ec_GFp_simple_point2oct(const EC_GROUP *group,
       i += field_len;
     }
 
-    if (i != ret) {
+    if (i != output_len) {
       OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR);
       goto err;
     }
   }
 
-  if (used_ctx) {
-    BN_CTX_end(ctx);
-  }
-  BN_CTX_free(new_ctx);
-  return ret;
+  ret = output_len;
 
 err:
   if (used_ctx) {
     BN_CTX_end(ctx);
   }
   BN_CTX_free(new_ctx);
-  return 0;
+  return ret;
 }
 
-
 static int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
                                    const uint8_t *buf, size_t len,
                                    BN_CTX *ctx) {
-  point_conversion_form_t form;
-  int y_bit;
   BN_CTX *new_ctx = NULL;
-  BIGNUM *x, *y;
-  size_t field_len, enc_len;
-  int ret = 0;
+  int ret = 0, used_ctx = 0;
 
   if (len == 0) {
     OPENSSL_PUT_ERROR(EC, EC_R_BUFFER_TOO_SMALL);
-    return 0;
+    goto err;
   }
-  form = buf[0];
-  y_bit = form & 1;
+
+  point_conversion_form_t form = buf[0];
+  const int y_bit = form & 1;
   form = form & ~1U;
   if ((form != POINT_CONVERSION_COMPRESSED &&
        form != POINT_CONVERSION_UNCOMPRESSED) ||
       (form == POINT_CONVERSION_UNCOMPRESSED && y_bit)) {
     OPENSSL_PUT_ERROR(EC, EC_R_INVALID_ENCODING);
-    return 0;
+    goto err;
   }
 
-  field_len = BN_num_bytes(&group->field);
-  enc_len =
-      (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2 * field_len;
+  const size_t field_len = BN_num_bytes(&group->field);
+  size_t enc_len = 1 /* type byte */ + field_len;
+  if (form == POINT_CONVERSION_UNCOMPRESSED) {
+    // Uncompressed points have a second coordinate.
+    enc_len += field_len;
+  }
 
   if (len != enc_len) {
     OPENSSL_PUT_ERROR(EC, EC_R_INVALID_ENCODING);
-    return 0;
+    goto err;
   }
 
   if (ctx == NULL) {
     ctx = new_ctx = BN_CTX_new();
     if (ctx == NULL) {
-      return 0;
+      goto err;
     }
   }
 
   BN_CTX_start(ctx);
-  x = BN_CTX_get(ctx);
-  y = BN_CTX_get(ctx);
+  used_ctx = 1;
+  BIGNUM *x = BN_CTX_get(ctx);
+  BIGNUM *y = BN_CTX_get(ctx);
   if (x == NULL || y == NULL) {
     goto err;
   }
@@ -244,7 +240,9 @@ static int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
   ret = 1;
 
 err:
-  BN_CTX_end(ctx);
+  if (used_ctx) {
+    BN_CTX_end(ctx);
+  }
   BN_CTX_free(new_ctx);
   return ret;
 }
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64.c b/src/crypto/fipsmodule/ec/p256-x86_64.c
index 0e79b6dc..ec371bf0 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/src/crypto/fipsmodule/ec/p256-x86_64.c
@@ -205,13 +205,7 @@ static void ecp_nistz256_mod_inverse_mont(BN_ULONG r[P256_LIMBS],
 // returns one if it fits. Otherwise it returns zero.
 static int ecp_nistz256_bignum_to_field_elem(BN_ULONG out[P256_LIMBS],
                                              const BIGNUM *in) {
-  if (in->top > P256_LIMBS) {
-    return 0;
-  }
-
-  OPENSSL_memset(out, 0, sizeof(BN_ULONG) * P256_LIMBS);
-  OPENSSL_memcpy(out, in->d, sizeof(BN_ULONG) * in->top);
-  return 1;
+  return bn_copy_words(out, P256_LIMBS, in);
 }
 
 // r = p * p_scalar
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64_test.cc b/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
index a802bfb5..5cd701ba 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
+++ b/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
@@ -160,17 +160,16 @@ static bool PointToAffine(P256_POINT_AFFINE *out, const P256_POINT *in) {
     return false;
   }
 
-  OPENSSL_memset(out, 0, sizeof(P256_POINT_AFFINE));
-
   if (BN_is_zero(z.get())) {
     // The point at infinity is represented as (0, 0).
+    OPENSSL_memset(out, 0, sizeof(P256_POINT_AFFINE));
     return true;
   }
 
   bssl::UniquePtr<BN_CTX> ctx(BN_CTX_new());
-  bssl::UniquePtr<BN_MONT_CTX> mont(BN_MONT_CTX_new());
+  bssl::UniquePtr<BN_MONT_CTX> mont(
+      BN_MONT_CTX_new_for_modulus(p.get(), ctx.get()));
   if (!ctx || !mont ||
-      !BN_MONT_CTX_set(mont.get(), p.get(), ctx.get()) ||
       // Invert Z.
       !BN_from_montgomery(z.get(), z.get(), mont.get(), ctx.get()) ||
       !BN_mod_inverse(z.get(), z.get(), p.get(), ctx.get()) ||
@@ -185,12 +184,11 @@ static bool PointToAffine(P256_POINT_AFFINE *out, const P256_POINT *in) {
       !BN_mod_mul_montgomery(y.get(), y.get(), z.get(), mont.get(),
                              ctx.get()) ||
       !BN_mod_mul_montgomery(y.get(), y.get(), z.get(), mont.get(),
-                             ctx.get())) {
+                             ctx.get()) ||
+      !bn_copy_words(out->X, P256_LIMBS, x.get()) ||
+      !bn_copy_words(out->Y, P256_LIMBS, y.get())) {
     return false;
   }
-
-  OPENSSL_memcpy(out->X, x->d, sizeof(BN_ULONG) * x->top);
-  OPENSSL_memcpy(out->Y, y->d, sizeof(BN_ULONG) * y->top);
   return true;
 }
 
diff --git a/src/crypto/fipsmodule/rsa/rsa_impl.c b/src/crypto/fipsmodule/rsa/rsa_impl.c
index b5a4e515..626bbe85 100644
--- a/src/crypto/fipsmodule/rsa/rsa_impl.c
+++ b/src/crypto/fipsmodule/rsa/rsa_impl.c
@@ -646,12 +646,11 @@ err:
 static int mod_montgomery(BIGNUM *r, const BIGNUM *I, const BIGNUM *p,
                           const BN_MONT_CTX *mont_p, const BIGNUM *q,
                           BN_CTX *ctx) {
-  // Reduce in constant time with Montgomery reduction, which requires I <= p *
-  // R. If p and q are the same size, which is true for any RSA keys we or
-  // anyone sane generates, we have q < R and I < p * q, so this holds.
-  //
-  // If q is too big, fall back to |BN_mod|.
-  if (q->top > p->top) {
+  // Reducing in constant-time with Montgomery reduction requires I <= p * R. We
+  // have I < p * q, so this follows if q < R. In particular, this always holds
+  // if p and q are the same size, which is true for any RSA keys we or anyone
+  // sane generates. For other keys, we fall back to |BN_mod|.
+  if (!bn_less_than_montgomery_R(q, mont_p)) {
     return BN_mod(r, I, p, ctx);
   }
 
@@ -838,7 +837,8 @@ int rsa_greater_than_pow2(const BIGNUM *b, int n) {
 // relatively prime to |e|. If |p| is non-NULL, |out| will also not be close to
 // |p|.
 static int generate_prime(BIGNUM *out, int bits, const BIGNUM *e,
-                          const BIGNUM *p, BN_CTX *ctx, BN_GENCB *cb) {
+                          const BIGNUM *p, const BIGNUM *sqrt2, BN_CTX *ctx,
+                          BN_GENCB *cb) {
   if (bits < 128 || (bits % BN_BITS2) != 0) {
     OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR);
     return 0;
@@ -882,30 +882,14 @@ static int generate_prime(BIGNUM *out, int bits, const BIGNUM *e,
       }
     }
 
-    // If out < 2^(bits-1)×√2, try again (steps 4.4 and 5.5).
-    //
-    // We check the most significant words, so we retry if ⌊out/2^k⌋ <= ⌊b/2^k⌋,
-    // where b = 2^(bits-1)×√2 and k = max(0, bits - 1536). For key sizes up to
-    // 3072 (bits = 1536), k = 0, so we are testing that ⌊out⌋ <= ⌊b⌋. out is an
-    // integer and b is not, so this is equivalent to out < b. That is, the
-    // comparison is exact for FIPS key sizes.
+    // If out < 2^(bits-1)×√2, try again (steps 4.4 and 5.5). This is equivalent
+    // to out <= ⌊2^(bits-1)×√2⌋, or out <= sqrt2 for FIPS key sizes.
     //
     // For larger keys, the comparison is approximate, leaning towards
     // retrying. That is, we reject a negligible fraction of primes that are
     // within the FIPS bound, but we will never accept a prime outside the
-    // bound, ensuring the resulting RSA key is the right size. Specifically, if
-    // the FIPS bound holds, we have ⌊out/2^k⌋ < out/2^k < b/2^k. This implies
-    // ⌊out/2^k⌋ <= ⌊b/2^k⌋. That is, the FIPS bound implies our bound and so we
-    // are slightly tighter.
-    size_t out_len = (size_t)out->top;
-    assert(out_len == (size_t)bits / BN_BITS2);
-    size_t to_check = kBoringSSLRSASqrtTwoLen;
-    if (to_check > out_len) {
-      to_check = out_len;
-    }
-    if (!bn_less_than_words(
-            kBoringSSLRSASqrtTwo + kBoringSSLRSASqrtTwoLen - to_check,
-            out->d + out_len - to_check, to_check)) {
+    // bound, ensuring the resulting RSA key is the right size.
+    if (!BN_less_than_consttime(sqrt2, out)) {
       continue;
     }
 
@@ -969,7 +953,9 @@ int RSA_generate_key_ex(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb) {
   BIGNUM *pm1 = BN_CTX_get(ctx);
   BIGNUM *qm1 = BN_CTX_get(ctx);
   BIGNUM *gcd = BN_CTX_get(ctx);
-  if (totient == NULL || pm1 == NULL || qm1 == NULL || gcd == NULL) {
+  BIGNUM *sqrt2 = BN_CTX_get(ctx);
+  if (totient == NULL || pm1 == NULL || qm1 == NULL || gcd == NULL ||
+      sqrt2 == NULL) {
     goto bn_err;
   }
 
@@ -990,12 +976,35 @@ int RSA_generate_key_ex(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb) {
   }
 
   int prime_bits = bits / 2;
+
+  // Compute sqrt2 >= ⌊2^(prime_bits-1)×√2⌋.
+  if (!bn_set_words(sqrt2, kBoringSSLRSASqrtTwo, kBoringSSLRSASqrtTwoLen)) {
+    goto bn_err;
+  }
+  int sqrt2_bits = kBoringSSLRSASqrtTwoLen * BN_BITS2;
+  assert(sqrt2_bits == (int)BN_num_bits(sqrt2));
+  if (sqrt2_bits > prime_bits) {
+    // For key sizes up to 3072 (prime_bits = 1536), this is exactly
+    // ⌊2^(prime_bits-1)×√2⌋.
+    if (!BN_rshift(sqrt2, sqrt2, sqrt2_bits - prime_bits)) {
+      goto bn_err;
+    }
+  } else if (prime_bits > sqrt2_bits) {
+    // For key sizes beyond 3072, this is approximate. We err towards retrying
+    // to ensure our key is the right size and round up.
+    if (!BN_add_word(sqrt2, 1) ||
+        !BN_lshift(sqrt2, sqrt2, prime_bits - sqrt2_bits)) {
+      goto bn_err;
+    }
+  }
+  assert(prime_bits == (int)BN_num_bits(sqrt2));
+
   do {
     // Generate p and q, each of size |prime_bits|, using the steps outlined in
     // appendix FIPS 186-4 appendix B.3.3.
-    if (!generate_prime(rsa->p, prime_bits, rsa->e, NULL, ctx, cb) ||
+    if (!generate_prime(rsa->p, prime_bits, rsa->e, NULL, sqrt2, ctx, cb) ||
         !BN_GENCB_call(cb, 3, 0) ||
-        !generate_prime(rsa->q, prime_bits, rsa->e, rsa->p, ctx, cb) ||
+        !generate_prime(rsa->q, prime_bits, rsa->e, rsa->p, sqrt2, ctx, cb) ||
         !BN_GENCB_call(cb, 3, 1)) {
       goto bn_err;
     }
author	Robert Sloan <varomodt@google.com>	2018-02-05 09:07:34 -0800
committer	Robert Sloan <varomodt@google.com>	2018-02-05 09:07:39 -0800
commit	8542c08a00c332af2ebca2a0c64b8d4d5fbd4cd2 (patch)
tree	65345a0acda3104c65b39662f207fbc9239e9ad5 /src/crypto
parent	309a31e32558286a3b92c754bd3051b962527c25 (diff)
download	boringssl-8542c08a00c332af2ebca2a0c64b8d4d5fbd4cd2.tar.gz