author      Robert Sloan <varomodt@google.com>  2018-12-19 01:56:17 -0800
committer   android-build-merger <android-build-merger@google.com>  2018-12-19 01:56:17 -0800
commit      ef3fb2c5dd0fa43d39305174a337d5fc4817762f (patch)
tree        37e1b45a3f85d38626637fe6dd2bdca7426d4ba5
parent      f669ac9660ae2b7e6aca1a1b8b1542f4557d2197 (diff)
parent      8aa0d177a330be513f42081bc0017a354ca2b2c0 (diff)
download    boringssl-ef3fb2c5dd0fa43d39305174a337d5fc4817762f.tar.gz
external/boringssl: Sync to 41c10e2b5f37edce8b9f292f7f3bacb7e30e25c4. am: 11c28bd346
am: 8aa0d177a3

Change-Id: Iecc178230eda48edf9829380b89d65ba3aee50ac
-rw-r--r--  BORINGSSL_REVISION  2
-rw-r--r--  eureka.mk  2
-rw-r--r--  linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S  976
-rw-r--r--  linux-x86_64/crypto/fipsmodule/sha512-x86_64.S  1103
-rw-r--r--  mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S  964
-rw-r--r--  mac-x86_64/crypto/fipsmodule/sha512-x86_64.S  1103
-rw-r--r--  sources.bp  3
-rw-r--r--  sources.mk  2
-rw-r--r--  src/crypto/CMakeLists.txt  3
-rw-r--r--  src/crypto/bio/bio.c  52
-rw-r--r--  src/crypto/cipher_extra/e_aesgcmsiv.c  16
-rw-r--r--  src/crypto/cpu-intel.c  50
-rw-r--r--  src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl  11
-rw-r--r--  src/crypto/fipsmodule/aes/internal.h  2
-rw-r--r--  src/crypto/fipsmodule/cipher/e_aes.c  2
-rw-r--r--  src/crypto/fipsmodule/ec/p256-x86_64.c  2
-rwxr-xr-x  src/crypto/fipsmodule/sha/asm/sha512-x86_64.pl  410
-rw-r--r--  src/crypto/hrss/asm/poly_rq_mul.S  8457
-rw-r--r--  src/crypto/hrss/hrss.c  2237
-rw-r--r--  src/crypto/hrss/hrss_test.cc  472
-rw-r--r--  src/crypto/hrss/internal.h  50
-rw-r--r--  src/crypto/obj/obj_dat.h  5
-rw-r--r--  src/crypto/obj/obj_mac.num  1
-rw-r--r--  src/crypto/obj/objects.txt  9
-rw-r--r--  src/crypto/thread_win.c  17
-rw-r--r--  src/crypto/x509/x509_test.cc  60
-rw-r--r--  src/include/openssl/bio.h  1
-rw-r--r--  src/include/openssl/bn.h  2
-rw-r--r--  src/include/openssl/hrss.h  102
-rw-r--r--  src/include/openssl/nid.h  3
-rw-r--r--  src/include/openssl/ssl.h  9
-rw-r--r--  src/ssl/handoff.cc  4
-rw-r--r--  src/ssl/handshake_client.cc  12
-rw-r--r--  src/ssl/handshake_server.cc  11
-rw-r--r--  src/ssl/internal.h  14
-rw-r--r--  src/ssl/ssl_asn1.cc  10
-rw-r--r--  src/ssl/ssl_key_share.cc  100
-rw-r--r--  src/ssl/ssl_lib.cc  21
-rw-r--r--  src/ssl/ssl_test.cc  64
-rw-r--r--  src/ssl/ssl_x509.cc  74
-rw-r--r--  src/ssl/t1_lib.cc  80
-rw-r--r--  src/ssl/test/bssl_shim.cc  18
-rw-r--r--  src/ssl/test/runner/cipher_suites.go  2
-rw-r--r--  src/ssl/test/runner/common.go  15
-rw-r--r--  src/ssl/test/runner/handshake_client.go  3
-rw-r--r--  src/ssl/test/runner/handshake_messages.go  37
-rw-r--r--  src/ssl/test/runner/handshake_server.go  27
-rw-r--r--  src/ssl/test/runner/hrss/hrss.go  1212
-rw-r--r--  src/ssl/test/runner/key_agreement.go  102
-rw-r--r--  src/ssl/test/runner/runner.go  314
-rw-r--r--  src/ssl/test/test_config.cc  53
-rw-r--r--  src/ssl/test/test_config.h  4
-rw-r--r--  src/ssl/tls13_client.cc  11
-rw-r--r--  src/ssl/tls13_server.cc  29
-rw-r--r--  src/tool/speed.cc  59
-rw-r--r--  src/util/generate_build_files.py  3
-rw-r--r--  win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm  1034
-rw-r--r--  win-x86_64/crypto/fipsmodule/sha512-x86_64.asm  1133
58 files changed, 13604 insertions, 6970 deletions
diff --git a/BORINGSSL_REVISION b/BORINGSSL_REVISION
index d6108a3e..f3ca0a33 100644
--- a/BORINGSSL_REVISION
+++ b/BORINGSSL_REVISION
@@ -1 +1 @@
-0f5ecd3a854546d943104e1f7421e489b7f4d5aa
+41c10e2b5f37edce8b9f292f7f3bacb7e30e25c4
diff --git a/eureka.mk b/eureka.mk
index 41cd787f..5cb01dc4 100644
--- a/eureka.mk
+++ b/eureka.mk
@@ -118,6 +118,7 @@ crypto_sources := \
src/crypto/fipsmodule/bcm.c\
src/crypto/fipsmodule/is_fips.c\
src/crypto/hkdf/hkdf.c\
+ src/crypto/hrss/hrss.c\
src/crypto/lhash/lhash.c\
src/crypto/mem.c\
src/crypto/obj/obj.c\
@@ -354,4 +355,5 @@ linux_x86_64_sources := \
linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S\
linux-x86_64/crypto/fipsmodule/x86_64-mont.S\
linux-x86_64/crypto/fipsmodule/x86_64-mont5.S\
+ src/crypto/hrss/asm/poly_rq_mul.S\
diff --git a/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S
index 7dd3161b..36c01ef9 100644
--- a/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S
+++ b/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S
@@ -1576,982 +1576,6 @@ bsaes_ctr32_encrypt_blocks:
.byte 0xf3,0xc3
.cfi_endproc
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
-.globl bsaes_xts_encrypt
-.hidden bsaes_xts_encrypt
-.type bsaes_xts_encrypt,@function
-.align 16
-bsaes_xts_encrypt:
-.cfi_startproc
- movq %rsp,%rax
-.Lxts_enc_prologue:
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-16
- pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-24
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
- leaq -72(%rsp),%rsp
-.cfi_adjust_cfa_offset 0x48
- movq %rsp,%rbp
-.cfi_def_cfa_register %rbp
- movq %rdi,%r12
- movq %rsi,%r13
- movq %rdx,%r14
- movq %rcx,%r15
-
- leaq (%r9),%rdi
- leaq 32(%rbp),%rsi
- leaq (%r8),%rdx
- call aes_nohw_encrypt
-
- movl 240(%r15),%eax
- movq %r14,%rbx
-
- movl %eax,%edx
- shlq $7,%rax
- subq $96,%rax
- subq %rax,%rsp
-
- movq %rsp,%rax
- movq %r15,%rcx
- movl %edx,%r10d
- call _bsaes_key_convert
- pxor %xmm6,%xmm7
- movdqa %xmm7,(%rax)
-
- andq $-16,%r14
- subq $0x80,%rsp
- movdqa 32(%rbp),%xmm6
-
- pxor %xmm14,%xmm14
- movdqa .Lxts_magic(%rip),%xmm12
- pcmpgtd %xmm6,%xmm14
-
- subq $0x80,%r14
- jc .Lxts_enc_short
- jmp .Lxts_enc_loop
-
-.align 16
-.Lxts_enc_loop:
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm15
- movdqa %xmm6,0(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm0
- movdqa %xmm6,16(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 0(%r12),%xmm7
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm1
- movdqa %xmm6,32(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 16(%r12),%xmm8
- pxor %xmm7,%xmm15
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm2
- movdqa %xmm6,48(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 32(%r12),%xmm9
- pxor %xmm8,%xmm0
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm3
- movdqa %xmm6,64(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 48(%r12),%xmm10
- pxor %xmm9,%xmm1
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm4
- movdqa %xmm6,80(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 64(%r12),%xmm11
- pxor %xmm10,%xmm2
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm5
- movdqa %xmm6,96(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 80(%r12),%xmm12
- pxor %xmm11,%xmm3
- movdqu 96(%r12),%xmm13
- pxor %xmm12,%xmm4
- movdqu 112(%r12),%xmm14
- leaq 128(%r12),%r12
- movdqa %xmm6,112(%rsp)
- pxor %xmm13,%xmm5
- leaq 128(%rsp),%rax
- pxor %xmm14,%xmm6
- movl %edx,%r10d
-
- call _bsaes_encrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm3
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm5
- movdqu %xmm3,32(%r13)
- pxor 64(%rsp),%xmm2
- movdqu %xmm5,48(%r13)
- pxor 80(%rsp),%xmm6
- movdqu %xmm2,64(%r13)
- pxor 96(%rsp),%xmm1
- movdqu %xmm6,80(%r13)
- pxor 112(%rsp),%xmm4
- movdqu %xmm1,96(%r13)
- movdqu %xmm4,112(%r13)
- leaq 128(%r13),%r13
-
- movdqa 112(%rsp),%xmm6
- pxor %xmm14,%xmm14
- movdqa .Lxts_magic(%rip),%xmm12
- pcmpgtd %xmm6,%xmm14
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
-
- subq $0x80,%r14
- jnc .Lxts_enc_loop
-
-.Lxts_enc_short:
- addq $0x80,%r14
- jz .Lxts_enc_done
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm15
- movdqa %xmm6,0(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm0
- movdqa %xmm6,16(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 0(%r12),%xmm7
- cmpq $16,%r14
- je .Lxts_enc_1
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm1
- movdqa %xmm6,32(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 16(%r12),%xmm8
- cmpq $32,%r14
- je .Lxts_enc_2
- pxor %xmm7,%xmm15
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm2
- movdqa %xmm6,48(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 32(%r12),%xmm9
- cmpq $48,%r14
- je .Lxts_enc_3
- pxor %xmm8,%xmm0
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm3
- movdqa %xmm6,64(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 48(%r12),%xmm10
- cmpq $64,%r14
- je .Lxts_enc_4
- pxor %xmm9,%xmm1
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm4
- movdqa %xmm6,80(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 64(%r12),%xmm11
- cmpq $80,%r14
- je .Lxts_enc_5
- pxor %xmm10,%xmm2
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm5
- movdqa %xmm6,96(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 80(%r12),%xmm12
- cmpq $96,%r14
- je .Lxts_enc_6
- pxor %xmm11,%xmm3
- movdqu 96(%r12),%xmm13
- pxor %xmm12,%xmm4
- movdqa %xmm6,112(%rsp)
- leaq 112(%r12),%r12
- pxor %xmm13,%xmm5
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_encrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm3
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm5
- movdqu %xmm3,32(%r13)
- pxor 64(%rsp),%xmm2
- movdqu %xmm5,48(%r13)
- pxor 80(%rsp),%xmm6
- movdqu %xmm2,64(%r13)
- pxor 96(%rsp),%xmm1
- movdqu %xmm6,80(%r13)
- movdqu %xmm1,96(%r13)
- leaq 112(%r13),%r13
-
- movdqa 112(%rsp),%xmm6
- jmp .Lxts_enc_done
-.align 16
-.Lxts_enc_6:
- pxor %xmm11,%xmm3
- leaq 96(%r12),%r12
- pxor %xmm12,%xmm4
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_encrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm3
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm5
- movdqu %xmm3,32(%r13)
- pxor 64(%rsp),%xmm2
- movdqu %xmm5,48(%r13)
- pxor 80(%rsp),%xmm6
- movdqu %xmm2,64(%r13)
- movdqu %xmm6,80(%r13)
- leaq 96(%r13),%r13
-
- movdqa 96(%rsp),%xmm6
- jmp .Lxts_enc_done
-.align 16
-.Lxts_enc_5:
- pxor %xmm10,%xmm2
- leaq 80(%r12),%r12
- pxor %xmm11,%xmm3
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_encrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm3
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm5
- movdqu %xmm3,32(%r13)
- pxor 64(%rsp),%xmm2
- movdqu %xmm5,48(%r13)
- movdqu %xmm2,64(%r13)
- leaq 80(%r13),%r13
-
- movdqa 80(%rsp),%xmm6
- jmp .Lxts_enc_done
-.align 16
-.Lxts_enc_4:
- pxor %xmm9,%xmm1
- leaq 64(%r12),%r12
- pxor %xmm10,%xmm2
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_encrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm3
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm5
- movdqu %xmm3,32(%r13)
- movdqu %xmm5,48(%r13)
- leaq 64(%r13),%r13
-
- movdqa 64(%rsp),%xmm6
- jmp .Lxts_enc_done
-.align 16
-.Lxts_enc_3:
- pxor %xmm8,%xmm0
- leaq 48(%r12),%r12
- pxor %xmm9,%xmm1
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_encrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm3
- movdqu %xmm0,16(%r13)
- movdqu %xmm3,32(%r13)
- leaq 48(%r13),%r13
-
- movdqa 48(%rsp),%xmm6
- jmp .Lxts_enc_done
-.align 16
-.Lxts_enc_2:
- pxor %xmm7,%xmm15
- leaq 32(%r12),%r12
- pxor %xmm8,%xmm0
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_encrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- movdqu %xmm0,16(%r13)
- leaq 32(%r13),%r13
-
- movdqa 32(%rsp),%xmm6
- jmp .Lxts_enc_done
-.align 16
-.Lxts_enc_1:
- pxor %xmm15,%xmm7
- leaq 16(%r12),%r12
- movdqa %xmm7,32(%rbp)
- leaq 32(%rbp),%rdi
- leaq 32(%rbp),%rsi
- leaq (%r15),%rdx
- call aes_nohw_encrypt
- pxor 32(%rbp),%xmm15
-
-
-
-
-
- movdqu %xmm15,0(%r13)
- leaq 16(%r13),%r13
-
- movdqa 16(%rsp),%xmm6
-
-.Lxts_enc_done:
- andl $15,%ebx
- jz .Lxts_enc_ret
- movq %r13,%rdx
-
-.Lxts_enc_steal:
- movzbl (%r12),%eax
- movzbl -16(%rdx),%ecx
- leaq 1(%r12),%r12
- movb %al,-16(%rdx)
- movb %cl,0(%rdx)
- leaq 1(%rdx),%rdx
- subl $1,%ebx
- jnz .Lxts_enc_steal
-
- movdqu -16(%r13),%xmm15
- leaq 32(%rbp),%rdi
- pxor %xmm6,%xmm15
- leaq 32(%rbp),%rsi
- movdqa %xmm15,32(%rbp)
- leaq (%r15),%rdx
- call aes_nohw_encrypt
- pxor 32(%rbp),%xmm6
- movdqu %xmm6,-16(%r13)
-
-.Lxts_enc_ret:
- leaq (%rsp),%rax
- pxor %xmm0,%xmm0
-.Lxts_enc_bzero:
- movdqa %xmm0,0(%rax)
- movdqa %xmm0,16(%rax)
- leaq 32(%rax),%rax
- cmpq %rax,%rbp
- ja .Lxts_enc_bzero
-
- leaq 120(%rbp),%rax
-.cfi_def_cfa %rax,8
- movq -48(%rax),%r15
-.cfi_restore %r15
- movq -40(%rax),%r14
-.cfi_restore %r14
- movq -32(%rax),%r13
-.cfi_restore %r13
- movq -24(%rax),%r12
-.cfi_restore %r12
- movq -16(%rax),%rbx
-.cfi_restore %rbx
- movq -8(%rax),%rbp
-.cfi_restore %rbp
- leaq (%rax),%rsp
-.cfi_def_cfa_register %rsp
-.Lxts_enc_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
-
-.globl bsaes_xts_decrypt
-.hidden bsaes_xts_decrypt
-.type bsaes_xts_decrypt,@function
-.align 16
-bsaes_xts_decrypt:
-.cfi_startproc
- movq %rsp,%rax
-.Lxts_dec_prologue:
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-16
- pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-24
- pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
- leaq -72(%rsp),%rsp
-.cfi_adjust_cfa_offset 0x48
- movq %rsp,%rbp
- movq %rdi,%r12
- movq %rsi,%r13
- movq %rdx,%r14
- movq %rcx,%r15
-
- leaq (%r9),%rdi
- leaq 32(%rbp),%rsi
- leaq (%r8),%rdx
- call aes_nohw_encrypt
-
- movl 240(%r15),%eax
- movq %r14,%rbx
-
- movl %eax,%edx
- shlq $7,%rax
- subq $96,%rax
- subq %rax,%rsp
-
- movq %rsp,%rax
- movq %r15,%rcx
- movl %edx,%r10d
- call _bsaes_key_convert
- pxor (%rsp),%xmm7
- movdqa %xmm6,(%rax)
- movdqa %xmm7,(%rsp)
-
- xorl %eax,%eax
- andq $-16,%r14
- testl $15,%ebx
- setnz %al
- shlq $4,%rax
- subq %rax,%r14
-
- subq $0x80,%rsp
- movdqa 32(%rbp),%xmm6
-
- pxor %xmm14,%xmm14
- movdqa .Lxts_magic(%rip),%xmm12
- pcmpgtd %xmm6,%xmm14
-
- subq $0x80,%r14
- jc .Lxts_dec_short
- jmp .Lxts_dec_loop
-
-.align 16
-.Lxts_dec_loop:
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm15
- movdqa %xmm6,0(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm0
- movdqa %xmm6,16(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 0(%r12),%xmm7
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm1
- movdqa %xmm6,32(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 16(%r12),%xmm8
- pxor %xmm7,%xmm15
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm2
- movdqa %xmm6,48(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 32(%r12),%xmm9
- pxor %xmm8,%xmm0
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm3
- movdqa %xmm6,64(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 48(%r12),%xmm10
- pxor %xmm9,%xmm1
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm4
- movdqa %xmm6,80(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 64(%r12),%xmm11
- pxor %xmm10,%xmm2
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm5
- movdqa %xmm6,96(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 80(%r12),%xmm12
- pxor %xmm11,%xmm3
- movdqu 96(%r12),%xmm13
- pxor %xmm12,%xmm4
- movdqu 112(%r12),%xmm14
- leaq 128(%r12),%r12
- movdqa %xmm6,112(%rsp)
- pxor %xmm13,%xmm5
- leaq 128(%rsp),%rax
- pxor %xmm14,%xmm6
- movl %edx,%r10d
-
- call _bsaes_decrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm5
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm3
- movdqu %xmm5,32(%r13)
- pxor 64(%rsp),%xmm1
- movdqu %xmm3,48(%r13)
- pxor 80(%rsp),%xmm6
- movdqu %xmm1,64(%r13)
- pxor 96(%rsp),%xmm2
- movdqu %xmm6,80(%r13)
- pxor 112(%rsp),%xmm4
- movdqu %xmm2,96(%r13)
- movdqu %xmm4,112(%r13)
- leaq 128(%r13),%r13
-
- movdqa 112(%rsp),%xmm6
- pxor %xmm14,%xmm14
- movdqa .Lxts_magic(%rip),%xmm12
- pcmpgtd %xmm6,%xmm14
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
-
- subq $0x80,%r14
- jnc .Lxts_dec_loop
-
-.Lxts_dec_short:
- addq $0x80,%r14
- jz .Lxts_dec_done
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm15
- movdqa %xmm6,0(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm0
- movdqa %xmm6,16(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 0(%r12),%xmm7
- cmpq $16,%r14
- je .Lxts_dec_1
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm1
- movdqa %xmm6,32(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 16(%r12),%xmm8
- cmpq $32,%r14
- je .Lxts_dec_2
- pxor %xmm7,%xmm15
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm2
- movdqa %xmm6,48(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 32(%r12),%xmm9
- cmpq $48,%r14
- je .Lxts_dec_3
- pxor %xmm8,%xmm0
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm3
- movdqa %xmm6,64(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 48(%r12),%xmm10
- cmpq $64,%r14
- je .Lxts_dec_4
- pxor %xmm9,%xmm1
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm4
- movdqa %xmm6,80(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 64(%r12),%xmm11
- cmpq $80,%r14
- je .Lxts_dec_5
- pxor %xmm10,%xmm2
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm5
- movdqa %xmm6,96(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 80(%r12),%xmm12
- cmpq $96,%r14
- je .Lxts_dec_6
- pxor %xmm11,%xmm3
- movdqu 96(%r12),%xmm13
- pxor %xmm12,%xmm4
- movdqa %xmm6,112(%rsp)
- leaq 112(%r12),%r12
- pxor %xmm13,%xmm5
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_decrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm5
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm3
- movdqu %xmm5,32(%r13)
- pxor 64(%rsp),%xmm1
- movdqu %xmm3,48(%r13)
- pxor 80(%rsp),%xmm6
- movdqu %xmm1,64(%r13)
- pxor 96(%rsp),%xmm2
- movdqu %xmm6,80(%r13)
- movdqu %xmm2,96(%r13)
- leaq 112(%r13),%r13
-
- movdqa 112(%rsp),%xmm6
- jmp .Lxts_dec_done
-.align 16
-.Lxts_dec_6:
- pxor %xmm11,%xmm3
- leaq 96(%r12),%r12
- pxor %xmm12,%xmm4
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_decrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm5
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm3
- movdqu %xmm5,32(%r13)
- pxor 64(%rsp),%xmm1
- movdqu %xmm3,48(%r13)
- pxor 80(%rsp),%xmm6
- movdqu %xmm1,64(%r13)
- movdqu %xmm6,80(%r13)
- leaq 96(%r13),%r13
-
- movdqa 96(%rsp),%xmm6
- jmp .Lxts_dec_done
-.align 16
-.Lxts_dec_5:
- pxor %xmm10,%xmm2
- leaq 80(%r12),%r12
- pxor %xmm11,%xmm3
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_decrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm5
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm3
- movdqu %xmm5,32(%r13)
- pxor 64(%rsp),%xmm1
- movdqu %xmm3,48(%r13)
- movdqu %xmm1,64(%r13)
- leaq 80(%r13),%r13
-
- movdqa 80(%rsp),%xmm6
- jmp .Lxts_dec_done
-.align 16
-.Lxts_dec_4:
- pxor %xmm9,%xmm1
- leaq 64(%r12),%r12
- pxor %xmm10,%xmm2
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_decrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm5
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm3
- movdqu %xmm5,32(%r13)
- movdqu %xmm3,48(%r13)
- leaq 64(%r13),%r13
-
- movdqa 64(%rsp),%xmm6
- jmp .Lxts_dec_done
-.align 16
-.Lxts_dec_3:
- pxor %xmm8,%xmm0
- leaq 48(%r12),%r12
- pxor %xmm9,%xmm1
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_decrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm5
- movdqu %xmm0,16(%r13)
- movdqu %xmm5,32(%r13)
- leaq 48(%r13),%r13
-
- movdqa 48(%rsp),%xmm6
- jmp .Lxts_dec_done
-.align 16
-.Lxts_dec_2:
- pxor %xmm7,%xmm15
- leaq 32(%r12),%r12
- pxor %xmm8,%xmm0
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_decrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- movdqu %xmm0,16(%r13)
- leaq 32(%r13),%r13
-
- movdqa 32(%rsp),%xmm6
- jmp .Lxts_dec_done
-.align 16
-.Lxts_dec_1:
- pxor %xmm15,%xmm7
- leaq 16(%r12),%r12
- movdqa %xmm7,32(%rbp)
- leaq 32(%rbp),%rdi
- leaq 32(%rbp),%rsi
- leaq (%r15),%rdx
- call aes_nohw_decrypt
- pxor 32(%rbp),%xmm15
-
-
-
-
-
- movdqu %xmm15,0(%r13)
- leaq 16(%r13),%r13
-
- movdqa 16(%rsp),%xmm6
-
-.Lxts_dec_done:
- andl $15,%ebx
- jz .Lxts_dec_ret
-
- pxor %xmm14,%xmm14
- movdqa .Lxts_magic(%rip),%xmm12
- pcmpgtd %xmm6,%xmm14
- pshufd $0x13,%xmm14,%xmm13
- movdqa %xmm6,%xmm5
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- movdqu (%r12),%xmm15
- pxor %xmm13,%xmm6
-
- leaq 32(%rbp),%rdi
- pxor %xmm6,%xmm15
- leaq 32(%rbp),%rsi
- movdqa %xmm15,32(%rbp)
- leaq (%r15),%rdx
- call aes_nohw_decrypt
- pxor 32(%rbp),%xmm6
- movq %r13,%rdx
- movdqu %xmm6,(%r13)
-
-.Lxts_dec_steal:
- movzbl 16(%r12),%eax
- movzbl (%rdx),%ecx
- leaq 1(%r12),%r12
- movb %al,(%rdx)
- movb %cl,16(%rdx)
- leaq 1(%rdx),%rdx
- subl $1,%ebx
- jnz .Lxts_dec_steal
-
- movdqu (%r13),%xmm15
- leaq 32(%rbp),%rdi
- pxor %xmm5,%xmm15
- leaq 32(%rbp),%rsi
- movdqa %xmm15,32(%rbp)
- leaq (%r15),%rdx
- call aes_nohw_decrypt
- pxor 32(%rbp),%xmm5
- movdqu %xmm5,(%r13)
-
-.Lxts_dec_ret:
- leaq (%rsp),%rax
- pxor %xmm0,%xmm0
-.Lxts_dec_bzero:
- movdqa %xmm0,0(%rax)
- movdqa %xmm0,16(%rax)
- leaq 32(%rax),%rax
- cmpq %rax,%rbp
- ja .Lxts_dec_bzero
-
- leaq 120(%rbp),%rax
-.cfi_def_cfa %rax,8
- movq -48(%rax),%r15
-.cfi_restore %r15
- movq -40(%rax),%r14
-.cfi_restore %r14
- movq -32(%rax),%r13
-.cfi_restore %r13
- movq -24(%rax),%r12
-.cfi_restore %r12
- movq -16(%rax),%rbx
-.cfi_restore %rbx
- movq -8(%rax),%rbp
-.cfi_restore %rbp
- leaq (%rax),%rsp
-.cfi_def_cfa_register %rsp
-.Lxts_dec_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
.type _bsaes_const,@object
.align 64
_bsaes_const:
diff --git a/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S b/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S
index 3c47199f..509e144e 100644
--- a/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S
+++ b/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S
@@ -25,8 +25,6 @@ sha512_block_data_order:
movl 0(%r11),%r9d
movl 4(%r11),%r10d
movl 8(%r11),%r11d
- testl $2048,%r10d
- jnz .Lxop_shortcut
andl $1073741824,%r9d
andl $268435968,%r10d
orl %r9d,%r10d
@@ -1825,1107 +1823,6 @@ K512:
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.type sha512_block_data_order_xop,@function
-.align 64
-sha512_block_data_order_xop:
-.cfi_startproc
-.Lxop_shortcut:
- movq %rsp,%rax
-.cfi_def_cfa_register %rax
- pushq %rbx
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_offset %rbp,-24
- pushq %r12
-.cfi_offset %r12,-32
- pushq %r13
-.cfi_offset %r13,-40
- pushq %r14
-.cfi_offset %r14,-48
- pushq %r15
-.cfi_offset %r15,-56
- shlq $4,%rdx
- subq $160,%rsp
- leaq (%rsi,%rdx,8),%rdx
- andq $-64,%rsp
- movq %rdi,128+0(%rsp)
- movq %rsi,128+8(%rsp)
- movq %rdx,128+16(%rsp)
- movq %rax,152(%rsp)
-.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
-.Lprologue_xop:
-
- vzeroupper
- movq 0(%rdi),%rax
- movq 8(%rdi),%rbx
- movq 16(%rdi),%rcx
- movq 24(%rdi),%rdx
- movq 32(%rdi),%r8
- movq 40(%rdi),%r9
- movq 48(%rdi),%r10
- movq 56(%rdi),%r11
- jmp .Lloop_xop
-.align 16
-.Lloop_xop:
- vmovdqa K512+1280(%rip),%xmm11
- vmovdqu 0(%rsi),%xmm0
- leaq K512+128(%rip),%rbp
- vmovdqu 16(%rsi),%xmm1
- vmovdqu 32(%rsi),%xmm2
- vpshufb %xmm11,%xmm0,%xmm0
- vmovdqu 48(%rsi),%xmm3
- vpshufb %xmm11,%xmm1,%xmm1
- vmovdqu 64(%rsi),%xmm4
- vpshufb %xmm11,%xmm2,%xmm2
- vmovdqu 80(%rsi),%xmm5
- vpshufb %xmm11,%xmm3,%xmm3
- vmovdqu 96(%rsi),%xmm6
- vpshufb %xmm11,%xmm4,%xmm4
- vmovdqu 112(%rsi),%xmm7
- vpshufb %xmm11,%xmm5,%xmm5
- vpaddq -128(%rbp),%xmm0,%xmm8
- vpshufb %xmm11,%xmm6,%xmm6
- vpaddq -96(%rbp),%xmm1,%xmm9
- vpshufb %xmm11,%xmm7,%xmm7
- vpaddq -64(%rbp),%xmm2,%xmm10
- vpaddq -32(%rbp),%xmm3,%xmm11
- vmovdqa %xmm8,0(%rsp)
- vpaddq 0(%rbp),%xmm4,%xmm8
- vmovdqa %xmm9,16(%rsp)
- vpaddq 32(%rbp),%xmm5,%xmm9
- vmovdqa %xmm10,32(%rsp)
- vpaddq 64(%rbp),%xmm6,%xmm10
- vmovdqa %xmm11,48(%rsp)
- vpaddq 96(%rbp),%xmm7,%xmm11
- vmovdqa %xmm8,64(%rsp)
- movq %rax,%r14
- vmovdqa %xmm9,80(%rsp)
- movq %rbx,%rdi
- vmovdqa %xmm10,96(%rsp)
- xorq %rcx,%rdi
- vmovdqa %xmm11,112(%rsp)
- movq %r8,%r13
- jmp .Lxop_00_47
-
-.align 16
-.Lxop_00_47:
- addq $256,%rbp
- vpalignr $8,%xmm0,%xmm1,%xmm8
- rorq $23,%r13
- movq %r14,%rax
- vpalignr $8,%xmm4,%xmm5,%xmm11
- movq %r9,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r8,%r13
- xorq %r10,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rax,%r14
- vpaddq %xmm11,%xmm0,%xmm0
- andq %r8,%r12
- xorq %r8,%r13
- addq 0(%rsp),%r11
- movq %rax,%r15
-.byte 143,72,120,195,209,7
- xorq %r10,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,223,3
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm7,%xmm10
- addq %r11,%rdx
- addq %rdi,%r11
- vpaddq %xmm8,%xmm0,%xmm0
- movq %rdx,%r13
- addq %r11,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r11
- vpxor %xmm10,%xmm11,%xmm11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- vpaddq %xmm11,%xmm0,%xmm0
- addq 8(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- vpaddq -128(%rbp),%xmm0,%xmm10
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,0(%rsp)
- vpalignr $8,%xmm1,%xmm2,%xmm8
- rorq $23,%r13
- movq %r14,%r10
- vpalignr $8,%xmm5,%xmm6,%xmm11
- movq %rdx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rcx,%r13
- xorq %r8,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r10,%r14
- vpaddq %xmm11,%xmm1,%xmm1
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 16(%rsp),%r9
- movq %r10,%r15
-.byte 143,72,120,195,209,7
- xorq %r8,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,216,3
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm0,%xmm10
- addq %r9,%rbx
- addq %rdi,%r9
- vpaddq %xmm8,%xmm1,%xmm1
- movq %rbx,%r13
- addq %r9,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r9
- vpxor %xmm10,%xmm11,%xmm11
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- vpaddq %xmm11,%xmm1,%xmm1
- addq 24(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- vpaddq -96(%rbp),%xmm1,%xmm10
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,16(%rsp)
- vpalignr $8,%xmm2,%xmm3,%xmm8
- rorq $23,%r13
- movq %r14,%r8
- vpalignr $8,%xmm6,%xmm7,%xmm11
- movq %rbx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rax,%r13
- xorq %rcx,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r8,%r14
- vpaddq %xmm11,%xmm2,%xmm2
- andq %rax,%r12
- xorq %rax,%r13
- addq 32(%rsp),%rdx
- movq %r8,%r15
-.byte 143,72,120,195,209,7
- xorq %rcx,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,217,3
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm1,%xmm10
- addq %rdx,%r11
- addq %rdi,%rdx
- vpaddq %xmm8,%xmm2,%xmm2
- movq %r11,%r13
- addq %rdx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rdx
- vpxor %xmm10,%xmm11,%xmm11
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- vpaddq %xmm11,%xmm2,%xmm2
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- vpaddq -64(%rbp),%xmm2,%xmm10
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,32(%rsp)
- vpalignr $8,%xmm3,%xmm4,%xmm8
- rorq $23,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm7,%xmm0,%xmm11
- movq %r11,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r10,%r13
- xorq %rax,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rcx,%r14
- vpaddq %xmm11,%xmm3,%xmm3
- andq %r10,%r12
- xorq %r10,%r13
- addq 48(%rsp),%rbx
- movq %rcx,%r15
-.byte 143,72,120,195,209,7
- xorq %rax,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,218,3
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm2,%xmm10
- addq %rbx,%r9
- addq %rdi,%rbx
- vpaddq %xmm8,%xmm3,%xmm3
- movq %r9,%r13
- addq %rbx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rbx
- vpxor %xmm10,%xmm11,%xmm11
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- vpaddq %xmm11,%xmm3,%xmm3
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- vpaddq -32(%rbp),%xmm3,%xmm10
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,48(%rsp)
- vpalignr $8,%xmm4,%xmm5,%xmm8
- rorq $23,%r13
- movq %r14,%rax
- vpalignr $8,%xmm0,%xmm1,%xmm11
- movq %r9,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r8,%r13
- xorq %r10,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rax,%r14
- vpaddq %xmm11,%xmm4,%xmm4
- andq %r8,%r12
- xorq %r8,%r13
- addq 64(%rsp),%r11
- movq %rax,%r15
-.byte 143,72,120,195,209,7
- xorq %r10,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,219,3
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm3,%xmm10
- addq %r11,%rdx
- addq %rdi,%r11
- vpaddq %xmm8,%xmm4,%xmm4
- movq %rdx,%r13
- addq %r11,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r11
- vpxor %xmm10,%xmm11,%xmm11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- vpaddq %xmm11,%xmm4,%xmm4
- addq 72(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- vpaddq 0(%rbp),%xmm4,%xmm10
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,64(%rsp)
- vpalignr $8,%xmm5,%xmm6,%xmm8
- rorq $23,%r13
- movq %r14,%r10
- vpalignr $8,%xmm1,%xmm2,%xmm11
- movq %rdx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rcx,%r13
- xorq %r8,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r10,%r14
- vpaddq %xmm11,%xmm5,%xmm5
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 80(%rsp),%r9
- movq %r10,%r15
-.byte 143,72,120,195,209,7
- xorq %r8,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,220,3
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm4,%xmm10
- addq %r9,%rbx
- addq %rdi,%r9
- vpaddq %xmm8,%xmm5,%xmm5
- movq %rbx,%r13
- addq %r9,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r9
- vpxor %xmm10,%xmm11,%xmm11
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- vpaddq %xmm11,%xmm5,%xmm5
- addq 88(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- vpaddq 32(%rbp),%xmm5,%xmm10
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,80(%rsp)
- vpalignr $8,%xmm6,%xmm7,%xmm8
- rorq $23,%r13
- movq %r14,%r8
- vpalignr $8,%xmm2,%xmm3,%xmm11
- movq %rbx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rax,%r13
- xorq %rcx,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r8,%r14
- vpaddq %xmm11,%xmm6,%xmm6
- andq %rax,%r12
- xorq %rax,%r13
- addq 96(%rsp),%rdx
- movq %r8,%r15
-.byte 143,72,120,195,209,7
- xorq %rcx,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,221,3
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm5,%xmm10
- addq %rdx,%r11
- addq %rdi,%rdx
- vpaddq %xmm8,%xmm6,%xmm6
- movq %r11,%r13
- addq %rdx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rdx
- vpxor %xmm10,%xmm11,%xmm11
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- vpaddq %xmm11,%xmm6,%xmm6
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- vpaddq 64(%rbp),%xmm6,%xmm10
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,96(%rsp)
- vpalignr $8,%xmm7,%xmm0,%xmm8
- rorq $23,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm3,%xmm4,%xmm11
- movq %r11,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r10,%r13
- xorq %rax,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rcx,%r14
- vpaddq %xmm11,%xmm7,%xmm7
- andq %r10,%r12
- xorq %r10,%r13
- addq 112(%rsp),%rbx
- movq %rcx,%r15
-.byte 143,72,120,195,209,7
- xorq %rax,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,222,3
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm6,%xmm10
- addq %rbx,%r9
- addq %rdi,%rbx
- vpaddq %xmm8,%xmm7,%xmm7
- movq %r9,%r13
- addq %rbx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rbx
- vpxor %xmm10,%xmm11,%xmm11
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- vpaddq %xmm11,%xmm7,%xmm7
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- vpaddq 96(%rbp),%xmm7,%xmm10
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,112(%rsp)
- cmpb $0,135(%rbp)
- jne .Lxop_00_47
- rorq $23,%r13
- movq %r14,%rax
- movq %r9,%r12
- rorq $5,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- rorq $4,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 0(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- rorq $6,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- rorq $28,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- rorq $23,%r13
- movq %r14,%r11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 8(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- rorq $23,%r13
- movq %r14,%r10
- movq %rdx,%r12
- rorq $5,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- rorq $4,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 16(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- rorq $6,%r14
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- rorq $28,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- rorq $23,%r13
- movq %r14,%r9
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 24(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- rorq $23,%r13
- movq %r14,%r8
- movq %rbx,%r12
- rorq $5,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- rorq $4,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 32(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- rorq $6,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- rorq $28,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- rorq $23,%r13
- movq %r14,%rdx
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- rorq $23,%r13
- movq %r14,%rcx
- movq %r11,%r12
- rorq $5,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- rorq $4,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 48(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- rorq $6,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- rorq $28,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- rorq $23,%r13
- movq %r14,%rbx
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- rorq $23,%r13
- movq %r14,%rax
- movq %r9,%r12
- rorq $5,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- rorq $4,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 64(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- rorq $6,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- rorq $28,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- rorq $23,%r13
- movq %r14,%r11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 72(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- rorq $23,%r13
- movq %r14,%r10
- movq %rdx,%r12
- rorq $5,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- rorq $4,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 80(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- rorq $6,%r14
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- rorq $28,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- rorq $23,%r13
- movq %r14,%r9
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 88(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- rorq $23,%r13
- movq %r14,%r8
- movq %rbx,%r12
- rorq $5,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- rorq $4,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 96(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- rorq $6,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- rorq $28,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- rorq $23,%r13
- movq %r14,%rdx
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- rorq $23,%r13
- movq %r14,%rcx
- movq %r11,%r12
- rorq $5,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- rorq $4,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 112(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- rorq $6,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- rorq $28,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- rorq $23,%r13
- movq %r14,%rbx
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- movq 128+0(%rsp),%rdi
- movq %r14,%rax
-
- addq 0(%rdi),%rax
- leaq 128(%rsi),%rsi
- addq 8(%rdi),%rbx
- addq 16(%rdi),%rcx
- addq 24(%rdi),%rdx
- addq 32(%rdi),%r8
- addq 40(%rdi),%r9
- addq 48(%rdi),%r10
- addq 56(%rdi),%r11
-
- cmpq 128+16(%rsp),%rsi
-
- movq %rax,0(%rdi)
- movq %rbx,8(%rdi)
- movq %rcx,16(%rdi)
- movq %rdx,24(%rdi)
- movq %r8,32(%rdi)
- movq %r9,40(%rdi)
- movq %r10,48(%rdi)
- movq %r11,56(%rdi)
- jb .Lloop_xop
-
- movq 152(%rsp),%rsi
-.cfi_def_cfa %rsi,8
- vzeroupper
- movq -48(%rsi),%r15
-.cfi_restore %r15
- movq -40(%rsi),%r14
-.cfi_restore %r14
- movq -32(%rsi),%r13
-.cfi_restore %r13
- movq -24(%rsi),%r12
-.cfi_restore %r12
- movq -16(%rsi),%rbp
-.cfi_restore %rbp
- movq -8(%rsi),%rbx
-.cfi_restore %rbx
- leaq (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_xop:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size sha512_block_data_order_xop,.-sha512_block_data_order_xop
.type sha512_block_data_order_avx,@function
.align 64
sha512_block_data_order_avx:
diff --git a/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S
index d0668ca2..0149e0e5 100644
--- a/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S
+++ b/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S
@@ -1561,970 +1561,6 @@ L$ctr_enc_epilogue:
.byte 0xf3,0xc3
-.globl _bsaes_xts_encrypt
-.private_extern _bsaes_xts_encrypt
-
-.p2align 4
-_bsaes_xts_encrypt:
-
- movq %rsp,%rax
-L$xts_enc_prologue:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- leaq -72(%rsp),%rsp
-
- movq %rsp,%rbp
-
- movq %rdi,%r12
- movq %rsi,%r13
- movq %rdx,%r14
- movq %rcx,%r15
-
- leaq (%r9),%rdi
- leaq 32(%rbp),%rsi
- leaq (%r8),%rdx
- call _aes_nohw_encrypt
-
- movl 240(%r15),%eax
- movq %r14,%rbx
-
- movl %eax,%edx
- shlq $7,%rax
- subq $96,%rax
- subq %rax,%rsp
-
- movq %rsp,%rax
- movq %r15,%rcx
- movl %edx,%r10d
- call _bsaes_key_convert
- pxor %xmm6,%xmm7
- movdqa %xmm7,(%rax)
-
- andq $-16,%r14
- subq $0x80,%rsp
- movdqa 32(%rbp),%xmm6
-
- pxor %xmm14,%xmm14
- movdqa L$xts_magic(%rip),%xmm12
- pcmpgtd %xmm6,%xmm14
-
- subq $0x80,%r14
- jc L$xts_enc_short
- jmp L$xts_enc_loop
-
-.p2align 4
-L$xts_enc_loop:
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm15
- movdqa %xmm6,0(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm0
- movdqa %xmm6,16(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 0(%r12),%xmm7
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm1
- movdqa %xmm6,32(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 16(%r12),%xmm8
- pxor %xmm7,%xmm15
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm2
- movdqa %xmm6,48(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 32(%r12),%xmm9
- pxor %xmm8,%xmm0
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm3
- movdqa %xmm6,64(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 48(%r12),%xmm10
- pxor %xmm9,%xmm1
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm4
- movdqa %xmm6,80(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 64(%r12),%xmm11
- pxor %xmm10,%xmm2
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm5
- movdqa %xmm6,96(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 80(%r12),%xmm12
- pxor %xmm11,%xmm3
- movdqu 96(%r12),%xmm13
- pxor %xmm12,%xmm4
- movdqu 112(%r12),%xmm14
- leaq 128(%r12),%r12
- movdqa %xmm6,112(%rsp)
- pxor %xmm13,%xmm5
- leaq 128(%rsp),%rax
- pxor %xmm14,%xmm6
- movl %edx,%r10d
-
- call _bsaes_encrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm3
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm5
- movdqu %xmm3,32(%r13)
- pxor 64(%rsp),%xmm2
- movdqu %xmm5,48(%r13)
- pxor 80(%rsp),%xmm6
- movdqu %xmm2,64(%r13)
- pxor 96(%rsp),%xmm1
- movdqu %xmm6,80(%r13)
- pxor 112(%rsp),%xmm4
- movdqu %xmm1,96(%r13)
- movdqu %xmm4,112(%r13)
- leaq 128(%r13),%r13
-
- movdqa 112(%rsp),%xmm6
- pxor %xmm14,%xmm14
- movdqa L$xts_magic(%rip),%xmm12
- pcmpgtd %xmm6,%xmm14
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
-
- subq $0x80,%r14
- jnc L$xts_enc_loop
-
-L$xts_enc_short:
- addq $0x80,%r14
- jz L$xts_enc_done
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm15
- movdqa %xmm6,0(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm0
- movdqa %xmm6,16(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 0(%r12),%xmm7
- cmpq $16,%r14
- je L$xts_enc_1
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm1
- movdqa %xmm6,32(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 16(%r12),%xmm8
- cmpq $32,%r14
- je L$xts_enc_2
- pxor %xmm7,%xmm15
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm2
- movdqa %xmm6,48(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 32(%r12),%xmm9
- cmpq $48,%r14
- je L$xts_enc_3
- pxor %xmm8,%xmm0
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm3
- movdqa %xmm6,64(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 48(%r12),%xmm10
- cmpq $64,%r14
- je L$xts_enc_4
- pxor %xmm9,%xmm1
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm4
- movdqa %xmm6,80(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 64(%r12),%xmm11
- cmpq $80,%r14
- je L$xts_enc_5
- pxor %xmm10,%xmm2
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm5
- movdqa %xmm6,96(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 80(%r12),%xmm12
- cmpq $96,%r14
- je L$xts_enc_6
- pxor %xmm11,%xmm3
- movdqu 96(%r12),%xmm13
- pxor %xmm12,%xmm4
- movdqa %xmm6,112(%rsp)
- leaq 112(%r12),%r12
- pxor %xmm13,%xmm5
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_encrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm3
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm5
- movdqu %xmm3,32(%r13)
- pxor 64(%rsp),%xmm2
- movdqu %xmm5,48(%r13)
- pxor 80(%rsp),%xmm6
- movdqu %xmm2,64(%r13)
- pxor 96(%rsp),%xmm1
- movdqu %xmm6,80(%r13)
- movdqu %xmm1,96(%r13)
- leaq 112(%r13),%r13
-
- movdqa 112(%rsp),%xmm6
- jmp L$xts_enc_done
-.p2align 4
-L$xts_enc_6:
- pxor %xmm11,%xmm3
- leaq 96(%r12),%r12
- pxor %xmm12,%xmm4
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_encrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm3
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm5
- movdqu %xmm3,32(%r13)
- pxor 64(%rsp),%xmm2
- movdqu %xmm5,48(%r13)
- pxor 80(%rsp),%xmm6
- movdqu %xmm2,64(%r13)
- movdqu %xmm6,80(%r13)
- leaq 96(%r13),%r13
-
- movdqa 96(%rsp),%xmm6
- jmp L$xts_enc_done
-.p2align 4
-L$xts_enc_5:
- pxor %xmm10,%xmm2
- leaq 80(%r12),%r12
- pxor %xmm11,%xmm3
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_encrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm3
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm5
- movdqu %xmm3,32(%r13)
- pxor 64(%rsp),%xmm2
- movdqu %xmm5,48(%r13)
- movdqu %xmm2,64(%r13)
- leaq 80(%r13),%r13
-
- movdqa 80(%rsp),%xmm6
- jmp L$xts_enc_done
-.p2align 4
-L$xts_enc_4:
- pxor %xmm9,%xmm1
- leaq 64(%r12),%r12
- pxor %xmm10,%xmm2
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_encrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm3
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm5
- movdqu %xmm3,32(%r13)
- movdqu %xmm5,48(%r13)
- leaq 64(%r13),%r13
-
- movdqa 64(%rsp),%xmm6
- jmp L$xts_enc_done
-.p2align 4
-L$xts_enc_3:
- pxor %xmm8,%xmm0
- leaq 48(%r12),%r12
- pxor %xmm9,%xmm1
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_encrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm3
- movdqu %xmm0,16(%r13)
- movdqu %xmm3,32(%r13)
- leaq 48(%r13),%r13
-
- movdqa 48(%rsp),%xmm6
- jmp L$xts_enc_done
-.p2align 4
-L$xts_enc_2:
- pxor %xmm7,%xmm15
- leaq 32(%r12),%r12
- pxor %xmm8,%xmm0
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_encrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- movdqu %xmm0,16(%r13)
- leaq 32(%r13),%r13
-
- movdqa 32(%rsp),%xmm6
- jmp L$xts_enc_done
-.p2align 4
-L$xts_enc_1:
- pxor %xmm15,%xmm7
- leaq 16(%r12),%r12
- movdqa %xmm7,32(%rbp)
- leaq 32(%rbp),%rdi
- leaq 32(%rbp),%rsi
- leaq (%r15),%rdx
- call _aes_nohw_encrypt
- pxor 32(%rbp),%xmm15
-
-
-
-
-
- movdqu %xmm15,0(%r13)
- leaq 16(%r13),%r13
-
- movdqa 16(%rsp),%xmm6
-
-L$xts_enc_done:
- andl $15,%ebx
- jz L$xts_enc_ret
- movq %r13,%rdx
-
-L$xts_enc_steal:
- movzbl (%r12),%eax
- movzbl -16(%rdx),%ecx
- leaq 1(%r12),%r12
- movb %al,-16(%rdx)
- movb %cl,0(%rdx)
- leaq 1(%rdx),%rdx
- subl $1,%ebx
- jnz L$xts_enc_steal
-
- movdqu -16(%r13),%xmm15
- leaq 32(%rbp),%rdi
- pxor %xmm6,%xmm15
- leaq 32(%rbp),%rsi
- movdqa %xmm15,32(%rbp)
- leaq (%r15),%rdx
- call _aes_nohw_encrypt
- pxor 32(%rbp),%xmm6
- movdqu %xmm6,-16(%r13)
-
-L$xts_enc_ret:
- leaq (%rsp),%rax
- pxor %xmm0,%xmm0
-L$xts_enc_bzero:
- movdqa %xmm0,0(%rax)
- movdqa %xmm0,16(%rax)
- leaq 32(%rax),%rax
- cmpq %rax,%rbp
- ja L$xts_enc_bzero
-
- leaq 120(%rbp),%rax
-
- movq -48(%rax),%r15
-
- movq -40(%rax),%r14
-
- movq -32(%rax),%r13
-
- movq -24(%rax),%r12
-
- movq -16(%rax),%rbx
-
- movq -8(%rax),%rbp
-
- leaq (%rax),%rsp
-
-L$xts_enc_epilogue:
- .byte 0xf3,0xc3
-
-
-
-.globl _bsaes_xts_decrypt
-.private_extern _bsaes_xts_decrypt
-
-.p2align 4
-_bsaes_xts_decrypt:
-
- movq %rsp,%rax
-L$xts_dec_prologue:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- leaq -72(%rsp),%rsp
-
- movq %rsp,%rbp
- movq %rdi,%r12
- movq %rsi,%r13
- movq %rdx,%r14
- movq %rcx,%r15
-
- leaq (%r9),%rdi
- leaq 32(%rbp),%rsi
- leaq (%r8),%rdx
- call _aes_nohw_encrypt
-
- movl 240(%r15),%eax
- movq %r14,%rbx
-
- movl %eax,%edx
- shlq $7,%rax
- subq $96,%rax
- subq %rax,%rsp
-
- movq %rsp,%rax
- movq %r15,%rcx
- movl %edx,%r10d
- call _bsaes_key_convert
- pxor (%rsp),%xmm7
- movdqa %xmm6,(%rax)
- movdqa %xmm7,(%rsp)
-
- xorl %eax,%eax
- andq $-16,%r14
- testl $15,%ebx
- setnz %al
- shlq $4,%rax
- subq %rax,%r14
-
- subq $0x80,%rsp
- movdqa 32(%rbp),%xmm6
-
- pxor %xmm14,%xmm14
- movdqa L$xts_magic(%rip),%xmm12
- pcmpgtd %xmm6,%xmm14
-
- subq $0x80,%r14
- jc L$xts_dec_short
- jmp L$xts_dec_loop
-
-.p2align 4
-L$xts_dec_loop:
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm15
- movdqa %xmm6,0(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm0
- movdqa %xmm6,16(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 0(%r12),%xmm7
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm1
- movdqa %xmm6,32(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 16(%r12),%xmm8
- pxor %xmm7,%xmm15
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm2
- movdqa %xmm6,48(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 32(%r12),%xmm9
- pxor %xmm8,%xmm0
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm3
- movdqa %xmm6,64(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 48(%r12),%xmm10
- pxor %xmm9,%xmm1
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm4
- movdqa %xmm6,80(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 64(%r12),%xmm11
- pxor %xmm10,%xmm2
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm5
- movdqa %xmm6,96(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 80(%r12),%xmm12
- pxor %xmm11,%xmm3
- movdqu 96(%r12),%xmm13
- pxor %xmm12,%xmm4
- movdqu 112(%r12),%xmm14
- leaq 128(%r12),%r12
- movdqa %xmm6,112(%rsp)
- pxor %xmm13,%xmm5
- leaq 128(%rsp),%rax
- pxor %xmm14,%xmm6
- movl %edx,%r10d
-
- call _bsaes_decrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm5
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm3
- movdqu %xmm5,32(%r13)
- pxor 64(%rsp),%xmm1
- movdqu %xmm3,48(%r13)
- pxor 80(%rsp),%xmm6
- movdqu %xmm1,64(%r13)
- pxor 96(%rsp),%xmm2
- movdqu %xmm6,80(%r13)
- pxor 112(%rsp),%xmm4
- movdqu %xmm2,96(%r13)
- movdqu %xmm4,112(%r13)
- leaq 128(%r13),%r13
-
- movdqa 112(%rsp),%xmm6
- pxor %xmm14,%xmm14
- movdqa L$xts_magic(%rip),%xmm12
- pcmpgtd %xmm6,%xmm14
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
-
- subq $0x80,%r14
- jnc L$xts_dec_loop
-
-L$xts_dec_short:
- addq $0x80,%r14
- jz L$xts_dec_done
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm15
- movdqa %xmm6,0(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm0
- movdqa %xmm6,16(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 0(%r12),%xmm7
- cmpq $16,%r14
- je L$xts_dec_1
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm1
- movdqa %xmm6,32(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 16(%r12),%xmm8
- cmpq $32,%r14
- je L$xts_dec_2
- pxor %xmm7,%xmm15
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm2
- movdqa %xmm6,48(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 32(%r12),%xmm9
- cmpq $48,%r14
- je L$xts_dec_3
- pxor %xmm8,%xmm0
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm3
- movdqa %xmm6,64(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 48(%r12),%xmm10
- cmpq $64,%r14
- je L$xts_dec_4
- pxor %xmm9,%xmm1
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm4
- movdqa %xmm6,80(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 64(%r12),%xmm11
- cmpq $80,%r14
- je L$xts_dec_5
- pxor %xmm10,%xmm2
- pshufd $0x13,%xmm14,%xmm13
- pxor %xmm14,%xmm14
- movdqa %xmm6,%xmm5
- movdqa %xmm6,96(%rsp)
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- pcmpgtd %xmm6,%xmm14
- pxor %xmm13,%xmm6
- movdqu 80(%r12),%xmm12
- cmpq $96,%r14
- je L$xts_dec_6
- pxor %xmm11,%xmm3
- movdqu 96(%r12),%xmm13
- pxor %xmm12,%xmm4
- movdqa %xmm6,112(%rsp)
- leaq 112(%r12),%r12
- pxor %xmm13,%xmm5
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_decrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm5
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm3
- movdqu %xmm5,32(%r13)
- pxor 64(%rsp),%xmm1
- movdqu %xmm3,48(%r13)
- pxor 80(%rsp),%xmm6
- movdqu %xmm1,64(%r13)
- pxor 96(%rsp),%xmm2
- movdqu %xmm6,80(%r13)
- movdqu %xmm2,96(%r13)
- leaq 112(%r13),%r13
-
- movdqa 112(%rsp),%xmm6
- jmp L$xts_dec_done
-.p2align 4
-L$xts_dec_6:
- pxor %xmm11,%xmm3
- leaq 96(%r12),%r12
- pxor %xmm12,%xmm4
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_decrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm5
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm3
- movdqu %xmm5,32(%r13)
- pxor 64(%rsp),%xmm1
- movdqu %xmm3,48(%r13)
- pxor 80(%rsp),%xmm6
- movdqu %xmm1,64(%r13)
- movdqu %xmm6,80(%r13)
- leaq 96(%r13),%r13
-
- movdqa 96(%rsp),%xmm6
- jmp L$xts_dec_done
-.p2align 4
-L$xts_dec_5:
- pxor %xmm10,%xmm2
- leaq 80(%r12),%r12
- pxor %xmm11,%xmm3
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_decrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm5
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm3
- movdqu %xmm5,32(%r13)
- pxor 64(%rsp),%xmm1
- movdqu %xmm3,48(%r13)
- movdqu %xmm1,64(%r13)
- leaq 80(%r13),%r13
-
- movdqa 80(%rsp),%xmm6
- jmp L$xts_dec_done
-.p2align 4
-L$xts_dec_4:
- pxor %xmm9,%xmm1
- leaq 64(%r12),%r12
- pxor %xmm10,%xmm2
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_decrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm5
- movdqu %xmm0,16(%r13)
- pxor 48(%rsp),%xmm3
- movdqu %xmm5,32(%r13)
- movdqu %xmm3,48(%r13)
- leaq 64(%r13),%r13
-
- movdqa 64(%rsp),%xmm6
- jmp L$xts_dec_done
-.p2align 4
-L$xts_dec_3:
- pxor %xmm8,%xmm0
- leaq 48(%r12),%r12
- pxor %xmm9,%xmm1
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_decrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- pxor 32(%rsp),%xmm5
- movdqu %xmm0,16(%r13)
- movdqu %xmm5,32(%r13)
- leaq 48(%r13),%r13
-
- movdqa 48(%rsp),%xmm6
- jmp L$xts_dec_done
-.p2align 4
-L$xts_dec_2:
- pxor %xmm7,%xmm15
- leaq 32(%r12),%r12
- pxor %xmm8,%xmm0
- leaq 128(%rsp),%rax
- movl %edx,%r10d
-
- call _bsaes_decrypt8
-
- pxor 0(%rsp),%xmm15
- pxor 16(%rsp),%xmm0
- movdqu %xmm15,0(%r13)
- movdqu %xmm0,16(%r13)
- leaq 32(%r13),%r13
-
- movdqa 32(%rsp),%xmm6
- jmp L$xts_dec_done
-.p2align 4
-L$xts_dec_1:
- pxor %xmm15,%xmm7
- leaq 16(%r12),%r12
- movdqa %xmm7,32(%rbp)
- leaq 32(%rbp),%rdi
- leaq 32(%rbp),%rsi
- leaq (%r15),%rdx
- call _aes_nohw_decrypt
- pxor 32(%rbp),%xmm15
-
-
-
-
-
- movdqu %xmm15,0(%r13)
- leaq 16(%r13),%r13
-
- movdqa 16(%rsp),%xmm6
-
-L$xts_dec_done:
- andl $15,%ebx
- jz L$xts_dec_ret
-
- pxor %xmm14,%xmm14
- movdqa L$xts_magic(%rip),%xmm12
- pcmpgtd %xmm6,%xmm14
- pshufd $0x13,%xmm14,%xmm13
- movdqa %xmm6,%xmm5
- paddq %xmm6,%xmm6
- pand %xmm12,%xmm13
- movdqu (%r12),%xmm15
- pxor %xmm13,%xmm6
-
- leaq 32(%rbp),%rdi
- pxor %xmm6,%xmm15
- leaq 32(%rbp),%rsi
- movdqa %xmm15,32(%rbp)
- leaq (%r15),%rdx
- call _aes_nohw_decrypt
- pxor 32(%rbp),%xmm6
- movq %r13,%rdx
- movdqu %xmm6,(%r13)
-
-L$xts_dec_steal:
- movzbl 16(%r12),%eax
- movzbl (%rdx),%ecx
- leaq 1(%r12),%r12
- movb %al,(%rdx)
- movb %cl,16(%rdx)
- leaq 1(%rdx),%rdx
- subl $1,%ebx
- jnz L$xts_dec_steal
-
- movdqu (%r13),%xmm15
- leaq 32(%rbp),%rdi
- pxor %xmm5,%xmm15
- leaq 32(%rbp),%rsi
- movdqa %xmm15,32(%rbp)
- leaq (%r15),%rdx
- call _aes_nohw_decrypt
- pxor 32(%rbp),%xmm5
- movdqu %xmm5,(%r13)
-
-L$xts_dec_ret:
- leaq (%rsp),%rax
- pxor %xmm0,%xmm0
-L$xts_dec_bzero:
- movdqa %xmm0,0(%rax)
- movdqa %xmm0,16(%rax)
- leaq 32(%rax),%rax
- cmpq %rax,%rbp
- ja L$xts_dec_bzero
-
- leaq 120(%rbp),%rax
-
- movq -48(%rax),%r15
-
- movq -40(%rax),%r14
-
- movq -32(%rax),%r13
-
- movq -24(%rax),%r12
-
- movq -16(%rax),%rbx
-
- movq -8(%rax),%rbp
-
- leaq (%rax),%rsp
-
-L$xts_dec_epilogue:
- .byte 0xf3,0xc3
-
-
.p2align 6
_bsaes_const:
diff --git a/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S b/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S
index 8a6d16cd..c550e794 100644
--- a/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S
+++ b/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S
@@ -24,8 +24,6 @@ _sha512_block_data_order:
movl 0(%r11),%r9d
movl 4(%r11),%r10d
movl 8(%r11),%r11d
- testl $2048,%r10d
- jnz L$xop_shortcut
andl $1073741824,%r9d
andl $268435968,%r10d
orl %r9d,%r10d
@@ -1826,1107 +1824,6 @@ K512:
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
-sha512_block_data_order_xop:
-
-L$xop_shortcut:
- movq %rsp,%rax
-
- pushq %rbx
-
- pushq %rbp
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- shlq $4,%rdx
- subq $160,%rsp
- leaq (%rsi,%rdx,8),%rdx
- andq $-64,%rsp
- movq %rdi,128+0(%rsp)
- movq %rsi,128+8(%rsp)
- movq %rdx,128+16(%rsp)
- movq %rax,152(%rsp)
-
-L$prologue_xop:
-
- vzeroupper
- movq 0(%rdi),%rax
- movq 8(%rdi),%rbx
- movq 16(%rdi),%rcx
- movq 24(%rdi),%rdx
- movq 32(%rdi),%r8
- movq 40(%rdi),%r9
- movq 48(%rdi),%r10
- movq 56(%rdi),%r11
- jmp L$loop_xop
-.p2align 4
-L$loop_xop:
- vmovdqa K512+1280(%rip),%xmm11
- vmovdqu 0(%rsi),%xmm0
- leaq K512+128(%rip),%rbp
- vmovdqu 16(%rsi),%xmm1
- vmovdqu 32(%rsi),%xmm2
- vpshufb %xmm11,%xmm0,%xmm0
- vmovdqu 48(%rsi),%xmm3
- vpshufb %xmm11,%xmm1,%xmm1
- vmovdqu 64(%rsi),%xmm4
- vpshufb %xmm11,%xmm2,%xmm2
- vmovdqu 80(%rsi),%xmm5
- vpshufb %xmm11,%xmm3,%xmm3
- vmovdqu 96(%rsi),%xmm6
- vpshufb %xmm11,%xmm4,%xmm4
- vmovdqu 112(%rsi),%xmm7
- vpshufb %xmm11,%xmm5,%xmm5
- vpaddq -128(%rbp),%xmm0,%xmm8
- vpshufb %xmm11,%xmm6,%xmm6
- vpaddq -96(%rbp),%xmm1,%xmm9
- vpshufb %xmm11,%xmm7,%xmm7
- vpaddq -64(%rbp),%xmm2,%xmm10
- vpaddq -32(%rbp),%xmm3,%xmm11
- vmovdqa %xmm8,0(%rsp)
- vpaddq 0(%rbp),%xmm4,%xmm8
- vmovdqa %xmm9,16(%rsp)
- vpaddq 32(%rbp),%xmm5,%xmm9
- vmovdqa %xmm10,32(%rsp)
- vpaddq 64(%rbp),%xmm6,%xmm10
- vmovdqa %xmm11,48(%rsp)
- vpaddq 96(%rbp),%xmm7,%xmm11
- vmovdqa %xmm8,64(%rsp)
- movq %rax,%r14
- vmovdqa %xmm9,80(%rsp)
- movq %rbx,%rdi
- vmovdqa %xmm10,96(%rsp)
- xorq %rcx,%rdi
- vmovdqa %xmm11,112(%rsp)
- movq %r8,%r13
- jmp L$xop_00_47
-
-.p2align 4
-L$xop_00_47:
- addq $256,%rbp
- vpalignr $8,%xmm0,%xmm1,%xmm8
- rorq $23,%r13
- movq %r14,%rax
- vpalignr $8,%xmm4,%xmm5,%xmm11
- movq %r9,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r8,%r13
- xorq %r10,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rax,%r14
- vpaddq %xmm11,%xmm0,%xmm0
- andq %r8,%r12
- xorq %r8,%r13
- addq 0(%rsp),%r11
- movq %rax,%r15
-.byte 143,72,120,195,209,7
- xorq %r10,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,223,3
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm7,%xmm10
- addq %r11,%rdx
- addq %rdi,%r11
- vpaddq %xmm8,%xmm0,%xmm0
- movq %rdx,%r13
- addq %r11,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r11
- vpxor %xmm10,%xmm11,%xmm11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- vpaddq %xmm11,%xmm0,%xmm0
- addq 8(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- vpaddq -128(%rbp),%xmm0,%xmm10
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,0(%rsp)
- vpalignr $8,%xmm1,%xmm2,%xmm8
- rorq $23,%r13
- movq %r14,%r10
- vpalignr $8,%xmm5,%xmm6,%xmm11
- movq %rdx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rcx,%r13
- xorq %r8,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r10,%r14
- vpaddq %xmm11,%xmm1,%xmm1
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 16(%rsp),%r9
- movq %r10,%r15
-.byte 143,72,120,195,209,7
- xorq %r8,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,216,3
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm0,%xmm10
- addq %r9,%rbx
- addq %rdi,%r9
- vpaddq %xmm8,%xmm1,%xmm1
- movq %rbx,%r13
- addq %r9,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r9
- vpxor %xmm10,%xmm11,%xmm11
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- vpaddq %xmm11,%xmm1,%xmm1
- addq 24(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- vpaddq -96(%rbp),%xmm1,%xmm10
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,16(%rsp)
- vpalignr $8,%xmm2,%xmm3,%xmm8
- rorq $23,%r13
- movq %r14,%r8
- vpalignr $8,%xmm6,%xmm7,%xmm11
- movq %rbx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rax,%r13
- xorq %rcx,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r8,%r14
- vpaddq %xmm11,%xmm2,%xmm2
- andq %rax,%r12
- xorq %rax,%r13
- addq 32(%rsp),%rdx
- movq %r8,%r15
-.byte 143,72,120,195,209,7
- xorq %rcx,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,217,3
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm1,%xmm10
- addq %rdx,%r11
- addq %rdi,%rdx
- vpaddq %xmm8,%xmm2,%xmm2
- movq %r11,%r13
- addq %rdx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rdx
- vpxor %xmm10,%xmm11,%xmm11
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- vpaddq %xmm11,%xmm2,%xmm2
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- vpaddq -64(%rbp),%xmm2,%xmm10
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,32(%rsp)
- vpalignr $8,%xmm3,%xmm4,%xmm8
- rorq $23,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm7,%xmm0,%xmm11
- movq %r11,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r10,%r13
- xorq %rax,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rcx,%r14
- vpaddq %xmm11,%xmm3,%xmm3
- andq %r10,%r12
- xorq %r10,%r13
- addq 48(%rsp),%rbx
- movq %rcx,%r15
-.byte 143,72,120,195,209,7
- xorq %rax,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,218,3
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm2,%xmm10
- addq %rbx,%r9
- addq %rdi,%rbx
- vpaddq %xmm8,%xmm3,%xmm3
- movq %r9,%r13
- addq %rbx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rbx
- vpxor %xmm10,%xmm11,%xmm11
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- vpaddq %xmm11,%xmm3,%xmm3
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- vpaddq -32(%rbp),%xmm3,%xmm10
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,48(%rsp)
- vpalignr $8,%xmm4,%xmm5,%xmm8
- rorq $23,%r13
- movq %r14,%rax
- vpalignr $8,%xmm0,%xmm1,%xmm11
- movq %r9,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r8,%r13
- xorq %r10,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rax,%r14
- vpaddq %xmm11,%xmm4,%xmm4
- andq %r8,%r12
- xorq %r8,%r13
- addq 64(%rsp),%r11
- movq %rax,%r15
-.byte 143,72,120,195,209,7
- xorq %r10,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,219,3
- xorq %rax,%r14
- addq %r13,%r11
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rbx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm3,%xmm10
- addq %r11,%rdx
- addq %rdi,%r11
- vpaddq %xmm8,%xmm4,%xmm4
- movq %rdx,%r13
- addq %r11,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r11
- vpxor %xmm10,%xmm11,%xmm11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- vpaddq %xmm11,%xmm4,%xmm4
- addq 72(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- vpaddq 0(%rbp),%xmm4,%xmm10
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- vmovdqa %xmm10,64(%rsp)
- vpalignr $8,%xmm5,%xmm6,%xmm8
- rorq $23,%r13
- movq %r14,%r10
- vpalignr $8,%xmm1,%xmm2,%xmm11
- movq %rdx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rcx,%r13
- xorq %r8,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r10,%r14
- vpaddq %xmm11,%xmm5,%xmm5
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 80(%rsp),%r9
- movq %r10,%r15
-.byte 143,72,120,195,209,7
- xorq %r8,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,220,3
- xorq %r10,%r14
- addq %r13,%r9
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r11,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm4,%xmm10
- addq %r9,%rbx
- addq %rdi,%r9
- vpaddq %xmm8,%xmm5,%xmm5
- movq %rbx,%r13
- addq %r9,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%r9
- vpxor %xmm10,%xmm11,%xmm11
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- vpaddq %xmm11,%xmm5,%xmm5
- addq 88(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- vpaddq 32(%rbp),%xmm5,%xmm10
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- vmovdqa %xmm10,80(%rsp)
- vpalignr $8,%xmm6,%xmm7,%xmm8
- rorq $23,%r13
- movq %r14,%r8
- vpalignr $8,%xmm2,%xmm3,%xmm11
- movq %rbx,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %rax,%r13
- xorq %rcx,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %r8,%r14
- vpaddq %xmm11,%xmm6,%xmm6
- andq %rax,%r12
- xorq %rax,%r13
- addq 96(%rsp),%rdx
- movq %r8,%r15
-.byte 143,72,120,195,209,7
- xorq %rcx,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,221,3
- xorq %r8,%r14
- addq %r13,%rdx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %r9,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm5,%xmm10
- addq %rdx,%r11
- addq %rdi,%rdx
- vpaddq %xmm8,%xmm6,%xmm6
- movq %r11,%r13
- addq %rdx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rdx
- vpxor %xmm10,%xmm11,%xmm11
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- vpaddq %xmm11,%xmm6,%xmm6
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- vpaddq 64(%rbp),%xmm6,%xmm10
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- vmovdqa %xmm10,96(%rsp)
- vpalignr $8,%xmm7,%xmm0,%xmm8
- rorq $23,%r13
- movq %r14,%rcx
- vpalignr $8,%xmm3,%xmm4,%xmm11
- movq %r11,%r12
- rorq $5,%r14
-.byte 143,72,120,195,200,56
- xorq %r10,%r13
- xorq %rax,%r12
- vpsrlq $7,%xmm8,%xmm8
- rorq $4,%r13
- xorq %rcx,%r14
- vpaddq %xmm11,%xmm7,%xmm7
- andq %r10,%r12
- xorq %r10,%r13
- addq 112(%rsp),%rbx
- movq %rcx,%r15
-.byte 143,72,120,195,209,7
- xorq %rax,%r12
- rorq $6,%r14
- vpxor %xmm9,%xmm8,%xmm8
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
-.byte 143,104,120,195,222,3
- xorq %rcx,%r14
- addq %r13,%rbx
- vpxor %xmm10,%xmm8,%xmm8
- xorq %rdx,%rdi
- rorq $28,%r14
- vpsrlq $6,%xmm6,%xmm10
- addq %rbx,%r9
- addq %rdi,%rbx
- vpaddq %xmm8,%xmm7,%xmm7
- movq %r9,%r13
- addq %rbx,%r14
-.byte 143,72,120,195,203,42
- rorq $23,%r13
- movq %r14,%rbx
- vpxor %xmm10,%xmm11,%xmm11
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- vpxor %xmm9,%xmm11,%xmm11
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- vpaddq %xmm11,%xmm7,%xmm7
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- vpaddq 96(%rbp),%xmm7,%xmm10
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- vmovdqa %xmm10,112(%rsp)
- cmpb $0,135(%rbp)
- jne L$xop_00_47
- rorq $23,%r13
- movq %r14,%rax
- movq %r9,%r12
- rorq $5,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- rorq $4,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 0(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- rorq $6,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- rorq $28,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- rorq $23,%r13
- movq %r14,%r11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 8(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- rorq $23,%r13
- movq %r14,%r10
- movq %rdx,%r12
- rorq $5,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- rorq $4,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 16(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- rorq $6,%r14
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- rorq $28,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- rorq $23,%r13
- movq %r14,%r9
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 24(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- rorq $23,%r13
- movq %r14,%r8
- movq %rbx,%r12
- rorq $5,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- rorq $4,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 32(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- rorq $6,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- rorq $28,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- rorq $23,%r13
- movq %r14,%rdx
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 40(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- rorq $23,%r13
- movq %r14,%rcx
- movq %r11,%r12
- rorq $5,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- rorq $4,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 48(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- rorq $6,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- rorq $28,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- rorq $23,%r13
- movq %r14,%rbx
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 56(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- rorq $23,%r13
- movq %r14,%rax
- movq %r9,%r12
- rorq $5,%r14
- xorq %r8,%r13
- xorq %r10,%r12
- rorq $4,%r13
- xorq %rax,%r14
- andq %r8,%r12
- xorq %r8,%r13
- addq 64(%rsp),%r11
- movq %rax,%r15
- xorq %r10,%r12
- rorq $6,%r14
- xorq %rbx,%r15
- addq %r12,%r11
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rax,%r14
- addq %r13,%r11
- xorq %rbx,%rdi
- rorq $28,%r14
- addq %r11,%rdx
- addq %rdi,%r11
- movq %rdx,%r13
- addq %r11,%r14
- rorq $23,%r13
- movq %r14,%r11
- movq %r8,%r12
- rorq $5,%r14
- xorq %rdx,%r13
- xorq %r9,%r12
- rorq $4,%r13
- xorq %r11,%r14
- andq %rdx,%r12
- xorq %rdx,%r13
- addq 72(%rsp),%r10
- movq %r11,%rdi
- xorq %r9,%r12
- rorq $6,%r14
- xorq %rax,%rdi
- addq %r12,%r10
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r11,%r14
- addq %r13,%r10
- xorq %rax,%r15
- rorq $28,%r14
- addq %r10,%rcx
- addq %r15,%r10
- movq %rcx,%r13
- addq %r10,%r14
- rorq $23,%r13
- movq %r14,%r10
- movq %rdx,%r12
- rorq $5,%r14
- xorq %rcx,%r13
- xorq %r8,%r12
- rorq $4,%r13
- xorq %r10,%r14
- andq %rcx,%r12
- xorq %rcx,%r13
- addq 80(%rsp),%r9
- movq %r10,%r15
- xorq %r8,%r12
- rorq $6,%r14
- xorq %r11,%r15
- addq %r12,%r9
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r10,%r14
- addq %r13,%r9
- xorq %r11,%rdi
- rorq $28,%r14
- addq %r9,%rbx
- addq %rdi,%r9
- movq %rbx,%r13
- addq %r9,%r14
- rorq $23,%r13
- movq %r14,%r9
- movq %rcx,%r12
- rorq $5,%r14
- xorq %rbx,%r13
- xorq %rdx,%r12
- rorq $4,%r13
- xorq %r9,%r14
- andq %rbx,%r12
- xorq %rbx,%r13
- addq 88(%rsp),%r8
- movq %r9,%rdi
- xorq %rdx,%r12
- rorq $6,%r14
- xorq %r10,%rdi
- addq %r12,%r8
- rorq $14,%r13
- andq %rdi,%r15
- xorq %r9,%r14
- addq %r13,%r8
- xorq %r10,%r15
- rorq $28,%r14
- addq %r8,%rax
- addq %r15,%r8
- movq %rax,%r13
- addq %r8,%r14
- rorq $23,%r13
- movq %r14,%r8
- movq %rbx,%r12
- rorq $5,%r14
- xorq %rax,%r13
- xorq %rcx,%r12
- rorq $4,%r13
- xorq %r8,%r14
- andq %rax,%r12
- xorq %rax,%r13
- addq 96(%rsp),%rdx
- movq %r8,%r15
- xorq %rcx,%r12
- rorq $6,%r14
- xorq %r9,%r15
- addq %r12,%rdx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %r8,%r14
- addq %r13,%rdx
- xorq %r9,%rdi
- rorq $28,%r14
- addq %rdx,%r11
- addq %rdi,%rdx
- movq %r11,%r13
- addq %rdx,%r14
- rorq $23,%r13
- movq %r14,%rdx
- movq %rax,%r12
- rorq $5,%r14
- xorq %r11,%r13
- xorq %rbx,%r12
- rorq $4,%r13
- xorq %rdx,%r14
- andq %r11,%r12
- xorq %r11,%r13
- addq 104(%rsp),%rcx
- movq %rdx,%rdi
- xorq %rbx,%r12
- rorq $6,%r14
- xorq %r8,%rdi
- addq %r12,%rcx
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rdx,%r14
- addq %r13,%rcx
- xorq %r8,%r15
- rorq $28,%r14
- addq %rcx,%r10
- addq %r15,%rcx
- movq %r10,%r13
- addq %rcx,%r14
- rorq $23,%r13
- movq %r14,%rcx
- movq %r11,%r12
- rorq $5,%r14
- xorq %r10,%r13
- xorq %rax,%r12
- rorq $4,%r13
- xorq %rcx,%r14
- andq %r10,%r12
- xorq %r10,%r13
- addq 112(%rsp),%rbx
- movq %rcx,%r15
- xorq %rax,%r12
- rorq $6,%r14
- xorq %rdx,%r15
- addq %r12,%rbx
- rorq $14,%r13
- andq %r15,%rdi
- xorq %rcx,%r14
- addq %r13,%rbx
- xorq %rdx,%rdi
- rorq $28,%r14
- addq %rbx,%r9
- addq %rdi,%rbx
- movq %r9,%r13
- addq %rbx,%r14
- rorq $23,%r13
- movq %r14,%rbx
- movq %r10,%r12
- rorq $5,%r14
- xorq %r9,%r13
- xorq %r11,%r12
- rorq $4,%r13
- xorq %rbx,%r14
- andq %r9,%r12
- xorq %r9,%r13
- addq 120(%rsp),%rax
- movq %rbx,%rdi
- xorq %r11,%r12
- rorq $6,%r14
- xorq %rcx,%rdi
- addq %r12,%rax
- rorq $14,%r13
- andq %rdi,%r15
- xorq %rbx,%r14
- addq %r13,%rax
- xorq %rcx,%r15
- rorq $28,%r14
- addq %rax,%r8
- addq %r15,%rax
- movq %r8,%r13
- addq %rax,%r14
- movq 128+0(%rsp),%rdi
- movq %r14,%rax
-
- addq 0(%rdi),%rax
- leaq 128(%rsi),%rsi
- addq 8(%rdi),%rbx
- addq 16(%rdi),%rcx
- addq 24(%rdi),%rdx
- addq 32(%rdi),%r8
- addq 40(%rdi),%r9
- addq 48(%rdi),%r10
- addq 56(%rdi),%r11
-
- cmpq 128+16(%rsp),%rsi
-
- movq %rax,0(%rdi)
- movq %rbx,8(%rdi)
- movq %rcx,16(%rdi)
- movq %rdx,24(%rdi)
- movq %r8,32(%rdi)
- movq %r9,40(%rdi)
- movq %r10,48(%rdi)
- movq %r11,56(%rdi)
- jb L$loop_xop
-
- movq 152(%rsp),%rsi
-
- vzeroupper
- movq -48(%rsi),%r15
-
- movq -40(%rsi),%r14
-
- movq -32(%rsi),%r13
-
- movq -24(%rsi),%r12
-
- movq -16(%rsi),%rbp
-
- movq -8(%rsi),%rbx
-
- leaq (%rsi),%rsp
-
-L$epilogue_xop:
- .byte 0xf3,0xc3
-
-
-
-.p2align 6
sha512_block_data_order_avx:
L$avx_shortcut:
diff --git a/sources.bp b/sources.bp
index 2a580c61..1ff2059b 100644
--- a/sources.bp
+++ b/sources.bp
@@ -120,6 +120,7 @@ cc_defaults {
"src/crypto/fipsmodule/bcm.c",
"src/crypto/fipsmodule/is_fips.c",
"src/crypto/hkdf/hkdf.c",
+ "src/crypto/hrss/hrss.c",
"src/crypto/lhash/lhash.c",
"src/crypto/mem.c",
"src/crypto/obj/obj.c",
@@ -305,6 +306,7 @@ cc_defaults {
"linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S",
"linux-x86_64/crypto/fipsmodule/x86_64-mont.S",
"linux-x86_64/crypto/fipsmodule/x86_64-mont5.S",
+ "src/crypto/hrss/asm/poly_rq_mul.S",
],
},
},
@@ -420,6 +422,7 @@ cc_defaults {
"src/crypto/fipsmodule/rand/ctrdrbg_test.cc",
"src/crypto/hkdf/hkdf_test.cc",
"src/crypto/hmac_extra/hmac_test.cc",
+ "src/crypto/hrss/hrss_test.cc",
"src/crypto/lhash/lhash_test.cc",
"src/crypto/obj/obj_test.cc",
"src/crypto/pem/pem_test.cc",
diff --git a/sources.mk b/sources.mk
index b25a8424..bf72cbad 100644
--- a/sources.mk
+++ b/sources.mk
@@ -118,6 +118,7 @@ crypto_sources := \
src/crypto/fipsmodule/bcm.c\
src/crypto/fipsmodule/is_fips.c\
src/crypto/hkdf/hkdf.c\
+ src/crypto/hrss/hrss.c\
src/crypto/lhash/lhash.c\
src/crypto/mem.c\
src/crypto/obj/obj.c\
@@ -299,4 +300,5 @@ linux_x86_64_sources := \
linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S\
linux-x86_64/crypto/fipsmodule/x86_64-mont.S\
linux-x86_64/crypto/fipsmodule/x86_64-mont5.S\
+ src/crypto/hrss/asm/poly_rq_mul.S\
diff --git a/src/crypto/CMakeLists.txt b/src/crypto/CMakeLists.txt
index b1ca70e1..bf696493 100644
--- a/src/crypto/CMakeLists.txt
+++ b/src/crypto/CMakeLists.txt
@@ -131,6 +131,7 @@ if(${ARCH} STREQUAL "x86_64")
chacha/chacha-x86_64.${ASM_EXT}
cipher_extra/aes128gcmsiv-x86_64.${ASM_EXT}
cipher_extra/chacha20_poly1305_x86_64.${ASM_EXT}
+ hrss/asm/poly_rq_mul.S
)
endif()
@@ -275,6 +276,7 @@ add_library(
evp/sign.c
ex_data.c
hkdf/hkdf.c
+ hrss/hrss.c
lhash/lhash.c
mem.c
obj/obj.c
@@ -455,6 +457,7 @@ add_executable(
fipsmodule/rand/ctrdrbg_test.cc
hkdf/hkdf_test.cc
hmac_extra/hmac_test.cc
+ hrss/hrss_test.cc
lhash/lhash_test.cc
obj/obj_test.cc
pem/pem_test.cc
diff --git a/src/crypto/bio/bio.c b/src/crypto/bio/bio.c
index fe40578b..7d97c3e7 100644
--- a/src/crypto/bio/bio.c
+++ b/src/crypto/bio/bio.c
@@ -482,6 +482,31 @@ static int bio_read_all(BIO *bio, uint8_t **out, size_t *out_len,
}
}
+// bio_read_full reads |len| bytes from |bio| and writes them into |out|. It
+// tolerates partial reads from |bio| and returns one on success or zero if a
+// read fails before |len| bytes are read. On failure, it additionally sets
+// |*out_eof_on_first_read| to whether the error was due to |bio| returning zero
+// on the first read. |out_eof_on_first_read| may be NULL to discard the value.
+static int bio_read_full(BIO *bio, uint8_t *out, int *out_eof_on_first_read,
+ size_t len) {
+ int first_read = 1;
+ while (len > 0) {
+ int todo = len <= INT_MAX ? (int)len : INT_MAX;
+ int ret = BIO_read(bio, out, todo);
+ if (ret <= 0) {
+ if (out_eof_on_first_read != NULL) {
+ *out_eof_on_first_read = first_read && ret == 0;
+ }
+ return 0;
+ }
+ out += ret;
+ len -= (size_t)ret;
+ first_read = 0;
+ }
+
+ return 1;
+}
+
// For compatibility with existing |d2i_*_bio| callers, |BIO_read_asn1| uses
// |ERR_LIB_ASN1| errors.
OPENSSL_DECLARE_ERROR_REASON(ASN1, ASN1_R_DECODE_ERROR)
@@ -493,17 +518,16 @@ int BIO_read_asn1(BIO *bio, uint8_t **out, size_t *out_len, size_t max_len) {
uint8_t header[6];
static const size_t kInitialHeaderLen = 2;
- int ret = BIO_read(bio, header, kInitialHeaderLen);
- if (ret == 0) {
- // Historically, OpenSSL returned |ASN1_R_HEADER_TOO_LONG| when |d2i_*_bio|
- // could not read anything. CPython conditions on this to determine if |bio|
- // was empty.
- OPENSSL_PUT_ERROR(ASN1, ASN1_R_HEADER_TOO_LONG);
- return 0;
- }
-
- if (ret != (int) kInitialHeaderLen) {
- OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ENOUGH_DATA);
+ int eof_on_first_read;
+ if (!bio_read_full(bio, header, &eof_on_first_read, kInitialHeaderLen)) {
+ if (eof_on_first_read) {
+ // Historically, OpenSSL returned |ASN1_R_HEADER_TOO_LONG| when
+ // |d2i_*_bio| could not read anything. CPython conditions on this to
+ // determine if |bio| was empty.
+ OPENSSL_PUT_ERROR(ASN1, ASN1_R_HEADER_TOO_LONG);
+ } else {
+ OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ENOUGH_DATA);
+ }
return 0;
}
@@ -539,8 +563,7 @@ int BIO_read_asn1(BIO *bio, uint8_t **out, size_t *out_len, size_t max_len) {
return 0;
}
- if (BIO_read(bio, header + kInitialHeaderLen, num_bytes) !=
- (int)num_bytes) {
+ if (!bio_read_full(bio, header + kInitialHeaderLen, NULL, num_bytes)) {
OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ENOUGH_DATA);
return 0;
}
@@ -582,8 +605,7 @@ int BIO_read_asn1(BIO *bio, uint8_t **out, size_t *out_len, size_t max_len) {
return 0;
}
OPENSSL_memcpy(*out, header, header_len);
- if (BIO_read(bio, (*out) + header_len, len - header_len) !=
- (int) (len - header_len)) {
+ if (!bio_read_full(bio, (*out) + header_len, NULL, len - header_len)) {
OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ENOUGH_DATA);
OPENSSL_free(*out);
return 0;
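
The new bio_read_full helper above makes BIO_read_asn1 tolerant of BIOs that return short reads: the ASN.1 header and body are now accumulated over as many BIO_read calls as needed. The historical error mapping is preserved: ASN1_R_HEADER_TOO_LONG is reported only when the very first read returns zero (an empty BIO, the case CPython keys off), and ASN1_R_NOT_ENOUGH_DATA otherwise. A minimal caller-side sketch of the two failure modes, for illustration only and not part of the commit:

#include <openssl/bio.h>
#include <openssl/err.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint8_t *out = NULL;
  size_t out_len;

  // Empty input: the first read returns zero, so the historical
  // ASN1_R_HEADER_TOO_LONG error is reported.
  BIO *empty = BIO_new_mem_buf("", 0);
  if (!BIO_read_asn1(empty, &out, &out_len, 1024)) {
    printf("empty: reason %d\n", ERR_GET_REASON(ERR_get_error()));
  }
  BIO_free(empty);

  // Truncated element: the header parses (SEQUENCE, length 4) but the body
  // is short, so the error is ASN1_R_NOT_ENOUGH_DATA instead.
  static const uint8_t kTruncated[] = {0x30, 0x04, 0x02, 0x01};
  BIO *partial = BIO_new_mem_buf(kTruncated, sizeof(kTruncated));
  if (!BIO_read_asn1(partial, &out, &out_len, 1024)) {
    printf("truncated: reason %d\n", ERR_GET_REASON(ERR_get_error()));
  }
  BIO_free(partial);
  return 0;
}
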
diff --git a/src/crypto/cipher_extra/e_aesgcmsiv.c b/src/crypto/cipher_extra/e_aesgcmsiv.c
index 1deb9181..71a71fac 100644
--- a/src/crypto/cipher_extra/e_aesgcmsiv.c
+++ b/src/crypto/cipher_extra/e_aesgcmsiv.c
@@ -27,7 +27,11 @@
#define EVP_AEAD_AES_GCM_SIV_NONCE_LEN 12
#define EVP_AEAD_AES_GCM_SIV_TAG_LEN 16
-#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM)
+// TODO(davidben): AES-GCM-SIV assembly is not correct for Windows. It must save
+// and restore xmm6 through xmm15.
+#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) && \
+ !defined(OPENSSL_WINDOWS)
+#define AES_GCM_SIV_ASM
// Optimised AES-GCM-SIV
@@ -60,10 +64,10 @@ static struct aead_aes_gcm_siv_asm_ctx *asm_ctx_from_ctx(
extern void aes128gcmsiv_aes_ks(
const uint8_t key[16], uint8_t out_expanded_key[16*15]);
-// aes128gcmsiv_aes_ks writes an AES-128 key schedule for |key| to
+// aes256gcmsiv_aes_ks writes an AES-256 key schedule for |key| to
// |out_expanded_key|.
extern void aes256gcmsiv_aes_ks(
- const uint8_t key[16], uint8_t out_expanded_key[16*15]);
+ const uint8_t key[32], uint8_t out_expanded_key[16*15]);
static int aead_aes_gcm_siv_asm_init(EVP_AEAD_CTX *ctx, const uint8_t *key,
size_t key_len, size_t tag_len) {
@@ -549,7 +553,7 @@ static const EVP_AEAD aead_aes_256_gcm_siv_asm = {
NULL /* tag_len */,
};
-#endif // X86_64 && !NO_ASM
+#endif // X86_64 && !NO_ASM && !WINDOWS
struct aead_aes_gcm_siv_ctx {
union {
@@ -838,7 +842,7 @@ static const EVP_AEAD aead_aes_256_gcm_siv = {
NULL /* tag_len */,
};
-#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM)
+#if defined(AES_GCM_SIV_ASM)
static char avx_aesni_capable(void) {
const uint32_t ecx = OPENSSL_ia32cap_P[1];
@@ -871,4 +875,4 @@ const EVP_AEAD *EVP_aead_aes_256_gcm_siv(void) {
return &aead_aes_256_gcm_siv;
}
-#endif // X86_64 && !NO_ASM
+#endif // AES_GCM_SIV_ASM
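
The AES-GCM-SIV assembly is disabled on Windows because it does not save and restore xmm6 through xmm15 as the Windows x64 calling convention requires; the new AES_GCM_SIV_ASM macro now gates every assembly-only block in this file. The public entry points are unchanged: EVP_aead_aes_128_gcm_siv and EVP_aead_aes_256_gcm_siv still select the assembly at runtime only when avx_aesni_capable() reports AVX plus AES-NI, and fall back to the portable C implementation otherwise. A hedged usage sketch of the AEAD interface both paths sit behind (illustrative, not taken from the commit):

#include <openssl/aead.h>

// Seals |pt| with AES-128-GCM-SIV. Returns 1 on success. Illustrative sketch;
// key/nonce management and sizing |max_out| (pt_len + tag) are the caller's
// responsibility.
static int seal_gcm_siv(uint8_t *out, size_t *out_len, size_t max_out,
                        const uint8_t key[16], const uint8_t nonce[12],
                        const uint8_t *pt, size_t pt_len) {
  EVP_AEAD_CTX ctx;
  if (!EVP_AEAD_CTX_init(&ctx, EVP_aead_aes_128_gcm_siv(), key, 16,
                         EVP_AEAD_DEFAULT_TAG_LENGTH, NULL)) {
    return 0;
  }
  // Whether the AVX assembly or the portable C code runs is decided inside
  // the library (AES_GCM_SIV_ASM plus a runtime CPU check); callers see the
  // same behavior either way.
  int ok = EVP_AEAD_CTX_seal(&ctx, out, out_len, max_out, nonce, 12, pt,
                             pt_len, NULL, 0);
  EVP_AEAD_CTX_cleanup(&ctx);
  return ok;
}
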
diff --git a/src/crypto/cpu-intel.c b/src/crypto/cpu-intel.c
index 20cfbe8c..98d8d4e5 100644
--- a/src/crypto/cpu-intel.c
+++ b/src/crypto/cpu-intel.c
@@ -148,23 +148,6 @@ void OPENSSL_cpuid_setup(void) {
int is_intel = ebx == 0x756e6547 /* Genu */ &&
edx == 0x49656e69 /* ineI */ &&
ecx == 0x6c65746e /* ntel */;
- int is_amd = ebx == 0x68747541 /* Auth */ &&
- edx == 0x69746e65 /* enti */ &&
- ecx == 0x444d4163 /* cAMD */;
-
- int has_amd_xop = 0;
- if (is_amd) {
- // AMD-specific logic.
- // See http://developer.amd.com/wordpress/media/2012/10/254811.pdf
- OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0x80000000);
- uint32_t num_extended_ids = eax;
- if (num_extended_ids >= 0x80000001) {
- OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0x80000001);
- if (ecx & (1u << 11)) {
- has_amd_xop = 1;
- }
- }
- }
uint32_t extended_features[2] = {0};
if (num_ids >= 7) {
@@ -173,29 +156,11 @@ void OPENSSL_cpuid_setup(void) {
extended_features[1] = ecx;
}
- // Determine the number of cores sharing an L1 data cache to adjust the
- // hyper-threading bit.
- uint32_t cores_per_cache = 0;
- if (is_amd) {
- // AMD CPUs never share an L1 data cache between threads but do set the HTT
- // bit on multi-core CPUs.
- cores_per_cache = 1;
- } else if (num_ids >= 4) {
- // TODO(davidben): The Intel manual says this CPUID leaf enumerates all
- // caches using ECX and doesn't say which is first. Does this matter?
- OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 4);
- cores_per_cache = 1 + ((eax >> 14) & 0xfff);
- }
-
OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 1);
- // Adjust the hyper-threading bit.
- if (edx & (1u << 28)) {
- uint32_t num_logical_cores = (ebx >> 16) & 0xff;
- if (cores_per_cache == 1 || num_logical_cores <= 1) {
- edx &= ~(1u << 28);
- }
- }
+ // Force the hyper-threading bit so that the more conservative path is always
+ // chosen.
+ edx |= 1u << 28;
// Reserved bit #20 was historically repurposed to control the in-memory
// representation of RC4 state. Always set it to zero.
@@ -216,12 +181,9 @@ void OPENSSL_cpuid_setup(void) {
edx &= ~(1u << 30);
}
- // The SDBG bit is repurposed to denote AMD XOP support.
- if (has_amd_xop) {
- ecx |= (1u << 11);
- } else {
- ecx &= ~(1u << 11);
- }
+ // The SDBG bit is repurposed to denote AMD XOP support. Don't ever use AMD
+ // XOP code paths.
+ ecx &= ~(1u << 11);
uint64_t xcr0 = 0;
if (ecx & (1u << 27)) {
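
cpu-intel.c no longer probes AMD-specific CPUID leaves. The cached capability vector is instead normalized unconditionally: bit 28 of the leaf-1 EDX word (hyper-threading) is always forced on so callers take the more conservative path, and bit 11 of the leaf-1 ECX word, which BoringSSL repurposes to mean AMD XOP, is always cleared so XOP code is never selected. Other hunks in this diff test the same cached words; the sketch below shows what such bit tests look like, assuming the usual x86-64 layout of OPENSSL_ia32cap_P (word 0 = CPUID.1:EDX, word 1 = CPUID.1:ECX). These helpers are illustrative only, not library API:

#include <openssl/cpu.h>
#include <stdint.h>

// After this change, the HTT bit reads as set and the repurposed XOP bit
// reads as clear on all Intel/AMD CPUs.
static inline int cached_htt(void) { return (OPENSSL_ia32cap_P[0] >> 28) & 1; }  // hyper-threading
static inline int cached_xop(void) { return (OPENSSL_ia32cap_P[1] >> 11) & 1; }  // AMD XOP (repurposed SDBG)
static inline int cached_avx(void) { return (OPENSSL_ia32cap_P[1] >> 28) & 1; }  // AVX, as tested in p256 below
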
diff --git a/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl b/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl
index 11b7a9d6..c0ade374 100644
--- a/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl
+++ b/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl
@@ -114,6 +114,7 @@ open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
+my $xts=0; # Also patch out the XTS subroutines.
{
my ($key,$rounds,$const)=("%rax","%r10d","%r11");
@@ -2163,6 +2164,8 @@ ___
# const AES_KEY *key1, const AES_KEY *key2,
# const unsigned char iv[16]);
#
+# We patch out the XTS implementation in BoringSSL.
+if ($xts) {
my ($twmask,$twres,$twtmp)=@XMM[13..15];
$arg6=~s/d$//;
@@ -2991,6 +2994,7 @@ $code.=<<___;
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
+} # $xts
$code.=<<___;
.type _bsaes_const,\@object
.align 64
@@ -3172,7 +3176,8 @@ $code.=<<___;
.rva .Lctr_enc_prologue
.rva .Lctr_enc_epilogue
.rva .Lctr_enc_info
-
+___
+$code.=<<___ if ($xts);
.rva .Lxts_enc_prologue
.rva .Lxts_enc_epilogue
.rva .Lxts_enc_info
@@ -3180,6 +3185,8 @@ $code.=<<___;
.rva .Lxts_dec_prologue
.rva .Lxts_dec_epilogue
.rva .Lxts_dec_info
+___
+$code.=<<___;
.section .xdata
.align 8
@@ -3211,6 +3218,8 @@ $code.=<<___;
.rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
.rva .Lctr_enc_tail
.long 0
+___
+$code.=<<___ if ($xts);
.Lxts_enc_info:
.byte 9,0,0,0
.rva se_handler
diff --git a/src/crypto/fipsmodule/aes/internal.h b/src/crypto/fipsmodule/aes/internal.h
index a9f8a8c7..5f9ee312 100644
--- a/src/crypto/fipsmodule/aes/internal.h
+++ b/src/crypto/fipsmodule/aes/internal.h
@@ -31,7 +31,7 @@ extern "C" {
#define HWAES_ECB
static int hwaes_capable(void) {
- return (OPENSSL_ia32cap_P[1] & (1 << (57 - 32))) != 0;
+ return (OPENSSL_ia32cap_get()[1] & (1 << (57 - 32))) != 0;
}
#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
#define HWAES
diff --git a/src/crypto/fipsmodule/cipher/e_aes.c b/src/crypto/fipsmodule/cipher/e_aes.c
index 068465b3..2ccec442 100644
--- a/src/crypto/fipsmodule/cipher/e_aes.c
+++ b/src/crypto/fipsmodule/cipher/e_aes.c
@@ -102,7 +102,7 @@ typedef struct {
(defined(OPENSSL_X86_64) || defined(OPENSSL_X86))
#define VPAES
static char vpaes_capable(void) {
- return (OPENSSL_ia32cap_P[1] & (1 << (41 - 32))) != 0;
+ return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0;
}
#if defined(OPENSSL_X86_64)
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64.c b/src/crypto/fipsmodule/ec/p256-x86_64.c
index b3422149..ef1ccef7 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/src/crypto/fipsmodule/ec/p256-x86_64.c
@@ -581,7 +581,7 @@ static void ecp_nistz256_inv_mod_ord(const EC_GROUP *group, EC_SCALAR *out,
static int ecp_nistz256_mont_inv_mod_ord_vartime(const EC_GROUP *group,
EC_SCALAR *out,
const EC_SCALAR *in) {
- if ((OPENSSL_ia32cap_P[1] & (1 << 28)) == 0) {
+ if ((OPENSSL_ia32cap_get()[1] & (1 << 28)) == 0) {
// No AVX support; fallback to generic code.
return ec_GFp_simple_mont_inv_mod_ord_vartime(group, out, in);
}
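
This hunk, like the aes/internal.h and e_aes.c changes above, swaps direct reads of the OPENSSL_ia32cap_P global inside the FIPS module for the OPENSSL_ia32cap_get() accessor. The accessor's definition lives in crypto/internal.h and is not part of this diff; in a non-FIPS build it is presumably little more than a wrapper over the same array, along the lines of this assumed sketch (not the commit's code):

#include <openssl/cpu.h>
#include <stdint.h>

// Sketch only: the real definition is in crypto/internal.h and may route
// through a function in FIPS builds to avoid direct data references.
static inline const uint32_t *ia32cap_get_sketch(void) {
  return OPENSSL_ia32cap_P;
}
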
diff --git a/src/crypto/fipsmodule/sha/asm/sha512-x86_64.pl b/src/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
index 9d53ec47..49278506 100755
--- a/src/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
+++ b/src/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
@@ -108,6 +108,8 @@
# part, body_00_15; reducing the amount of SIMD instructions
# below certain limit makes no difference/sense; to conserve
# space SHA256 XOP code path is therefore omitted;
+#
+# Modified from upstream OpenSSL to remove the XOP code.
$flavour = shift;
$output = shift;
@@ -275,9 +277,7 @@ $code.=<<___ if ($SZ==4 && $shaext);
test \$`1<<29`,%r11d # check for SHA
jnz _shaext_shortcut
___
-$code.=<<___ if ($avx && $SZ==8);
- test \$`1<<11`,%r10d # check for XOP
- jnz .Lxop_shortcut
+ # XOP codepath removed.
___
$code.=<<___ if ($avx>1);
and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
@@ -1127,399 +1127,6 @@ ___
if ($avx) {{
######################################################################
-# XOP code path
-#
-if ($SZ==8) { # SHA512 only
-$code.=<<___;
-.type ${func}_xop,\@function,3
-.align 64
-${func}_xop:
-.cfi_startproc
-.Lxop_shortcut:
- mov %rsp,%rax # copy %rsp
-.cfi_def_cfa_register %rax
- push %rbx
-.cfi_push %rbx
- push %rbp
-.cfi_push %rbp
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
- shl \$4,%rdx # num*16
- sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
- lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
- and \$-64,%rsp # align stack frame
- mov $ctx,$_ctx # save ctx, 1st arg
- mov $inp,$_inp # save inp, 2nd arg
- mov %rdx,$_end # save end pointer, "3rd" arg
- mov %rax,$_rsp # save copy of %rsp
-.cfi_cfa_expression $_rsp,deref,+8
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,16*$SZ+32(%rsp)
- movaps %xmm7,16*$SZ+48(%rsp)
- movaps %xmm8,16*$SZ+64(%rsp)
- movaps %xmm9,16*$SZ+80(%rsp)
-___
-$code.=<<___ if ($win64 && $SZ>4);
- movaps %xmm10,16*$SZ+96(%rsp)
- movaps %xmm11,16*$SZ+112(%rsp)
-___
-$code.=<<___;
-.Lprologue_xop:
-
- vzeroupper
- mov $SZ*0($ctx),$A
- mov $SZ*1($ctx),$B
- mov $SZ*2($ctx),$C
- mov $SZ*3($ctx),$D
- mov $SZ*4($ctx),$E
- mov $SZ*5($ctx),$F
- mov $SZ*6($ctx),$G
- mov $SZ*7($ctx),$H
- jmp .Lloop_xop
-___
- if ($SZ==4) { # SHA256
- my @X = map("%xmm$_",(0..3));
- my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
-
-$code.=<<___;
-.align 16
-.Lloop_xop:
- vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
- vmovdqu 0x00($inp),@X[0]
- vmovdqu 0x10($inp),@X[1]
- vmovdqu 0x20($inp),@X[2]
- vmovdqu 0x30($inp),@X[3]
- vpshufb $t3,@X[0],@X[0]
- lea $TABLE(%rip),$Tbl
- vpshufb $t3,@X[1],@X[1]
- vpshufb $t3,@X[2],@X[2]
- vpaddd 0x00($Tbl),@X[0],$t0
- vpshufb $t3,@X[3],@X[3]
- vpaddd 0x20($Tbl),@X[1],$t1
- vpaddd 0x40($Tbl),@X[2],$t2
- vpaddd 0x60($Tbl),@X[3],$t3
- vmovdqa $t0,0x00(%rsp)
- mov $A,$a1
- vmovdqa $t1,0x10(%rsp)
- mov $B,$a3
- vmovdqa $t2,0x20(%rsp)
- xor $C,$a3 # magic
- vmovdqa $t3,0x30(%rsp)
- mov $E,$a0
- jmp .Lxop_00_47
-
-.align 16
-.Lxop_00_47:
- sub \$`-16*2*$SZ`,$Tbl # size optimization
-___
-sub XOP_256_00_47 () {
-my $j = shift;
-my $body = shift;
-my @X = @_;
-my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
-
- &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
- eval(shift(@insns));
- eval(shift(@insns));
- &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrld ($t0,$t0,$sigma0[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t0,$t0,$t1);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrld ($t2,@X[3],$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t2);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrldq ($t3,$t3,8);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrld ($t2,@X[0],$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t2);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpslldq ($t3,$t3,8); # 22 instructions
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
- foreach (@insns) { eval; } # remaining instructions
- &vmovdqa (16*$j."(%rsp)",$t2);
-}
-
- for ($i=0,$j=0; $j<4; $j++) {
- &XOP_256_00_47($j,\&body_00_15,@X);
- push(@X,shift(@X)); # rotate(@X)
- }
- &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
- &jne (".Lxop_00_47");
-
- for ($i=0; $i<16; ) {
- foreach(body_00_15()) { eval; }
- }
-
- } else { # SHA512
- my @X = map("%xmm$_",(0..7));
- my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
-
-$code.=<<___;
-.align 16
-.Lloop_xop:
- vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
- vmovdqu 0x00($inp),@X[0]
- lea $TABLE+0x80(%rip),$Tbl # size optimization
- vmovdqu 0x10($inp),@X[1]
- vmovdqu 0x20($inp),@X[2]
- vpshufb $t3,@X[0],@X[0]
- vmovdqu 0x30($inp),@X[3]
- vpshufb $t3,@X[1],@X[1]
- vmovdqu 0x40($inp),@X[4]
- vpshufb $t3,@X[2],@X[2]
- vmovdqu 0x50($inp),@X[5]
- vpshufb $t3,@X[3],@X[3]
- vmovdqu 0x60($inp),@X[6]
- vpshufb $t3,@X[4],@X[4]
- vmovdqu 0x70($inp),@X[7]
- vpshufb $t3,@X[5],@X[5]
- vpaddq -0x80($Tbl),@X[0],$t0
- vpshufb $t3,@X[6],@X[6]
- vpaddq -0x60($Tbl),@X[1],$t1
- vpshufb $t3,@X[7],@X[7]
- vpaddq -0x40($Tbl),@X[2],$t2
- vpaddq -0x20($Tbl),@X[3],$t3
- vmovdqa $t0,0x00(%rsp)
- vpaddq 0x00($Tbl),@X[4],$t0
- vmovdqa $t1,0x10(%rsp)
- vpaddq 0x20($Tbl),@X[5],$t1
- vmovdqa $t2,0x20(%rsp)
- vpaddq 0x40($Tbl),@X[6],$t2
- vmovdqa $t3,0x30(%rsp)
- vpaddq 0x60($Tbl),@X[7],$t3
- vmovdqa $t0,0x40(%rsp)
- mov $A,$a1
- vmovdqa $t1,0x50(%rsp)
- mov $B,$a3
- vmovdqa $t2,0x60(%rsp)
- xor $C,$a3 # magic
- vmovdqa $t3,0x70(%rsp)
- mov $E,$a0
- jmp .Lxop_00_47
-
-.align 16
-.Lxop_00_47:
- add \$`16*2*$SZ`,$Tbl
-___
-sub XOP_512_00_47 () {
-my $j = shift;
-my $body = shift;
-my @X = @_;
-my @insns = (&$body,&$body); # 52 instructions
-
- &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
- eval(shift(@insns));
- eval(shift(@insns));
- &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrlq ($t0,$t0,$sigma0[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t0,$t0,$t1);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
- eval(shift(@insns));
- eval(shift(@insns));
- &vpsrlq ($t2,@X[7],$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
- eval(shift(@insns));
- eval(shift(@insns));
- &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t2);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
- foreach (@insns) { eval; } # remaining instructions
- &vmovdqa (16*$j."(%rsp)",$t2);
-}
-
- for ($i=0,$j=0; $j<8; $j++) {
- &XOP_512_00_47($j,\&body_00_15,@X);
- push(@X,shift(@X)); # rotate(@X)
- }
- &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
- &jne (".Lxop_00_47");
-
- for ($i=0; $i<16; ) {
- foreach(body_00_15()) { eval; }
- }
-}
-$code.=<<___;
- mov $_ctx,$ctx
- mov $a1,$A
-
- add $SZ*0($ctx),$A
- lea 16*$SZ($inp),$inp
- add $SZ*1($ctx),$B
- add $SZ*2($ctx),$C
- add $SZ*3($ctx),$D
- add $SZ*4($ctx),$E
- add $SZ*5($ctx),$F
- add $SZ*6($ctx),$G
- add $SZ*7($ctx),$H
-
- cmp $_end,$inp
-
- mov $A,$SZ*0($ctx)
- mov $B,$SZ*1($ctx)
- mov $C,$SZ*2($ctx)
- mov $D,$SZ*3($ctx)
- mov $E,$SZ*4($ctx)
- mov $F,$SZ*5($ctx)
- mov $G,$SZ*6($ctx)
- mov $H,$SZ*7($ctx)
- jb .Lloop_xop
-
- mov $_rsp,%rsi
-.cfi_def_cfa %rsi,8
- vzeroupper
-___
-$code.=<<___ if ($win64);
- movaps 16*$SZ+32(%rsp),%xmm6
- movaps 16*$SZ+48(%rsp),%xmm7
- movaps 16*$SZ+64(%rsp),%xmm8
- movaps 16*$SZ+80(%rsp),%xmm9
-___
-$code.=<<___ if ($win64 && $SZ>4);
- movaps 16*$SZ+96(%rsp),%xmm10
- movaps 16*$SZ+112(%rsp),%xmm11
-___
-$code.=<<___;
- mov -48(%rsi),%r15
-.cfi_restore %r15
- mov -40(%rsi),%r14
-.cfi_restore %r14
- mov -32(%rsi),%r13
-.cfi_restore %r13
- mov -24(%rsi),%r12
-.cfi_restore %r12
- mov -16(%rsi),%rbp
-.cfi_restore %rbp
- mov -8(%rsi),%rbx
-.cfi_restore %rbx
- lea (%rsi),%rsp
-.cfi_def_cfa_register %rsp
-.Lepilogue_xop:
- ret
-.cfi_endproc
-.size ${func}_xop,.-${func}_xop
-___
-}
-######################################################################
# AVX+shrd code path
#
local *ror = sub { &shrd(@_[0],@_) };
@@ -2409,11 +2016,6 @@ $code.=<<___ if ($SZ==4);
.rva .LSEH_end_${func}_ssse3
.rva .LSEH_info_${func}_ssse3
___
-$code.=<<___ if ($avx && $SZ==8);
- .rva .LSEH_begin_${func}_xop
- .rva .LSEH_end_${func}_xop
- .rva .LSEH_info_${func}_xop
-___
$code.=<<___ if ($avx);
.rva .LSEH_begin_${func}_avx
.rva .LSEH_end_${func}_avx
@@ -2443,12 +2045,6 @@ $code.=<<___ if ($SZ==4);
.rva se_handler
.rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
___
-$code.=<<___ if ($avx && $SZ==8);
-.LSEH_info_${func}_xop:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
-___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
.byte 9,0,0,0
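
With the XOP path removed, sha512-x86_64.pl no longer emits the sha512_block_data_order_xop body, its .Lxop_shortcut dispatch test, or the matching Win64 SEH unwind entries; the remaining code paths (plain, AVX and, where built, AVX2) are untouched. The change is invisible to callers, since the block function is chosen at runtime behind the one-shot API, as in this illustrative sketch (not part of the commit):

#include <openssl/sha.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  static const char kMsg[] = "abc";
  uint8_t digest[SHA512_DIGEST_LENGTH];
  // SHA512() runs sha512_block_data_order internally; which variant executes
  // is selected at runtime from the cached CPU capability bits.
  SHA512((const uint8_t *)kMsg, strlen(kMsg), digest);
  for (size_t i = 0; i < sizeof(digest); i++) {
    printf("%02x", digest[i]);
  }
  printf("\n");
  return 0;
}
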
diff --git a/src/crypto/hrss/asm/poly_rq_mul.S b/src/crypto/hrss/asm/poly_rq_mul.S
new file mode 100644
index 00000000..0ad0fb51
--- /dev/null
+++ b/src/crypto/hrss/asm/poly_rq_mul.S
@@ -0,0 +1,8457 @@
+// Copyright (c) 2017, the HRSS authors.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && defined(__linux__)
+
+// This is the polynomial multiplication function from [HRSS], provided by kind
+// permission of the authors.
+//
+// HRSS: https://eprint.iacr.org/2017/1005
+
+# This file was generated by poly_rq_mul.py
+.text
+.align 32
+mask_low9words:
+.word 0xffff
+.word 0xffff
+.word 0xffff
+.word 0xffff
+.word 0xffff
+.word 0xffff
+.word 0xffff
+.word 0xffff
+.word 0xffff
+.word 0x0
+.word 0x0
+.word 0x0
+.word 0x0
+.word 0x0
+.word 0x0
+.word 0x0
+const3:
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+const9:
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+const0:
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+const729:
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+const3_inv:
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+const5_inv:
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+shuf48_16:
+.byte 10
+.byte 11
+.byte 12
+.byte 13
+.byte 14
+.byte 15
+.byte 0
+.byte 1
+.byte 2
+.byte 3
+.byte 4
+.byte 5
+.byte 6
+.byte 7
+.byte 8
+.byte 9
+.byte 10
+.byte 11
+.byte 12
+.byte 13
+.byte 14
+.byte 15
+.byte 0
+.byte 1
+.byte 2
+.byte 3
+.byte 4
+.byte 5
+.byte 6
+.byte 7
+.byte 8
+.byte 9
+shufmin1_mask3:
+.byte 2
+.byte 3
+.byte 4
+.byte 5
+.byte 6
+.byte 7
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+mask32_to_16:
+.word 0xffff
+.word 0x0
+.word 0xffff
+.word 0x0
+.word 0xffff
+.word 0x0
+.word 0xffff
+.word 0x0
+.word 0xffff
+.word 0x0
+.word 0xffff
+.word 0x0
+.word 0xffff
+.word 0x0
+.word 0xffff
+.word 0x0
+mask5_3_5_3:
+.word 0
+.word 0
+.word 0
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+.word 0
+.word 0
+.word 0
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+mask3_5_3_5:
+.word 65535
+.word 65535
+.word 65535
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 65535
+.word 65535
+.word 65535
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+mask3_5_4_3_1:
+.word 65535
+.word 65535
+.word 65535
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 65535
+.word 65535
+.word 65535
+.word 0
+mask_keephigh:
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+mask_mod8192:
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.text
+.global poly_Rq_mul
+.hidden poly_Rq_mul
+.att_syntax prefix
+poly_Rq_mul:
+.cfi_startproc
+push %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset rbp, -16
+movq %rsp, %rbp
+.cfi_def_cfa_register rbp
+push %r12
+.cfi_offset r12, -24
+mov %rsp, %r8
+andq $-32, %rsp
+subq $6144, %rsp
+mov %rsp, %rax
+subq $6144, %rsp
+mov %rsp, %r11
+subq $12288, %rsp
+mov %rsp, %r12
+subq $512, %rsp
+vmovdqa const3(%rip), %ymm3
+vmovdqu 0(%rsi), %ymm0
+vmovdqu 88(%rsi), %ymm1
+vmovdqu 176(%rsi), %ymm2
+vmovdqu 264(%rsi), %ymm12
+vmovdqu 1056(%rsi), %ymm4
+vmovdqu 1144(%rsi), %ymm5
+vmovdqu 1232(%rsi), %ymm6
+vmovdqu 1320(%rsi), %ymm7
+vmovdqu 352(%rsi), %ymm8
+vmovdqu 440(%rsi), %ymm9
+vmovdqu 528(%rsi), %ymm10
+vmovdqu 616(%rsi), %ymm11
+vmovdqa %ymm0, 0(%rax)
+vmovdqa %ymm1, 96(%rax)
+vpaddw %ymm0, %ymm1, %ymm14
+vmovdqa %ymm14, 192(%rax)
+vmovdqa %ymm2, 288(%rax)
+vmovdqa %ymm12, 384(%rax)
+vpaddw %ymm2, %ymm12, %ymm14
+vmovdqa %ymm14, 480(%rax)
+vpaddw %ymm0, %ymm2, %ymm14
+vmovdqa %ymm14, 576(%rax)
+vpaddw %ymm1, %ymm12, %ymm15
+vmovdqa %ymm15, 672(%rax)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 768(%rax)
+vmovdqa %ymm4, 5184(%rax)
+vmovdqa %ymm5, 5280(%rax)
+vpaddw %ymm4, %ymm5, %ymm14
+vmovdqa %ymm14, 5376(%rax)
+vmovdqa %ymm6, 5472(%rax)
+vmovdqa %ymm7, 5568(%rax)
+vpaddw %ymm6, %ymm7, %ymm14
+vmovdqa %ymm14, 5664(%rax)
+vpaddw %ymm4, %ymm6, %ymm14
+vmovdqa %ymm14, 5760(%rax)
+vpaddw %ymm5, %ymm7, %ymm15
+vmovdqa %ymm15, 5856(%rax)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 5952(%rax)
+vmovdqa %ymm0, 0(%rsp)
+vmovdqa %ymm1, 32(%rsp)
+vmovdqa %ymm2, 64(%rsp)
+vmovdqa %ymm12, 96(%rsp)
+vmovdqa %ymm8, 128(%rsp)
+vmovdqa %ymm9, 160(%rsp)
+vmovdqa %ymm10, 192(%rsp)
+vmovdqa %ymm11, 224(%rsp)
+vmovdqu 704(%rsi), %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm1
+vpaddw 128(%rsp), %ymm4, %ymm2
+vpaddw %ymm2, %ymm1, %ymm8
+vpsubw %ymm2, %ymm1, %ymm12
+vmovdqa %ymm0, 256(%rsp)
+vmovdqu 792(%rsi), %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm1
+vpaddw 160(%rsp), %ymm5, %ymm2
+vpaddw %ymm2, %ymm1, %ymm9
+vpsubw %ymm2, %ymm1, %ymm13
+vmovdqa %ymm0, 288(%rsp)
+vmovdqu 880(%rsi), %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm1
+vpaddw 192(%rsp), %ymm6, %ymm2
+vpaddw %ymm2, %ymm1, %ymm10
+vpsubw %ymm2, %ymm1, %ymm14
+vmovdqa %ymm0, 320(%rsp)
+vmovdqu 968(%rsi), %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm1
+vpaddw 224(%rsp), %ymm7, %ymm2
+vpaddw %ymm2, %ymm1, %ymm11
+vpsubw %ymm2, %ymm1, %ymm15
+vmovdqa %ymm0, 352(%rsp)
+vmovdqa %ymm8, 864(%rax)
+vmovdqa %ymm9, 960(%rax)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 1056(%rax)
+vmovdqa %ymm10, 1152(%rax)
+vmovdqa %ymm11, 1248(%rax)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 1344(%rax)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 1440(%rax)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 1536(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 1632(%rax)
+vmovdqa %ymm12, 1728(%rax)
+vmovdqa %ymm13, 1824(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 1920(%rax)
+vmovdqa %ymm14, 2016(%rax)
+vmovdqa %ymm15, 2112(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 2208(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 2304(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 2400(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 2496(%rax)
+vmovdqa 256(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm4, %ymm1
+vpaddw 128(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm8
+vpsubw %ymm1, %ymm0, %ymm12
+vmovdqa 288(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm5, %ymm1
+vpaddw 160(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm9
+vpsubw %ymm1, %ymm0, %ymm13
+vmovdqa 320(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm6, %ymm1
+vpaddw 192(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm10
+vpsubw %ymm1, %ymm0, %ymm14
+vmovdqa 352(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm7, %ymm1
+vpaddw 224(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm11
+vpsubw %ymm1, %ymm0, %ymm15
+vmovdqa %ymm8, 2592(%rax)
+vmovdqa %ymm9, 2688(%rax)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 2784(%rax)
+vmovdqa %ymm10, 2880(%rax)
+vmovdqa %ymm11, 2976(%rax)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 3072(%rax)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 3168(%rax)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 3264(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 3360(%rax)
+vmovdqa %ymm12, 3456(%rax)
+vmovdqa %ymm13, 3552(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 3648(%rax)
+vmovdqa %ymm14, 3744(%rax)
+vmovdqa %ymm15, 3840(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 3936(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4032(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 4128(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 4224(%rax)
+vpmullw %ymm3, %ymm4, %ymm0
+vpaddw 256(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 128(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm12
+vpmullw %ymm3, %ymm5, %ymm0
+vpaddw 288(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 160(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm13
+vpmullw %ymm3, %ymm6, %ymm0
+vpaddw 320(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 192(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm14
+vpmullw %ymm3, %ymm7, %ymm0
+vpaddw 352(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 224(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm15
+vmovdqa %ymm12, 4320(%rax)
+vmovdqa %ymm13, 4416(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 4512(%rax)
+vmovdqa %ymm14, 4608(%rax)
+vmovdqa %ymm15, 4704(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 4800(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4896(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 4992(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 5088(%rax)
+vmovdqu 32(%rsi), %ymm0
+vmovdqu 120(%rsi), %ymm1
+vmovdqu 208(%rsi), %ymm2
+vmovdqu 296(%rsi), %ymm12
+vmovdqu 1088(%rsi), %ymm4
+vmovdqu 1176(%rsi), %ymm5
+vmovdqu 1264(%rsi), %ymm6
+vmovdqu 1352(%rsi), %ymm7
+vmovdqu 384(%rsi), %ymm8
+vmovdqu 472(%rsi), %ymm9
+vmovdqu 560(%rsi), %ymm10
+vmovdqu 648(%rsi), %ymm11
+vmovdqa %ymm0, 32(%rax)
+vmovdqa %ymm1, 128(%rax)
+vpaddw %ymm0, %ymm1, %ymm14
+vmovdqa %ymm14, 224(%rax)
+vmovdqa %ymm2, 320(%rax)
+vmovdqa %ymm12, 416(%rax)
+vpaddw %ymm2, %ymm12, %ymm14
+vmovdqa %ymm14, 512(%rax)
+vpaddw %ymm0, %ymm2, %ymm14
+vmovdqa %ymm14, 608(%rax)
+vpaddw %ymm1, %ymm12, %ymm15
+vmovdqa %ymm15, 704(%rax)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 800(%rax)
+vmovdqa %ymm4, 5216(%rax)
+vmovdqa %ymm5, 5312(%rax)
+vpaddw %ymm4, %ymm5, %ymm14
+vmovdqa %ymm14, 5408(%rax)
+vmovdqa %ymm6, 5504(%rax)
+vmovdqa %ymm7, 5600(%rax)
+vpaddw %ymm6, %ymm7, %ymm14
+vmovdqa %ymm14, 5696(%rax)
+vpaddw %ymm4, %ymm6, %ymm14
+vmovdqa %ymm14, 5792(%rax)
+vpaddw %ymm5, %ymm7, %ymm15
+vmovdqa %ymm15, 5888(%rax)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 5984(%rax)
+vmovdqa %ymm0, 0(%rsp)
+vmovdqa %ymm1, 32(%rsp)
+vmovdqa %ymm2, 64(%rsp)
+vmovdqa %ymm12, 96(%rsp)
+vmovdqa %ymm8, 128(%rsp)
+vmovdqa %ymm9, 160(%rsp)
+vmovdqa %ymm10, 192(%rsp)
+vmovdqa %ymm11, 224(%rsp)
+vmovdqu 736(%rsi), %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm1
+vpaddw 128(%rsp), %ymm4, %ymm2
+vpaddw %ymm2, %ymm1, %ymm8
+vpsubw %ymm2, %ymm1, %ymm12
+vmovdqa %ymm0, 256(%rsp)
+vmovdqu 824(%rsi), %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm1
+vpaddw 160(%rsp), %ymm5, %ymm2
+vpaddw %ymm2, %ymm1, %ymm9
+vpsubw %ymm2, %ymm1, %ymm13
+vmovdqa %ymm0, 288(%rsp)
+vmovdqu 912(%rsi), %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm1
+vpaddw 192(%rsp), %ymm6, %ymm2
+vpaddw %ymm2, %ymm1, %ymm10
+vpsubw %ymm2, %ymm1, %ymm14
+vmovdqa %ymm0, 320(%rsp)
+vmovdqu 1000(%rsi), %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm1
+vpaddw 224(%rsp), %ymm7, %ymm2
+vpaddw %ymm2, %ymm1, %ymm11
+vpsubw %ymm2, %ymm1, %ymm15
+vmovdqa %ymm0, 352(%rsp)
+vmovdqa %ymm8, 896(%rax)
+vmovdqa %ymm9, 992(%rax)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 1088(%rax)
+vmovdqa %ymm10, 1184(%rax)
+vmovdqa %ymm11, 1280(%rax)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 1376(%rax)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 1472(%rax)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 1568(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 1664(%rax)
+vmovdqa %ymm12, 1760(%rax)
+vmovdqa %ymm13, 1856(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 1952(%rax)
+vmovdqa %ymm14, 2048(%rax)
+vmovdqa %ymm15, 2144(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 2240(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 2336(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 2432(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 2528(%rax)
+vmovdqa 256(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm4, %ymm1
+vpaddw 128(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm8
+vpsubw %ymm1, %ymm0, %ymm12
+vmovdqa 288(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm5, %ymm1
+vpaddw 160(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm9
+vpsubw %ymm1, %ymm0, %ymm13
+vmovdqa 320(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm6, %ymm1
+vpaddw 192(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm10
+vpsubw %ymm1, %ymm0, %ymm14
+vmovdqa 352(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm7, %ymm1
+vpaddw 224(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm11
+vpsubw %ymm1, %ymm0, %ymm15
+vmovdqa %ymm8, 2624(%rax)
+vmovdqa %ymm9, 2720(%rax)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 2816(%rax)
+vmovdqa %ymm10, 2912(%rax)
+vmovdqa %ymm11, 3008(%rax)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 3104(%rax)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 3200(%rax)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 3296(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 3392(%rax)
+vmovdqa %ymm12, 3488(%rax)
+vmovdqa %ymm13, 3584(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 3680(%rax)
+vmovdqa %ymm14, 3776(%rax)
+vmovdqa %ymm15, 3872(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 3968(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4064(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 4160(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 4256(%rax)
+vpmullw %ymm3, %ymm4, %ymm0
+vpaddw 256(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 128(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm12
+vpmullw %ymm3, %ymm5, %ymm0
+vpaddw 288(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 160(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm13
+vpmullw %ymm3, %ymm6, %ymm0
+vpaddw 320(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 192(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm14
+vpmullw %ymm3, %ymm7, %ymm0
+vpaddw 352(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 224(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm15
+vmovdqa %ymm12, 4352(%rax)
+vmovdqa %ymm13, 4448(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 4544(%rax)
+vmovdqa %ymm14, 4640(%rax)
+vmovdqa %ymm15, 4736(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 4832(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4928(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 5024(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 5120(%rax)
+vmovdqu 64(%rsi), %ymm0
+vmovdqu 152(%rsi), %ymm1
+vmovdqu 240(%rsi), %ymm2
+vmovdqu 328(%rsi), %ymm12
+vmovdqu 1120(%rsi), %ymm4
+vmovdqu 1208(%rsi), %ymm5
+vmovdqu 1296(%rsi), %ymm6
+vmovdqu 1384(%rsi), %ymm7
+vpand mask_low9words(%rip), %ymm7, %ymm7
+vmovdqu 416(%rsi), %ymm8
+vmovdqu 504(%rsi), %ymm9
+vmovdqu 592(%rsi), %ymm10
+vmovdqu 680(%rsi), %ymm11
+vmovdqa %ymm0, 64(%rax)
+vmovdqa %ymm1, 160(%rax)
+vpaddw %ymm0, %ymm1, %ymm14
+vmovdqa %ymm14, 256(%rax)
+vmovdqa %ymm2, 352(%rax)
+vmovdqa %ymm12, 448(%rax)
+vpaddw %ymm2, %ymm12, %ymm14
+vmovdqa %ymm14, 544(%rax)
+vpaddw %ymm0, %ymm2, %ymm14
+vmovdqa %ymm14, 640(%rax)
+vpaddw %ymm1, %ymm12, %ymm15
+vmovdqa %ymm15, 736(%rax)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 832(%rax)
+vmovdqa %ymm4, 5248(%rax)
+vmovdqa %ymm5, 5344(%rax)
+vpaddw %ymm4, %ymm5, %ymm14
+vmovdqa %ymm14, 5440(%rax)
+vmovdqa %ymm6, 5536(%rax)
+vmovdqa %ymm7, 5632(%rax)
+vpaddw %ymm6, %ymm7, %ymm14
+vmovdqa %ymm14, 5728(%rax)
+vpaddw %ymm4, %ymm6, %ymm14
+vmovdqa %ymm14, 5824(%rax)
+vpaddw %ymm5, %ymm7, %ymm15
+vmovdqa %ymm15, 5920(%rax)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 6016(%rax)
+vmovdqa %ymm0, 0(%rsp)
+vmovdqa %ymm1, 32(%rsp)
+vmovdqa %ymm2, 64(%rsp)
+vmovdqa %ymm12, 96(%rsp)
+vmovdqa %ymm8, 128(%rsp)
+vmovdqa %ymm9, 160(%rsp)
+vmovdqa %ymm10, 192(%rsp)
+vmovdqa %ymm11, 224(%rsp)
+vmovdqu 768(%rsi), %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm1
+vpaddw 128(%rsp), %ymm4, %ymm2
+vpaddw %ymm2, %ymm1, %ymm8
+vpsubw %ymm2, %ymm1, %ymm12
+vmovdqa %ymm0, 256(%rsp)
+vmovdqu 856(%rsi), %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm1
+vpaddw 160(%rsp), %ymm5, %ymm2
+vpaddw %ymm2, %ymm1, %ymm9
+vpsubw %ymm2, %ymm1, %ymm13
+vmovdqa %ymm0, 288(%rsp)
+vmovdqu 944(%rsi), %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm1
+vpaddw 192(%rsp), %ymm6, %ymm2
+vpaddw %ymm2, %ymm1, %ymm10
+vpsubw %ymm2, %ymm1, %ymm14
+vmovdqa %ymm0, 320(%rsp)
+vmovdqu 1032(%rsi), %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm1
+vpaddw 224(%rsp), %ymm7, %ymm2
+vpaddw %ymm2, %ymm1, %ymm11
+vpsubw %ymm2, %ymm1, %ymm15
+vmovdqa %ymm0, 352(%rsp)
+vmovdqa %ymm8, 928(%rax)
+vmovdqa %ymm9, 1024(%rax)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 1120(%rax)
+vmovdqa %ymm10, 1216(%rax)
+vmovdqa %ymm11, 1312(%rax)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 1408(%rax)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 1504(%rax)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 1600(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 1696(%rax)
+vmovdqa %ymm12, 1792(%rax)
+vmovdqa %ymm13, 1888(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 1984(%rax)
+vmovdqa %ymm14, 2080(%rax)
+vmovdqa %ymm15, 2176(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 2272(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 2368(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 2464(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 2560(%rax)
+vmovdqa 256(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm4, %ymm1
+vpaddw 128(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm8
+vpsubw %ymm1, %ymm0, %ymm12
+vmovdqa 288(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm5, %ymm1
+vpaddw 160(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm9
+vpsubw %ymm1, %ymm0, %ymm13
+vmovdqa 320(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm6, %ymm1
+vpaddw 192(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm10
+vpsubw %ymm1, %ymm0, %ymm14
+vmovdqa 352(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm7, %ymm1
+vpaddw 224(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm11
+vpsubw %ymm1, %ymm0, %ymm15
+vmovdqa %ymm8, 2656(%rax)
+vmovdqa %ymm9, 2752(%rax)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 2848(%rax)
+vmovdqa %ymm10, 2944(%rax)
+vmovdqa %ymm11, 3040(%rax)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 3136(%rax)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 3232(%rax)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 3328(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 3424(%rax)
+vmovdqa %ymm12, 3520(%rax)
+vmovdqa %ymm13, 3616(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 3712(%rax)
+vmovdqa %ymm14, 3808(%rax)
+vmovdqa %ymm15, 3904(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 4000(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4096(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 4192(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 4288(%rax)
+vpmullw %ymm3, %ymm4, %ymm0
+vpaddw 256(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 128(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm12
+vpmullw %ymm3, %ymm5, %ymm0
+vpaddw 288(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 160(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm13
+vpmullw %ymm3, %ymm6, %ymm0
+vpaddw 320(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 192(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm14
+vpmullw %ymm3, %ymm7, %ymm0
+vpaddw 352(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 224(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm15
+vmovdqa %ymm12, 4384(%rax)
+vmovdqa %ymm13, 4480(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 4576(%rax)
+vmovdqa %ymm14, 4672(%rax)
+vmovdqa %ymm15, 4768(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 4864(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4960(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 5056(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 5152(%rax)
+vmovdqu 0(%rdx), %ymm0
+vmovdqu 88(%rdx), %ymm1
+vmovdqu 176(%rdx), %ymm2
+vmovdqu 264(%rdx), %ymm12
+vmovdqu 1056(%rdx), %ymm4
+vmovdqu 1144(%rdx), %ymm5
+vmovdqu 1232(%rdx), %ymm6
+vmovdqu 1320(%rdx), %ymm7
+vmovdqu 352(%rdx), %ymm8
+vmovdqu 440(%rdx), %ymm9
+vmovdqu 528(%rdx), %ymm10
+vmovdqu 616(%rdx), %ymm11
+vmovdqa %ymm0, 0(%r11)
+vmovdqa %ymm1, 96(%r11)
+vpaddw %ymm0, %ymm1, %ymm14
+vmovdqa %ymm14, 192(%r11)
+vmovdqa %ymm2, 288(%r11)
+vmovdqa %ymm12, 384(%r11)
+vpaddw %ymm2, %ymm12, %ymm14
+vmovdqa %ymm14, 480(%r11)
+vpaddw %ymm0, %ymm2, %ymm14
+vmovdqa %ymm14, 576(%r11)
+vpaddw %ymm1, %ymm12, %ymm15
+vmovdqa %ymm15, 672(%r11)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 768(%r11)
+vmovdqa %ymm4, 5184(%r11)
+vmovdqa %ymm5, 5280(%r11)
+vpaddw %ymm4, %ymm5, %ymm14
+vmovdqa %ymm14, 5376(%r11)
+vmovdqa %ymm6, 5472(%r11)
+vmovdqa %ymm7, 5568(%r11)
+vpaddw %ymm6, %ymm7, %ymm14
+vmovdqa %ymm14, 5664(%r11)
+vpaddw %ymm4, %ymm6, %ymm14
+vmovdqa %ymm14, 5760(%r11)
+vpaddw %ymm5, %ymm7, %ymm15
+vmovdqa %ymm15, 5856(%r11)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 5952(%r11)
+vmovdqa %ymm0, 0(%rsp)
+vmovdqa %ymm1, 32(%rsp)
+vmovdqa %ymm2, 64(%rsp)
+vmovdqa %ymm12, 96(%rsp)
+vmovdqa %ymm8, 128(%rsp)
+vmovdqa %ymm9, 160(%rsp)
+vmovdqa %ymm10, 192(%rsp)
+vmovdqa %ymm11, 224(%rsp)
+vmovdqu 704(%rdx), %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm1
+vpaddw 128(%rsp), %ymm4, %ymm2
+vpaddw %ymm2, %ymm1, %ymm8
+vpsubw %ymm2, %ymm1, %ymm12
+vmovdqa %ymm0, 256(%rsp)
+vmovdqu 792(%rdx), %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm1
+vpaddw 160(%rsp), %ymm5, %ymm2
+vpaddw %ymm2, %ymm1, %ymm9
+vpsubw %ymm2, %ymm1, %ymm13
+vmovdqa %ymm0, 288(%rsp)
+vmovdqu 880(%rdx), %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm1
+vpaddw 192(%rsp), %ymm6, %ymm2
+vpaddw %ymm2, %ymm1, %ymm10
+vpsubw %ymm2, %ymm1, %ymm14
+vmovdqa %ymm0, 320(%rsp)
+vmovdqu 968(%rdx), %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm1
+vpaddw 224(%rsp), %ymm7, %ymm2
+vpaddw %ymm2, %ymm1, %ymm11
+vpsubw %ymm2, %ymm1, %ymm15
+vmovdqa %ymm0, 352(%rsp)
+vmovdqa %ymm8, 864(%r11)
+vmovdqa %ymm9, 960(%r11)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 1056(%r11)
+vmovdqa %ymm10, 1152(%r11)
+vmovdqa %ymm11, 1248(%r11)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 1344(%r11)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 1440(%r11)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 1536(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 1632(%r11)
+vmovdqa %ymm12, 1728(%r11)
+vmovdqa %ymm13, 1824(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 1920(%r11)
+vmovdqa %ymm14, 2016(%r11)
+vmovdqa %ymm15, 2112(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 2208(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 2304(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 2400(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 2496(%r11)
+vmovdqa 256(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm4, %ymm1
+vpaddw 128(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm8
+vpsubw %ymm1, %ymm0, %ymm12
+vmovdqa 288(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm5, %ymm1
+vpaddw 160(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm9
+vpsubw %ymm1, %ymm0, %ymm13
+vmovdqa 320(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm6, %ymm1
+vpaddw 192(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm10
+vpsubw %ymm1, %ymm0, %ymm14
+vmovdqa 352(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm7, %ymm1
+vpaddw 224(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm11
+vpsubw %ymm1, %ymm0, %ymm15
+vmovdqa %ymm8, 2592(%r11)
+vmovdqa %ymm9, 2688(%r11)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 2784(%r11)
+vmovdqa %ymm10, 2880(%r11)
+vmovdqa %ymm11, 2976(%r11)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 3072(%r11)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 3168(%r11)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 3264(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 3360(%r11)
+vmovdqa %ymm12, 3456(%r11)
+vmovdqa %ymm13, 3552(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 3648(%r11)
+vmovdqa %ymm14, 3744(%r11)
+vmovdqa %ymm15, 3840(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 3936(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4032(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 4128(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 4224(%r11)
+vpmullw %ymm3, %ymm4, %ymm0
+vpaddw 256(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 128(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm12
+vpmullw %ymm3, %ymm5, %ymm0
+vpaddw 288(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 160(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm13
+vpmullw %ymm3, %ymm6, %ymm0
+vpaddw 320(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 192(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm14
+vpmullw %ymm3, %ymm7, %ymm0
+vpaddw 352(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 224(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm15
+vmovdqa %ymm12, 4320(%r11)
+vmovdqa %ymm13, 4416(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 4512(%r11)
+vmovdqa %ymm14, 4608(%r11)
+vmovdqa %ymm15, 4704(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 4800(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4896(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 4992(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 5088(%r11)
+vmovdqu 32(%rdx), %ymm0
+vmovdqu 120(%rdx), %ymm1
+vmovdqu 208(%rdx), %ymm2
+vmovdqu 296(%rdx), %ymm12
+vmovdqu 1088(%rdx), %ymm4
+vmovdqu 1176(%rdx), %ymm5
+vmovdqu 1264(%rdx), %ymm6
+vmovdqu 1352(%rdx), %ymm7
+vmovdqu 384(%rdx), %ymm8
+vmovdqu 472(%rdx), %ymm9
+vmovdqu 560(%rdx), %ymm10
+vmovdqu 648(%rdx), %ymm11
+vmovdqa %ymm0, 32(%r11)
+vmovdqa %ymm1, 128(%r11)
+vpaddw %ymm0, %ymm1, %ymm14
+vmovdqa %ymm14, 224(%r11)
+vmovdqa %ymm2, 320(%r11)
+vmovdqa %ymm12, 416(%r11)
+vpaddw %ymm2, %ymm12, %ymm14
+vmovdqa %ymm14, 512(%r11)
+vpaddw %ymm0, %ymm2, %ymm14
+vmovdqa %ymm14, 608(%r11)
+vpaddw %ymm1, %ymm12, %ymm15
+vmovdqa %ymm15, 704(%r11)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 800(%r11)
+vmovdqa %ymm4, 5216(%r11)
+vmovdqa %ymm5, 5312(%r11)
+vpaddw %ymm4, %ymm5, %ymm14
+vmovdqa %ymm14, 5408(%r11)
+vmovdqa %ymm6, 5504(%r11)
+vmovdqa %ymm7, 5600(%r11)
+vpaddw %ymm6, %ymm7, %ymm14
+vmovdqa %ymm14, 5696(%r11)
+vpaddw %ymm4, %ymm6, %ymm14
+vmovdqa %ymm14, 5792(%r11)
+vpaddw %ymm5, %ymm7, %ymm15
+vmovdqa %ymm15, 5888(%r11)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 5984(%r11)
+vmovdqa %ymm0, 0(%rsp)
+vmovdqa %ymm1, 32(%rsp)
+vmovdqa %ymm2, 64(%rsp)
+vmovdqa %ymm12, 96(%rsp)
+vmovdqa %ymm8, 128(%rsp)
+vmovdqa %ymm9, 160(%rsp)
+vmovdqa %ymm10, 192(%rsp)
+vmovdqa %ymm11, 224(%rsp)
+vmovdqu 736(%rdx), %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm1
+vpaddw 128(%rsp), %ymm4, %ymm2
+vpaddw %ymm2, %ymm1, %ymm8
+vpsubw %ymm2, %ymm1, %ymm12
+vmovdqa %ymm0, 256(%rsp)
+vmovdqu 824(%rdx), %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm1
+vpaddw 160(%rsp), %ymm5, %ymm2
+vpaddw %ymm2, %ymm1, %ymm9
+vpsubw %ymm2, %ymm1, %ymm13
+vmovdqa %ymm0, 288(%rsp)
+vmovdqu 912(%rdx), %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm1
+vpaddw 192(%rsp), %ymm6, %ymm2
+vpaddw %ymm2, %ymm1, %ymm10
+vpsubw %ymm2, %ymm1, %ymm14
+vmovdqa %ymm0, 320(%rsp)
+vmovdqu 1000(%rdx), %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm1
+vpaddw 224(%rsp), %ymm7, %ymm2
+vpaddw %ymm2, %ymm1, %ymm11
+vpsubw %ymm2, %ymm1, %ymm15
+vmovdqa %ymm0, 352(%rsp)
+vmovdqa %ymm8, 896(%r11)
+vmovdqa %ymm9, 992(%r11)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 1088(%r11)
+vmovdqa %ymm10, 1184(%r11)
+vmovdqa %ymm11, 1280(%r11)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 1376(%r11)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 1472(%r11)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 1568(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 1664(%r11)
+vmovdqa %ymm12, 1760(%r11)
+vmovdqa %ymm13, 1856(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 1952(%r11)
+vmovdqa %ymm14, 2048(%r11)
+vmovdqa %ymm15, 2144(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 2240(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 2336(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 2432(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 2528(%r11)
+vmovdqa 256(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm4, %ymm1
+vpaddw 128(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm8
+vpsubw %ymm1, %ymm0, %ymm12
+vmovdqa 288(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm5, %ymm1
+vpaddw 160(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm9
+vpsubw %ymm1, %ymm0, %ymm13
+vmovdqa 320(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm6, %ymm1
+vpaddw 192(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm10
+vpsubw %ymm1, %ymm0, %ymm14
+vmovdqa 352(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm7, %ymm1
+vpaddw 224(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm11
+vpsubw %ymm1, %ymm0, %ymm15
+vmovdqa %ymm8, 2624(%r11)
+vmovdqa %ymm9, 2720(%r11)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 2816(%r11)
+vmovdqa %ymm10, 2912(%r11)
+vmovdqa %ymm11, 3008(%r11)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 3104(%r11)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 3200(%r11)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 3296(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 3392(%r11)
+vmovdqa %ymm12, 3488(%r11)
+vmovdqa %ymm13, 3584(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 3680(%r11)
+vmovdqa %ymm14, 3776(%r11)
+vmovdqa %ymm15, 3872(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 3968(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4064(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 4160(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 4256(%r11)
+vpmullw %ymm3, %ymm4, %ymm0
+vpaddw 256(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 128(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm12
+vpmullw %ymm3, %ymm5, %ymm0
+vpaddw 288(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 160(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm13
+vpmullw %ymm3, %ymm6, %ymm0
+vpaddw 320(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 192(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm14
+vpmullw %ymm3, %ymm7, %ymm0
+vpaddw 352(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 224(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm15
+vmovdqa %ymm12, 4352(%r11)
+vmovdqa %ymm13, 4448(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 4544(%r11)
+vmovdqa %ymm14, 4640(%r11)
+vmovdqa %ymm15, 4736(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 4832(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4928(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 5024(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 5120(%r11)
+vmovdqu 64(%rdx), %ymm0
+vmovdqu 152(%rdx), %ymm1
+vmovdqu 240(%rdx), %ymm2
+vmovdqu 328(%rdx), %ymm12
+vmovdqu 1120(%rdx), %ymm4
+vmovdqu 1208(%rdx), %ymm5
+vmovdqu 1296(%rdx), %ymm6
+vmovdqu 1384(%rdx), %ymm7
+vpand mask_low9words(%rip), %ymm7, %ymm7
+vmovdqu 416(%rdx), %ymm8
+vmovdqu 504(%rdx), %ymm9
+vmovdqu 592(%rdx), %ymm10
+vmovdqu 680(%rdx), %ymm11
+vmovdqa %ymm0, 64(%r11)
+vmovdqa %ymm1, 160(%r11)
+vpaddw %ymm0, %ymm1, %ymm14
+vmovdqa %ymm14, 256(%r11)
+vmovdqa %ymm2, 352(%r11)
+vmovdqa %ymm12, 448(%r11)
+vpaddw %ymm2, %ymm12, %ymm14
+vmovdqa %ymm14, 544(%r11)
+vpaddw %ymm0, %ymm2, %ymm14
+vmovdqa %ymm14, 640(%r11)
+vpaddw %ymm1, %ymm12, %ymm15
+vmovdqa %ymm15, 736(%r11)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 832(%r11)
+vmovdqa %ymm4, 5248(%r11)
+vmovdqa %ymm5, 5344(%r11)
+vpaddw %ymm4, %ymm5, %ymm14
+vmovdqa %ymm14, 5440(%r11)
+vmovdqa %ymm6, 5536(%r11)
+vmovdqa %ymm7, 5632(%r11)
+vpaddw %ymm6, %ymm7, %ymm14
+vmovdqa %ymm14, 5728(%r11)
+vpaddw %ymm4, %ymm6, %ymm14
+vmovdqa %ymm14, 5824(%r11)
+vpaddw %ymm5, %ymm7, %ymm15
+vmovdqa %ymm15, 5920(%r11)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 6016(%r11)
+vmovdqa %ymm0, 0(%rsp)
+vmovdqa %ymm1, 32(%rsp)
+vmovdqa %ymm2, 64(%rsp)
+vmovdqa %ymm12, 96(%rsp)
+vmovdqa %ymm8, 128(%rsp)
+vmovdqa %ymm9, 160(%rsp)
+vmovdqa %ymm10, 192(%rsp)
+vmovdqa %ymm11, 224(%rsp)
+vmovdqu 768(%rdx), %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm1
+vpaddw 128(%rsp), %ymm4, %ymm2
+vpaddw %ymm2, %ymm1, %ymm8
+vpsubw %ymm2, %ymm1, %ymm12
+vmovdqa %ymm0, 256(%rsp)
+vmovdqu 856(%rdx), %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm1
+vpaddw 160(%rsp), %ymm5, %ymm2
+vpaddw %ymm2, %ymm1, %ymm9
+vpsubw %ymm2, %ymm1, %ymm13
+vmovdqa %ymm0, 288(%rsp)
+vmovdqu 944(%rdx), %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm1
+vpaddw 192(%rsp), %ymm6, %ymm2
+vpaddw %ymm2, %ymm1, %ymm10
+vpsubw %ymm2, %ymm1, %ymm14
+vmovdqa %ymm0, 320(%rsp)
+vmovdqu 1032(%rdx), %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm1
+vpaddw 224(%rsp), %ymm7, %ymm2
+vpaddw %ymm2, %ymm1, %ymm11
+vpsubw %ymm2, %ymm1, %ymm15
+vmovdqa %ymm0, 352(%rsp)
+vmovdqa %ymm8, 928(%r11)
+vmovdqa %ymm9, 1024(%r11)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 1120(%r11)
+vmovdqa %ymm10, 1216(%r11)
+vmovdqa %ymm11, 1312(%r11)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 1408(%r11)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 1504(%r11)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 1600(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 1696(%r11)
+vmovdqa %ymm12, 1792(%r11)
+vmovdqa %ymm13, 1888(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 1984(%r11)
+vmovdqa %ymm14, 2080(%r11)
+vmovdqa %ymm15, 2176(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 2272(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 2368(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 2464(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 2560(%r11)
+vmovdqa 256(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm4, %ymm1
+vpaddw 128(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm8
+vpsubw %ymm1, %ymm0, %ymm12
+vmovdqa 288(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm5, %ymm1
+vpaddw 160(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm9
+vpsubw %ymm1, %ymm0, %ymm13
+vmovdqa 320(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm6, %ymm1
+vpaddw 192(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm10
+vpsubw %ymm1, %ymm0, %ymm14
+vmovdqa 352(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm7, %ymm1
+vpaddw 224(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm11
+vpsubw %ymm1, %ymm0, %ymm15
+vmovdqa %ymm8, 2656(%r11)
+vmovdqa %ymm9, 2752(%r11)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 2848(%r11)
+vmovdqa %ymm10, 2944(%r11)
+vmovdqa %ymm11, 3040(%r11)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 3136(%r11)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 3232(%r11)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 3328(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 3424(%r11)
+vmovdqa %ymm12, 3520(%r11)
+vmovdqa %ymm13, 3616(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 3712(%r11)
+vmovdqa %ymm14, 3808(%r11)
+vmovdqa %ymm15, 3904(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 4000(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4096(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 4192(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 4288(%r11)
+vpmullw %ymm3, %ymm4, %ymm0
+vpaddw 256(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 128(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm12
+vpmullw %ymm3, %ymm5, %ymm0
+vpaddw 288(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 160(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm13
+vpmullw %ymm3, %ymm6, %ymm0
+vpaddw 320(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 192(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm14
+vpmullw %ymm3, %ymm7, %ymm0
+vpaddw 352(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 224(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm15
+vmovdqa %ymm12, 4384(%r11)
+vmovdqa %ymm13, 4480(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 4576(%r11)
+vmovdqa %ymm14, 4672(%r11)
+vmovdqa %ymm15, 4768(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 4864(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4960(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 5056(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 5152(%r11)
+subq $9408, %rsp
+mov $4, %ecx
+karatsuba_loop_4eced63f144beffcb0247f9c6f67d165:
+mov %rsp, %r9
+mov %rsp, %r10
+subq $32, %rsp
+vmovdqa 0(%rax), %ymm0
+vmovdqa 192(%rax), %ymm1
+vmovdqa 384(%rax), %ymm2
+vmovdqa 576(%rax), %ymm3
+vpunpcklwd 96(%rax), %ymm0, %ymm4
+vpunpckhwd 96(%rax), %ymm0, %ymm5
+vpunpcklwd 288(%rax), %ymm1, %ymm6
+vpunpckhwd 288(%rax), %ymm1, %ymm7
+vpunpcklwd 480(%rax), %ymm2, %ymm8
+vpunpckhwd 480(%rax), %ymm2, %ymm9
+vpunpcklwd 672(%rax), %ymm3, %ymm10
+vpunpckhwd 672(%rax), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 768(%rax), %ymm0
+vmovdqa 960(%rax), %ymm1
+vmovdqa 1152(%rax), %ymm2
+vmovdqa 1344(%rax), %ymm3
+vpunpcklwd 864(%rax), %ymm0, %ymm12
+vpunpckhwd 864(%rax), %ymm0, %ymm13
+vpunpcklwd 1056(%rax), %ymm1, %ymm14
+vpunpckhwd 1056(%rax), %ymm1, %ymm15
+vpunpcklwd 1248(%rax), %ymm2, %ymm0
+vpunpckhwd 1248(%rax), %ymm2, %ymm1
+vpunpcklwd 1440(%rax), %ymm3, %ymm2
+vpunpckhwd 1440(%rax), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 0(%r9)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 32(%r9)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 64(%r9)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 96(%r9)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 128(%r9)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 160(%r9)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 192(%r9)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 256(%r9)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 288(%r9)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 320(%r9)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 352(%r9)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 384(%r9)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 416(%r9)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 448(%r9)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 224(%r9)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 480(%r9)
+vmovdqa 32(%rax), %ymm0
+vmovdqa 224(%rax), %ymm1
+vmovdqa 416(%rax), %ymm2
+vmovdqa 608(%rax), %ymm3
+vpunpcklwd 128(%rax), %ymm0, %ymm4
+vpunpckhwd 128(%rax), %ymm0, %ymm5
+vpunpcklwd 320(%rax), %ymm1, %ymm6
+vpunpckhwd 320(%rax), %ymm1, %ymm7
+vpunpcklwd 512(%rax), %ymm2, %ymm8
+vpunpckhwd 512(%rax), %ymm2, %ymm9
+vpunpcklwd 704(%rax), %ymm3, %ymm10
+vpunpckhwd 704(%rax), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 800(%rax), %ymm0
+vmovdqa 992(%rax), %ymm1
+vmovdqa 1184(%rax), %ymm2
+vmovdqa 1376(%rax), %ymm3
+vpunpcklwd 896(%rax), %ymm0, %ymm12
+vpunpckhwd 896(%rax), %ymm0, %ymm13
+vpunpcklwd 1088(%rax), %ymm1, %ymm14
+vpunpckhwd 1088(%rax), %ymm1, %ymm15
+vpunpcklwd 1280(%rax), %ymm2, %ymm0
+vpunpckhwd 1280(%rax), %ymm2, %ymm1
+vpunpcklwd 1472(%rax), %ymm3, %ymm2
+vpunpckhwd 1472(%rax), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 512(%r9)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 544(%r9)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 576(%r9)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 608(%r9)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 640(%r9)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 672(%r9)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 704(%r9)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 768(%r9)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 800(%r9)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 832(%r9)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 864(%r9)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 896(%r9)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 928(%r9)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 960(%r9)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 736(%r9)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 992(%r9)
+vmovdqa 64(%rax), %ymm0
+vmovdqa 256(%rax), %ymm1
+vmovdqa 448(%rax), %ymm2
+vmovdqa 640(%rax), %ymm3
+vpunpcklwd 160(%rax), %ymm0, %ymm4
+vpunpckhwd 160(%rax), %ymm0, %ymm5
+vpunpcklwd 352(%rax), %ymm1, %ymm6
+vpunpckhwd 352(%rax), %ymm1, %ymm7
+vpunpcklwd 544(%rax), %ymm2, %ymm8
+vpunpckhwd 544(%rax), %ymm2, %ymm9
+vpunpcklwd 736(%rax), %ymm3, %ymm10
+vpunpckhwd 736(%rax), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 832(%rax), %ymm0
+vmovdqa 1024(%rax), %ymm1
+vmovdqa 1216(%rax), %ymm2
+vmovdqa 1408(%rax), %ymm3
+vpunpcklwd 928(%rax), %ymm0, %ymm12
+vpunpckhwd 928(%rax), %ymm0, %ymm13
+vpunpcklwd 1120(%rax), %ymm1, %ymm14
+vpunpckhwd 1120(%rax), %ymm1, %ymm15
+vpunpcklwd 1312(%rax), %ymm2, %ymm0
+vpunpckhwd 1312(%rax), %ymm2, %ymm1
+vpunpcklwd 1504(%rax), %ymm3, %ymm2
+vpunpckhwd 1504(%rax), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 1024(%r9)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 1056(%r9)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 1088(%r9)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 1120(%r9)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 1152(%r9)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 1184(%r9)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 1216(%r9)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 1280(%r9)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 1312(%r9)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 1344(%r9)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 1376(%r9)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 1248(%r9)
+addq $32, %rsp
+subq $32, %rsp
+vmovdqa 0(%r11), %ymm0
+vmovdqa 192(%r11), %ymm1
+vmovdqa 384(%r11), %ymm2
+vmovdqa 576(%r11), %ymm3
+vpunpcklwd 96(%r11), %ymm0, %ymm4
+vpunpckhwd 96(%r11), %ymm0, %ymm5
+vpunpcklwd 288(%r11), %ymm1, %ymm6
+vpunpckhwd 288(%r11), %ymm1, %ymm7
+vpunpcklwd 480(%r11), %ymm2, %ymm8
+vpunpckhwd 480(%r11), %ymm2, %ymm9
+vpunpcklwd 672(%r11), %ymm3, %ymm10
+vpunpckhwd 672(%r11), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 768(%r11), %ymm0
+vmovdqa 960(%r11), %ymm1
+vmovdqa 1152(%r11), %ymm2
+vmovdqa 1344(%r11), %ymm3
+vpunpcklwd 864(%r11), %ymm0, %ymm12
+vpunpckhwd 864(%r11), %ymm0, %ymm13
+vpunpcklwd 1056(%r11), %ymm1, %ymm14
+vpunpckhwd 1056(%r11), %ymm1, %ymm15
+vpunpcklwd 1248(%r11), %ymm2, %ymm0
+vpunpckhwd 1248(%r11), %ymm2, %ymm1
+vpunpcklwd 1440(%r11), %ymm3, %ymm2
+vpunpckhwd 1440(%r11), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 1408(%r9)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 1440(%r9)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 1472(%r9)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 1504(%r9)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 1536(%r9)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 1568(%r9)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 1600(%r9)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 1664(%r9)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 1696(%r9)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 1728(%r9)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 1760(%r9)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 1792(%r9)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 1824(%r9)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 1856(%r9)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 1632(%r9)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 1888(%r9)
+vmovdqa 32(%r11), %ymm0
+vmovdqa 224(%r11), %ymm1
+vmovdqa 416(%r11), %ymm2
+vmovdqa 608(%r11), %ymm3
+vpunpcklwd 128(%r11), %ymm0, %ymm4
+vpunpckhwd 128(%r11), %ymm0, %ymm5
+vpunpcklwd 320(%r11), %ymm1, %ymm6
+vpunpckhwd 320(%r11), %ymm1, %ymm7
+vpunpcklwd 512(%r11), %ymm2, %ymm8
+vpunpckhwd 512(%r11), %ymm2, %ymm9
+vpunpcklwd 704(%r11), %ymm3, %ymm10
+vpunpckhwd 704(%r11), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 800(%r11), %ymm0
+vmovdqa 992(%r11), %ymm1
+vmovdqa 1184(%r11), %ymm2
+vmovdqa 1376(%r11), %ymm3
+vpunpcklwd 896(%r11), %ymm0, %ymm12
+vpunpckhwd 896(%r11), %ymm0, %ymm13
+vpunpcklwd 1088(%r11), %ymm1, %ymm14
+vpunpckhwd 1088(%r11), %ymm1, %ymm15
+vpunpcklwd 1280(%r11), %ymm2, %ymm0
+vpunpckhwd 1280(%r11), %ymm2, %ymm1
+vpunpcklwd 1472(%r11), %ymm3, %ymm2
+vpunpckhwd 1472(%r11), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 1920(%r9)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 1952(%r9)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 1984(%r9)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 2016(%r9)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 2048(%r9)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 2080(%r9)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 2112(%r9)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 2176(%r9)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 2208(%r9)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 2240(%r9)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 2272(%r9)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 2304(%r9)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 2336(%r9)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 2368(%r9)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 2144(%r9)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 2400(%r9)
+vmovdqa 64(%r11), %ymm0
+vmovdqa 256(%r11), %ymm1
+vmovdqa 448(%r11), %ymm2
+vmovdqa 640(%r11), %ymm3
+vpunpcklwd 160(%r11), %ymm0, %ymm4
+vpunpckhwd 160(%r11), %ymm0, %ymm5
+vpunpcklwd 352(%r11), %ymm1, %ymm6
+vpunpckhwd 352(%r11), %ymm1, %ymm7
+vpunpcklwd 544(%r11), %ymm2, %ymm8
+vpunpckhwd 544(%r11), %ymm2, %ymm9
+vpunpcklwd 736(%r11), %ymm3, %ymm10
+vpunpckhwd 736(%r11), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 832(%r11), %ymm0
+vmovdqa 1024(%r11), %ymm1
+vmovdqa 1216(%r11), %ymm2
+vmovdqa 1408(%r11), %ymm3
+vpunpcklwd 928(%r11), %ymm0, %ymm12
+vpunpckhwd 928(%r11), %ymm0, %ymm13
+vpunpcklwd 1120(%r11), %ymm1, %ymm14
+vpunpckhwd 1120(%r11), %ymm1, %ymm15
+vpunpcklwd 1312(%r11), %ymm2, %ymm0
+vpunpckhwd 1312(%r11), %ymm2, %ymm1
+vpunpcklwd 1504(%r11), %ymm3, %ymm2
+vpunpckhwd 1504(%r11), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 2432(%r9)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 2464(%r9)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 2496(%r9)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 2528(%r9)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 2560(%r9)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 2592(%r9)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 2624(%r9)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 2688(%r9)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 2720(%r9)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 2752(%r9)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 2784(%r9)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 2656(%r9)
+addq $32, %rsp
+innerloop_4eced63f144beffcb0247f9c6f67d165:
+vmovdqa 0(%r9), %ymm0
+vmovdqa 1408(%r9), %ymm6
+vmovdqa 32(%r9), %ymm1
+vmovdqa 1440(%r9), %ymm7
+vmovdqa 64(%r9), %ymm2
+vmovdqa 1472(%r9), %ymm8
+vmovdqa 96(%r9), %ymm3
+vmovdqa 1504(%r9), %ymm9
+vmovdqa 128(%r9), %ymm4
+vmovdqa 1536(%r9), %ymm10
+vmovdqa 160(%r9), %ymm5
+vmovdqa 1568(%r9), %ymm11
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 2816(%r10)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 2848(%r10)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 2880(%r10)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 2912(%r10)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 2944(%r10)
+vpmullw %ymm0, %ymm11, %ymm13
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 2976(%r10)
+vpmullw %ymm1, %ymm11, %ymm12
+vpmullw %ymm2, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3008(%r10)
+vpmullw %ymm2, %ymm11, %ymm13
+vpmullw %ymm3, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3040(%r10)
+vpmullw %ymm3, %ymm11, %ymm12
+vpmullw %ymm4, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3072(%r10)
+vpmullw %ymm4, %ymm11, %ymm13
+vpmullw %ymm5, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3104(%r10)
+vpmullw %ymm5, %ymm11, %ymm12
+vmovdqa %ymm12, 3136(%r10)
+vmovdqa 192(%r9), %ymm0
+vmovdqa 1600(%r9), %ymm6
+vmovdqa 224(%r9), %ymm1
+vmovdqa 1632(%r9), %ymm7
+vmovdqa 256(%r9), %ymm2
+vmovdqa 1664(%r9), %ymm8
+vmovdqa 288(%r9), %ymm3
+vmovdqa 1696(%r9), %ymm9
+vmovdqa 320(%r9), %ymm4
+vmovdqa 1728(%r9), %ymm10
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 3200(%r10)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3232(%r10)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3264(%r10)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3296(%r10)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3328(%r10)
+vpmullw %ymm1, %ymm10, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3360(%r10)
+vpmullw %ymm2, %ymm10, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3392(%r10)
+vpmullw %ymm3, %ymm10, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3424(%r10)
+vpmullw %ymm4, %ymm10, %ymm12
+vmovdqa %ymm12, 3456(%r10)
+vpaddw 0(%r9), %ymm0, %ymm0
+vpaddw 1408(%r9), %ymm6, %ymm6
+vpaddw 32(%r9), %ymm1, %ymm1
+vpaddw 1440(%r9), %ymm7, %ymm7
+vpaddw 64(%r9), %ymm2, %ymm2
+vpaddw 1472(%r9), %ymm8, %ymm8
+vpaddw 96(%r9), %ymm3, %ymm3
+vpaddw 1504(%r9), %ymm9, %ymm9
+vpaddw 128(%r9), %ymm4, %ymm4
+vpaddw 1536(%r9), %ymm10, %ymm10
+vpmullw %ymm0, %ymm11, %ymm12
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpsubw 2976(%r10), %ymm12, %ymm12
+vpsubw 3360(%r10), %ymm12, %ymm12
+vmovdqa %ymm12, 3168(%r10)
+vpmullw %ymm5, %ymm7, %ymm12
+vpmullw %ymm5, %ymm8, %ymm13
+vpmullw %ymm5, %ymm9, %ymm14
+vpmullw %ymm5, %ymm10, %ymm15
+vpmullw %ymm1, %ymm11, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm10, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm3, %ymm9, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm4, %ymm8, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm11, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm10, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm4, %ymm9, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm11, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm10, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm11, %ymm5
+vpaddw %ymm5, %ymm15, %ymm15
+vpmullw %ymm0, %ymm10, %ymm11
+vpmullw %ymm1, %ymm9, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm2, %ymm8, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm3, %ymm7, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm4, %ymm6, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm0, %ymm9, %ymm10
+vpmullw %ymm1, %ymm8, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm2, %ymm7, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm3, %ymm6, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm0, %ymm8, %ymm9
+vpmullw %ymm1, %ymm7, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm2, %ymm6, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm0, %ymm7, %ymm8
+vpmullw %ymm1, %ymm6, %ymm5
+vpaddw %ymm5, %ymm8, %ymm8
+vpmullw %ymm0, %ymm6, %ymm7
+vmovdqa 3008(%r10), %ymm0
+vpsubw 3200(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm12, %ymm6
+vpsubw 3392(%r10), %ymm6, %ymm6
+vmovdqa %ymm6, 3200(%r10)
+vpaddw %ymm7, %ymm0, %ymm0
+vpsubw 2816(%r10), %ymm0, %ymm0
+vmovdqa %ymm0, 3008(%r10)
+vmovdqa 3040(%r10), %ymm1
+vpsubw 3232(%r10), %ymm1, %ymm1
+vpsubw %ymm1, %ymm13, %ymm7
+vpsubw 3424(%r10), %ymm7, %ymm7
+vmovdqa %ymm7, 3232(%r10)
+vpaddw %ymm8, %ymm1, %ymm1
+vpsubw 2848(%r10), %ymm1, %ymm1
+vmovdqa %ymm1, 3040(%r10)
+vmovdqa 3072(%r10), %ymm2
+vpsubw 3264(%r10), %ymm2, %ymm2
+vpsubw %ymm2, %ymm14, %ymm8
+vpsubw 3456(%r10), %ymm8, %ymm8
+vmovdqa %ymm8, 3264(%r10)
+vpaddw %ymm9, %ymm2, %ymm2
+vpsubw 2880(%r10), %ymm2, %ymm2
+vmovdqa %ymm2, 3072(%r10)
+vmovdqa 3104(%r10), %ymm3
+vpsubw 3296(%r10), %ymm3, %ymm3
+vpsubw %ymm3, %ymm15, %ymm9
+vmovdqa %ymm9, 3296(%r10)
+vpaddw %ymm10, %ymm3, %ymm3
+vpsubw 2912(%r10), %ymm3, %ymm3
+vmovdqa %ymm3, 3104(%r10)
+vmovdqa 3136(%r10), %ymm4
+vpsubw 3328(%r10), %ymm4, %ymm4
+vpaddw %ymm11, %ymm4, %ymm4
+vpsubw 2944(%r10), %ymm4, %ymm4
+vmovdqa %ymm4, 3136(%r10)
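+# The next block repeats the pattern above on the following 352-byte chunk of
+# each operand (352(%r9)... with 1760(%r9)...): vpmullw/vpaddw accumulate
+# 16-bit partial products into scratch at 3520(%r10) onward, and the two
+# half-products are then combined (Karatsuba-style) with vpsubw/vpaddw.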
+vmovdqa 352(%r9), %ymm0
+vmovdqa 1760(%r9), %ymm6
+vmovdqa 384(%r9), %ymm1
+vmovdqa 1792(%r9), %ymm7
+vmovdqa 416(%r9), %ymm2
+vmovdqa 1824(%r9), %ymm8
+vmovdqa 448(%r9), %ymm3
+vmovdqa 1856(%r9), %ymm9
+vmovdqa 480(%r9), %ymm4
+vmovdqa 1888(%r9), %ymm10
+vmovdqa 512(%r9), %ymm5
+vmovdqa 1920(%r9), %ymm11
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 3520(%r10)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3552(%r10)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3584(%r10)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3616(%r10)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3648(%r10)
+vpmullw %ymm0, %ymm11, %ymm13
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3680(%r10)
+vpmullw %ymm1, %ymm11, %ymm12
+vpmullw %ymm2, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3712(%r10)
+vpmullw %ymm2, %ymm11, %ymm13
+vpmullw %ymm3, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3744(%r10)
+vpmullw %ymm3, %ymm11, %ymm12
+vpmullw %ymm4, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3776(%r10)
+vpmullw %ymm4, %ymm11, %ymm13
+vpmullw %ymm5, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3808(%r10)
+vpmullw %ymm5, %ymm11, %ymm12
+vmovdqa %ymm12, 3840(%r10)
+vmovdqa 544(%r9), %ymm0
+vmovdqa 1952(%r9), %ymm6
+vmovdqa 576(%r9), %ymm1
+vmovdqa 1984(%r9), %ymm7
+vmovdqa 608(%r9), %ymm2
+vmovdqa 2016(%r9), %ymm8
+vmovdqa 640(%r9), %ymm3
+vmovdqa 2048(%r9), %ymm9
+vmovdqa 672(%r9), %ymm4
+vmovdqa 2080(%r9), %ymm10
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 3904(%r10)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3936(%r10)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3968(%r10)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 4000(%r10)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 4032(%r10)
+vpmullw %ymm1, %ymm10, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 4064(%r10)
+vpmullw %ymm2, %ymm10, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 4096(%r10)
+vpmullw %ymm3, %ymm10, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 4128(%r10)
+vpmullw %ymm4, %ymm10, %ymm12
+vmovdqa %ymm12, 4160(%r10)
+vpaddw 352(%r9), %ymm0, %ymm0
+vpaddw 1760(%r9), %ymm6, %ymm6
+vpaddw 384(%r9), %ymm1, %ymm1
+vpaddw 1792(%r9), %ymm7, %ymm7
+vpaddw 416(%r9), %ymm2, %ymm2
+vpaddw 1824(%r9), %ymm8, %ymm8
+vpaddw 448(%r9), %ymm3, %ymm3
+vpaddw 1856(%r9), %ymm9, %ymm9
+vpaddw 480(%r9), %ymm4, %ymm4
+vpaddw 1888(%r9), %ymm10, %ymm10
+vpmullw %ymm0, %ymm11, %ymm12
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpsubw 3680(%r10), %ymm12, %ymm12
+vpsubw 4064(%r10), %ymm12, %ymm12
+vmovdqa %ymm12, 3872(%r10)
+vpmullw %ymm5, %ymm7, %ymm12
+vpmullw %ymm5, %ymm8, %ymm13
+vpmullw %ymm5, %ymm9, %ymm14
+vpmullw %ymm5, %ymm10, %ymm15
+vpmullw %ymm1, %ymm11, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm10, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm3, %ymm9, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm4, %ymm8, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm11, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm10, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm4, %ymm9, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm11, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm10, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm11, %ymm5
+vpaddw %ymm5, %ymm15, %ymm15
+vpmullw %ymm0, %ymm10, %ymm11
+vpmullw %ymm1, %ymm9, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm2, %ymm8, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm3, %ymm7, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm4, %ymm6, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm0, %ymm9, %ymm10
+vpmullw %ymm1, %ymm8, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm2, %ymm7, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm3, %ymm6, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm0, %ymm8, %ymm9
+vpmullw %ymm1, %ymm7, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm2, %ymm6, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm0, %ymm7, %ymm8
+vpmullw %ymm1, %ymm6, %ymm5
+vpaddw %ymm5, %ymm8, %ymm8
+vpmullw %ymm0, %ymm6, %ymm7
+vmovdqa 3712(%r10), %ymm0
+vpsubw 3904(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm12, %ymm6
+vpsubw 4096(%r10), %ymm6, %ymm6
+vmovdqa %ymm6, 3904(%r10)
+vpaddw %ymm7, %ymm0, %ymm0
+vpsubw 3520(%r10), %ymm0, %ymm0
+vmovdqa %ymm0, 3712(%r10)
+vmovdqa 3744(%r10), %ymm1
+vpsubw 3936(%r10), %ymm1, %ymm1
+vpsubw %ymm1, %ymm13, %ymm7
+vpsubw 4128(%r10), %ymm7, %ymm7
+vmovdqa %ymm7, 3936(%r10)
+vpaddw %ymm8, %ymm1, %ymm1
+vpsubw 3552(%r10), %ymm1, %ymm1
+vmovdqa %ymm1, 3744(%r10)
+vmovdqa 3776(%r10), %ymm2
+vpsubw 3968(%r10), %ymm2, %ymm2
+vpsubw %ymm2, %ymm14, %ymm8
+vpsubw 4160(%r10), %ymm8, %ymm8
+vmovdqa %ymm8, 3968(%r10)
+vpaddw %ymm9, %ymm2, %ymm2
+vpsubw 3584(%r10), %ymm2, %ymm2
+vmovdqa %ymm2, 3776(%r10)
+vmovdqa 3808(%r10), %ymm3
+vpsubw 4000(%r10), %ymm3, %ymm3
+vpsubw %ymm3, %ymm15, %ymm9
+vmovdqa %ymm9, 4000(%r10)
+vpaddw %ymm10, %ymm3, %ymm3
+vpsubw 3616(%r10), %ymm3, %ymm3
+vmovdqa %ymm3, 3808(%r10)
+vmovdqa 3840(%r10), %ymm4
+vpsubw 4032(%r10), %ymm4, %ymm4
+vpaddw %ymm11, %ymm4, %ymm4
+vpsubw 3648(%r10), %ymm4, %ymm4
+vmovdqa %ymm4, 3840(%r10)
+vmovdqa 0(%r9), %ymm0
+vmovdqa 1408(%r9), %ymm6
+vpaddw 352(%r9), %ymm0, %ymm0
+vpaddw 1760(%r9), %ymm6, %ymm6
+vmovdqa 32(%r9), %ymm1
+vmovdqa 1440(%r9), %ymm7
+vpaddw 384(%r9), %ymm1, %ymm1
+vpaddw 1792(%r9), %ymm7, %ymm7
+vmovdqa 64(%r9), %ymm2
+vmovdqa 1472(%r9), %ymm8
+vpaddw 416(%r9), %ymm2, %ymm2
+vpaddw 1824(%r9), %ymm8, %ymm8
+vmovdqa 96(%r9), %ymm3
+vmovdqa 1504(%r9), %ymm9
+vpaddw 448(%r9), %ymm3, %ymm3
+vpaddw 1856(%r9), %ymm9, %ymm9
+vmovdqa 128(%r9), %ymm4
+vmovdqa 1536(%r9), %ymm10
+vpaddw 480(%r9), %ymm4, %ymm4
+vpaddw 1888(%r9), %ymm10, %ymm10
+vmovdqa 160(%r9), %ymm5
+vmovdqa 1568(%r9), %ymm11
+vpaddw 512(%r9), %ymm5, %ymm5
+vpaddw 1920(%r9), %ymm11, %ymm11
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 5888(%rsp)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 5920(%rsp)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 5952(%rsp)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 5984(%rsp)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6016(%rsp)
+vpmullw %ymm0, %ymm11, %ymm13
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6048(%rsp)
+vpmullw %ymm1, %ymm11, %ymm12
+vpmullw %ymm2, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6080(%rsp)
+vpmullw %ymm2, %ymm11, %ymm13
+vpmullw %ymm3, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6112(%rsp)
+vpmullw %ymm3, %ymm11, %ymm12
+vpmullw %ymm4, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6144(%rsp)
+vpmullw %ymm4, %ymm11, %ymm13
+vpmullw %ymm5, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6176(%rsp)
+vpmullw %ymm5, %ymm11, %ymm12
+vmovdqa %ymm12, 6208(%rsp)
+vmovdqa 192(%r9), %ymm0
+vmovdqa 1600(%r9), %ymm6
+vpaddw 544(%r9), %ymm0, %ymm0
+vpaddw 1952(%r9), %ymm6, %ymm6
+vmovdqa 224(%r9), %ymm1
+vmovdqa 1632(%r9), %ymm7
+vpaddw 576(%r9), %ymm1, %ymm1
+vpaddw 1984(%r9), %ymm7, %ymm7
+vmovdqa 256(%r9), %ymm2
+vmovdqa 1664(%r9), %ymm8
+vpaddw 608(%r9), %ymm2, %ymm2
+vpaddw 2016(%r9), %ymm8, %ymm8
+vmovdqa 288(%r9), %ymm3
+vmovdqa 1696(%r9), %ymm9
+vpaddw 640(%r9), %ymm3, %ymm3
+vpaddw 2048(%r9), %ymm9, %ymm9
+vmovdqa 320(%r9), %ymm4
+vmovdqa 1728(%r9), %ymm10
+vpaddw 672(%r9), %ymm4, %ymm4
+vpaddw 2080(%r9), %ymm10, %ymm10
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 6272(%rsp)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6304(%rsp)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6336(%rsp)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6368(%rsp)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6400(%rsp)
+vpmullw %ymm1, %ymm10, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6432(%rsp)
+vpmullw %ymm2, %ymm10, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6464(%rsp)
+vpmullw %ymm3, %ymm10, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6496(%rsp)
+vpmullw %ymm4, %ymm10, %ymm12
+vmovdqa %ymm12, 6528(%rsp)
+vpaddw 0(%r9), %ymm0, %ymm0
+vpaddw 1408(%r9), %ymm6, %ymm6
+vpaddw 352(%r9), %ymm0, %ymm0
+vpaddw 1760(%r9), %ymm6, %ymm6
+vpaddw 32(%r9), %ymm1, %ymm1
+vpaddw 1440(%r9), %ymm7, %ymm7
+vpaddw 384(%r9), %ymm1, %ymm1
+vpaddw 1792(%r9), %ymm7, %ymm7
+vpaddw 64(%r9), %ymm2, %ymm2
+vpaddw 1472(%r9), %ymm8, %ymm8
+vpaddw 416(%r9), %ymm2, %ymm2
+vpaddw 1824(%r9), %ymm8, %ymm8
+vpaddw 96(%r9), %ymm3, %ymm3
+vpaddw 1504(%r9), %ymm9, %ymm9
+vpaddw 448(%r9), %ymm3, %ymm3
+vpaddw 1856(%r9), %ymm9, %ymm9
+vpaddw 128(%r9), %ymm4, %ymm4
+vpaddw 1536(%r9), %ymm10, %ymm10
+vpaddw 480(%r9), %ymm4, %ymm4
+vpaddw 1888(%r9), %ymm10, %ymm10
+vpmullw %ymm0, %ymm11, %ymm12
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpsubw 6048(%rsp), %ymm12, %ymm12
+vpsubw 6432(%rsp), %ymm12, %ymm12
+vmovdqa %ymm12, 6240(%rsp)
+vpmullw %ymm5, %ymm7, %ymm12
+vpmullw %ymm5, %ymm8, %ymm13
+vpmullw %ymm5, %ymm9, %ymm14
+vpmullw %ymm5, %ymm10, %ymm15
+vpmullw %ymm1, %ymm11, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm10, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm3, %ymm9, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm4, %ymm8, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm11, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm10, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm4, %ymm9, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm11, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm10, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm11, %ymm5
+vpaddw %ymm5, %ymm15, %ymm15
+vpmullw %ymm0, %ymm10, %ymm11
+vpmullw %ymm1, %ymm9, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm2, %ymm8, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm3, %ymm7, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm4, %ymm6, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm0, %ymm9, %ymm10
+vpmullw %ymm1, %ymm8, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm2, %ymm7, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm3, %ymm6, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm0, %ymm8, %ymm9
+vpmullw %ymm1, %ymm7, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm2, %ymm6, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm0, %ymm7, %ymm8
+vpmullw %ymm1, %ymm6, %ymm5
+vpaddw %ymm5, %ymm8, %ymm8
+vpmullw %ymm0, %ymm6, %ymm7
+vmovdqa 6080(%rsp), %ymm0
+vpsubw 6272(%rsp), %ymm0, %ymm0
+vpsubw %ymm0, %ymm12, %ymm6
+vpsubw 6464(%rsp), %ymm6, %ymm6
+vmovdqa %ymm6, 6272(%rsp)
+vpaddw %ymm7, %ymm0, %ymm0
+vpsubw 5888(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 6080(%rsp)
+vmovdqa 6112(%rsp), %ymm1
+vpsubw 6304(%rsp), %ymm1, %ymm1
+vpsubw %ymm1, %ymm13, %ymm7
+vpsubw 6496(%rsp), %ymm7, %ymm7
+vmovdqa %ymm7, 6304(%rsp)
+vpaddw %ymm8, %ymm1, %ymm1
+vpsubw 5920(%rsp), %ymm1, %ymm1
+vmovdqa %ymm1, 6112(%rsp)
+vmovdqa 6144(%rsp), %ymm2
+vpsubw 6336(%rsp), %ymm2, %ymm2
+vpsubw %ymm2, %ymm14, %ymm8
+vpsubw 6528(%rsp), %ymm8, %ymm8
+vmovdqa %ymm8, 6336(%rsp)
+vpaddw %ymm9, %ymm2, %ymm2
+vpsubw 5952(%rsp), %ymm2, %ymm2
+vmovdqa %ymm2, 6144(%rsp)
+vmovdqa 6176(%rsp), %ymm3
+vpsubw 6368(%rsp), %ymm3, %ymm3
+vpsubw %ymm3, %ymm15, %ymm9
+vmovdqa %ymm9, 6368(%rsp)
+vpaddw %ymm10, %ymm3, %ymm3
+vpsubw 5984(%rsp), %ymm3, %ymm3
+vmovdqa %ymm3, 6176(%rsp)
+vmovdqa 6208(%rsp), %ymm4
+vpsubw 6400(%rsp), %ymm4, %ymm4
+vpaddw %ymm11, %ymm4, %ymm4
+vpsubw 6016(%rsp), %ymm4, %ymm4
+vmovdqa %ymm4, 6208(%rsp)
+vmovdqa 6208(%rsp), %ymm0
+vpsubw 3136(%r10), %ymm0, %ymm0
+vpsubw 3840(%r10), %ymm0, %ymm0
+vmovdqa %ymm0, 3488(%r10)
+vmovdqa 3168(%r10), %ymm0
+vpsubw 3520(%r10), %ymm0, %ymm0
+vmovdqa 6240(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 3872(%r10), %ymm1, %ymm1
+vpsubw 2816(%r10), %ymm0, %ymm0
+vpaddw 5888(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3168(%r10)
+vmovdqa %ymm1, 3520(%r10)
+vmovdqa 3200(%r10), %ymm0
+vpsubw 3552(%r10), %ymm0, %ymm0
+vmovdqa 6272(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 3904(%r10), %ymm1, %ymm1
+vpsubw 2848(%r10), %ymm0, %ymm0
+vpaddw 5920(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3200(%r10)
+vmovdqa %ymm1, 3552(%r10)
+vmovdqa 3232(%r10), %ymm0
+vpsubw 3584(%r10), %ymm0, %ymm0
+vmovdqa 6304(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 3936(%r10), %ymm1, %ymm1
+vpsubw 2880(%r10), %ymm0, %ymm0
+vpaddw 5952(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3232(%r10)
+vmovdqa %ymm1, 3584(%r10)
+vmovdqa 3264(%r10), %ymm0
+vpsubw 3616(%r10), %ymm0, %ymm0
+vmovdqa 6336(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 3968(%r10), %ymm1, %ymm1
+vpsubw 2912(%r10), %ymm0, %ymm0
+vpaddw 5984(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3264(%r10)
+vmovdqa %ymm1, 3616(%r10)
+vmovdqa 3296(%r10), %ymm0
+vpsubw 3648(%r10), %ymm0, %ymm0
+vmovdqa 6368(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 4000(%r10), %ymm1, %ymm1
+vpsubw 2944(%r10), %ymm0, %ymm0
+vpaddw 6016(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3296(%r10)
+vmovdqa %ymm1, 3648(%r10)
+vmovdqa 3328(%r10), %ymm0
+vpsubw 3680(%r10), %ymm0, %ymm0
+vmovdqa 6400(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 4032(%r10), %ymm1, %ymm1
+vpsubw 2976(%r10), %ymm0, %ymm0
+vpaddw 6048(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3328(%r10)
+vmovdqa %ymm1, 3680(%r10)
+vmovdqa 3360(%r10), %ymm0
+vpsubw 3712(%r10), %ymm0, %ymm0
+vmovdqa 6432(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 4064(%r10), %ymm1, %ymm1
+vpsubw 3008(%r10), %ymm0, %ymm0
+vpaddw 6080(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3360(%r10)
+vmovdqa %ymm1, 3712(%r10)
+vmovdqa 3392(%r10), %ymm0
+vpsubw 3744(%r10), %ymm0, %ymm0
+vmovdqa 6464(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 4096(%r10), %ymm1, %ymm1
+vpsubw 3040(%r10), %ymm0, %ymm0
+vpaddw 6112(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3392(%r10)
+vmovdqa %ymm1, 3744(%r10)
+vmovdqa 3424(%r10), %ymm0
+vpsubw 3776(%r10), %ymm0, %ymm0
+vmovdqa 6496(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 4128(%r10), %ymm1, %ymm1
+vpsubw 3072(%r10), %ymm0, %ymm0
+vpaddw 6144(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3424(%r10)
+vmovdqa %ymm1, 3776(%r10)
+vmovdqa 3456(%r10), %ymm0
+vpsubw 3808(%r10), %ymm0, %ymm0
+vmovdqa 6528(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 4160(%r10), %ymm1, %ymm1
+vpsubw 3104(%r10), %ymm0, %ymm0
+vpaddw 6176(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3456(%r10)
+vmovdqa %ymm1, 3808(%r10)
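+# Loop control for the inner pass: negating %ecx flips its sign, so the jns
+# is taken once %ecx turns non-negative; until then the source pointer (%r9)
+# advances by 704 bytes and the destination pointer (%r10) by 1408 bytes for
+# one more pass over the block above, and both offsets are undone at done_*.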
+neg %ecx
+jns done_4eced63f144beffcb0247f9c6f67d165
+add $704, %r9
+add $1408, %r10
+jmp innerloop_4eced63f144beffcb0247f9c6f67d165
+done_4eced63f144beffcb0247f9c6f67d165:
+sub $704, %r9
+sub $1408, %r10
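+# Stage the cross-term operands: each 32-byte word group of the low 704-byte
+# half is added to the matching group of the high half (0(%r9)+704(%r9),
+# 1408(%r9)+2112(%r9), ...) and spilled to the stack at 6592(%rsp)-7968(%rsp);
+# these sums are multiplied below with the same schoolbook pattern.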
+vmovdqa 0(%r9), %ymm0
+vpaddw 704(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6592(%rsp)
+vmovdqa 1408(%r9), %ymm0
+vpaddw 2112(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7296(%rsp)
+vmovdqa 32(%r9), %ymm0
+vpaddw 736(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6624(%rsp)
+vmovdqa 1440(%r9), %ymm0
+vpaddw 2144(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7328(%rsp)
+vmovdqa 64(%r9), %ymm0
+vpaddw 768(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6656(%rsp)
+vmovdqa 1472(%r9), %ymm0
+vpaddw 2176(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7360(%rsp)
+vmovdqa 96(%r9), %ymm0
+vpaddw 800(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6688(%rsp)
+vmovdqa 1504(%r9), %ymm0
+vpaddw 2208(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7392(%rsp)
+vmovdqa 128(%r9), %ymm0
+vpaddw 832(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6720(%rsp)
+vmovdqa 1536(%r9), %ymm0
+vpaddw 2240(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7424(%rsp)
+vmovdqa 160(%r9), %ymm0
+vpaddw 864(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6752(%rsp)
+vmovdqa 1568(%r9), %ymm0
+vpaddw 2272(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7456(%rsp)
+vmovdqa 192(%r9), %ymm0
+vpaddw 896(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6784(%rsp)
+vmovdqa 1600(%r9), %ymm0
+vpaddw 2304(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7488(%rsp)
+vmovdqa 224(%r9), %ymm0
+vpaddw 928(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6816(%rsp)
+vmovdqa 1632(%r9), %ymm0
+vpaddw 2336(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7520(%rsp)
+vmovdqa 256(%r9), %ymm0
+vpaddw 960(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6848(%rsp)
+vmovdqa 1664(%r9), %ymm0
+vpaddw 2368(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7552(%rsp)
+vmovdqa 288(%r9), %ymm0
+vpaddw 992(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6880(%rsp)
+vmovdqa 1696(%r9), %ymm0
+vpaddw 2400(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7584(%rsp)
+vmovdqa 320(%r9), %ymm0
+vpaddw 1024(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6912(%rsp)
+vmovdqa 1728(%r9), %ymm0
+vpaddw 2432(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7616(%rsp)
+vmovdqa 352(%r9), %ymm0
+vpaddw 1056(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6944(%rsp)
+vmovdqa 1760(%r9), %ymm0
+vpaddw 2464(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7648(%rsp)
+vmovdqa 384(%r9), %ymm0
+vpaddw 1088(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6976(%rsp)
+vmovdqa 1792(%r9), %ymm0
+vpaddw 2496(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7680(%rsp)
+vmovdqa 416(%r9), %ymm0
+vpaddw 1120(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7008(%rsp)
+vmovdqa 1824(%r9), %ymm0
+vpaddw 2528(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7712(%rsp)
+vmovdqa 448(%r9), %ymm0
+vpaddw 1152(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7040(%rsp)
+vmovdqa 1856(%r9), %ymm0
+vpaddw 2560(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7744(%rsp)
+vmovdqa 480(%r9), %ymm0
+vpaddw 1184(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7072(%rsp)
+vmovdqa 1888(%r9), %ymm0
+vpaddw 2592(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7776(%rsp)
+vmovdqa 512(%r9), %ymm0
+vpaddw 1216(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7104(%rsp)
+vmovdqa 1920(%r9), %ymm0
+vpaddw 2624(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7808(%rsp)
+vmovdqa 544(%r9), %ymm0
+vpaddw 1248(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7136(%rsp)
+vmovdqa 1952(%r9), %ymm0
+vpaddw 2656(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7840(%rsp)
+vmovdqa 576(%r9), %ymm0
+vpaddw 1280(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7168(%rsp)
+vmovdqa 1984(%r9), %ymm0
+vpaddw 2688(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7872(%rsp)
+vmovdqa 608(%r9), %ymm0
+vpaddw 1312(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7200(%rsp)
+vmovdqa 2016(%r9), %ymm0
+vpaddw 2720(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7904(%rsp)
+vmovdqa 640(%r9), %ymm0
+vpaddw 1344(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7232(%rsp)
+vmovdqa 2048(%r9), %ymm0
+vpaddw 2752(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7936(%rsp)
+vmovdqa 672(%r9), %ymm0
+vpaddw 1376(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7264(%rsp)
+vmovdqa 2080(%r9), %ymm0
+vpaddw 2784(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7968(%rsp)
+vmovdqa 6592(%rsp), %ymm0
+vmovdqa 7296(%rsp), %ymm6
+vmovdqa 6624(%rsp), %ymm1
+vmovdqa 7328(%rsp), %ymm7
+vmovdqa 6656(%rsp), %ymm2
+vmovdqa 7360(%rsp), %ymm8
+vmovdqa 6688(%rsp), %ymm3
+vmovdqa 7392(%rsp), %ymm9
+vmovdqa 6720(%rsp), %ymm4
+vmovdqa 7424(%rsp), %ymm10
+vmovdqa 6752(%rsp), %ymm5
+vmovdqa 7456(%rsp), %ymm11
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 8000(%rsp)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8032(%rsp)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8064(%rsp)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8096(%rsp)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8128(%rsp)
+vpmullw %ymm0, %ymm11, %ymm13
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8160(%rsp)
+vpmullw %ymm1, %ymm11, %ymm12
+vpmullw %ymm2, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8192(%rsp)
+vpmullw %ymm2, %ymm11, %ymm13
+vpmullw %ymm3, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8224(%rsp)
+vpmullw %ymm3, %ymm11, %ymm12
+vpmullw %ymm4, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8256(%rsp)
+vpmullw %ymm4, %ymm11, %ymm13
+vpmullw %ymm5, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8288(%rsp)
+vpmullw %ymm5, %ymm11, %ymm12
+vmovdqa %ymm12, 8320(%rsp)
+vmovdqa 6784(%rsp), %ymm0
+vmovdqa 7488(%rsp), %ymm6
+vmovdqa 6816(%rsp), %ymm1
+vmovdqa 7520(%rsp), %ymm7
+vmovdqa 6848(%rsp), %ymm2
+vmovdqa 7552(%rsp), %ymm8
+vmovdqa 6880(%rsp), %ymm3
+vmovdqa 7584(%rsp), %ymm9
+vmovdqa 6912(%rsp), %ymm4
+vmovdqa 7616(%rsp), %ymm10
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 8384(%rsp)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8416(%rsp)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8448(%rsp)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8480(%rsp)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8512(%rsp)
+vpmullw %ymm1, %ymm10, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8544(%rsp)
+vpmullw %ymm2, %ymm10, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8576(%rsp)
+vpmullw %ymm3, %ymm10, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8608(%rsp)
+vpmullw %ymm4, %ymm10, %ymm12
+vmovdqa %ymm12, 8640(%rsp)
+vpaddw 6592(%rsp), %ymm0, %ymm0
+vpaddw 7296(%rsp), %ymm6, %ymm6
+vpaddw 6624(%rsp), %ymm1, %ymm1
+vpaddw 7328(%rsp), %ymm7, %ymm7
+vpaddw 6656(%rsp), %ymm2, %ymm2
+vpaddw 7360(%rsp), %ymm8, %ymm8
+vpaddw 6688(%rsp), %ymm3, %ymm3
+vpaddw 7392(%rsp), %ymm9, %ymm9
+vpaddw 6720(%rsp), %ymm4, %ymm4
+vpaddw 7424(%rsp), %ymm10, %ymm10
+vpmullw %ymm0, %ymm11, %ymm12
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpsubw 8160(%rsp), %ymm12, %ymm12
+vpsubw 8544(%rsp), %ymm12, %ymm12
+vmovdqa %ymm12, 8352(%rsp)
+vpmullw %ymm5, %ymm7, %ymm12
+vpmullw %ymm5, %ymm8, %ymm13
+vpmullw %ymm5, %ymm9, %ymm14
+vpmullw %ymm5, %ymm10, %ymm15
+vpmullw %ymm1, %ymm11, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm10, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm3, %ymm9, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm4, %ymm8, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm11, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm10, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm4, %ymm9, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm11, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm10, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm11, %ymm5
+vpaddw %ymm5, %ymm15, %ymm15
+vpmullw %ymm0, %ymm10, %ymm11
+vpmullw %ymm1, %ymm9, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm2, %ymm8, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm3, %ymm7, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm4, %ymm6, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm0, %ymm9, %ymm10
+vpmullw %ymm1, %ymm8, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm2, %ymm7, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm3, %ymm6, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm0, %ymm8, %ymm9
+vpmullw %ymm1, %ymm7, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm2, %ymm6, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm0, %ymm7, %ymm8
+vpmullw %ymm1, %ymm6, %ymm5
+vpaddw %ymm5, %ymm8, %ymm8
+vpmullw %ymm0, %ymm6, %ymm7
+vmovdqa 8192(%rsp), %ymm0
+vpsubw 8384(%rsp), %ymm0, %ymm0
+vpsubw %ymm0, %ymm12, %ymm6
+vpsubw 8576(%rsp), %ymm6, %ymm6
+vmovdqa %ymm6, 8384(%rsp)
+vpaddw %ymm7, %ymm0, %ymm0
+vpsubw 8000(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8192(%rsp)
+vmovdqa 8224(%rsp), %ymm1
+vpsubw 8416(%rsp), %ymm1, %ymm1
+vpsubw %ymm1, %ymm13, %ymm7
+vpsubw 8608(%rsp), %ymm7, %ymm7
+vmovdqa %ymm7, 8416(%rsp)
+vpaddw %ymm8, %ymm1, %ymm1
+vpsubw 8032(%rsp), %ymm1, %ymm1
+vmovdqa %ymm1, 8224(%rsp)
+vmovdqa 8256(%rsp), %ymm2
+vpsubw 8448(%rsp), %ymm2, %ymm2
+vpsubw %ymm2, %ymm14, %ymm8
+vpsubw 8640(%rsp), %ymm8, %ymm8
+vmovdqa %ymm8, 8448(%rsp)
+vpaddw %ymm9, %ymm2, %ymm2
+vpsubw 8064(%rsp), %ymm2, %ymm2
+vmovdqa %ymm2, 8256(%rsp)
+vmovdqa 8288(%rsp), %ymm3
+vpsubw 8480(%rsp), %ymm3, %ymm3
+vpsubw %ymm3, %ymm15, %ymm9
+vmovdqa %ymm9, 8480(%rsp)
+vpaddw %ymm10, %ymm3, %ymm3
+vpsubw 8096(%rsp), %ymm3, %ymm3
+vmovdqa %ymm3, 8288(%rsp)
+vmovdqa 8320(%rsp), %ymm4
+vpsubw 8512(%rsp), %ymm4, %ymm4
+vpaddw %ymm11, %ymm4, %ymm4
+vpsubw 8128(%rsp), %ymm4, %ymm4
+vmovdqa %ymm4, 8320(%rsp)
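+# Same multiply-and-combine pattern for the next group of staged sums
+# (6944(%rsp)... with 7648(%rsp)...), partial products going to 8704(%rsp)
+# onward.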
+vmovdqa 6944(%rsp), %ymm0
+vmovdqa 7648(%rsp), %ymm6
+vmovdqa 6976(%rsp), %ymm1
+vmovdqa 7680(%rsp), %ymm7
+vmovdqa 7008(%rsp), %ymm2
+vmovdqa 7712(%rsp), %ymm8
+vmovdqa 7040(%rsp), %ymm3
+vmovdqa 7744(%rsp), %ymm9
+vmovdqa 7072(%rsp), %ymm4
+vmovdqa 7776(%rsp), %ymm10
+vmovdqa 7104(%rsp), %ymm5
+vmovdqa 7808(%rsp), %ymm11
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 8704(%rsp)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8736(%rsp)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8768(%rsp)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8800(%rsp)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8832(%rsp)
+vpmullw %ymm0, %ymm11, %ymm13
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8864(%rsp)
+vpmullw %ymm1, %ymm11, %ymm12
+vpmullw %ymm2, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8896(%rsp)
+vpmullw %ymm2, %ymm11, %ymm13
+vpmullw %ymm3, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8928(%rsp)
+vpmullw %ymm3, %ymm11, %ymm12
+vpmullw %ymm4, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8960(%rsp)
+vpmullw %ymm4, %ymm11, %ymm13
+vpmullw %ymm5, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8992(%rsp)
+vpmullw %ymm5, %ymm11, %ymm12
+vmovdqa %ymm12, 9024(%rsp)
+vmovdqa 7136(%rsp), %ymm0
+vmovdqa 7840(%rsp), %ymm6
+vmovdqa 7168(%rsp), %ymm1
+vmovdqa 7872(%rsp), %ymm7
+vmovdqa 7200(%rsp), %ymm2
+vmovdqa 7904(%rsp), %ymm8
+vmovdqa 7232(%rsp), %ymm3
+vmovdqa 7936(%rsp), %ymm9
+vmovdqa 7264(%rsp), %ymm4
+vmovdqa 7968(%rsp), %ymm10
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 9088(%rsp)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 9120(%rsp)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 9152(%rsp)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 9184(%rsp)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 9216(%rsp)
+vpmullw %ymm1, %ymm10, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 9248(%rsp)
+vpmullw %ymm2, %ymm10, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 9280(%rsp)
+vpmullw %ymm3, %ymm10, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 9312(%rsp)
+vpmullw %ymm4, %ymm10, %ymm12
+vmovdqa %ymm12, 9344(%rsp)
+vpaddw 6944(%rsp), %ymm0, %ymm0
+vpaddw 7648(%rsp), %ymm6, %ymm6
+vpaddw 6976(%rsp), %ymm1, %ymm1
+vpaddw 7680(%rsp), %ymm7, %ymm7
+vpaddw 7008(%rsp), %ymm2, %ymm2
+vpaddw 7712(%rsp), %ymm8, %ymm8
+vpaddw 7040(%rsp), %ymm3, %ymm3
+vpaddw 7744(%rsp), %ymm9, %ymm9
+vpaddw 7072(%rsp), %ymm4, %ymm4
+vpaddw 7776(%rsp), %ymm10, %ymm10
+vpmullw %ymm0, %ymm11, %ymm12
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpsubw 8864(%rsp), %ymm12, %ymm12
+vpsubw 9248(%rsp), %ymm12, %ymm12
+vmovdqa %ymm12, 9056(%rsp)
+vpmullw %ymm5, %ymm7, %ymm12
+vpmullw %ymm5, %ymm8, %ymm13
+vpmullw %ymm5, %ymm9, %ymm14
+vpmullw %ymm5, %ymm10, %ymm15
+vpmullw %ymm1, %ymm11, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm10, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm3, %ymm9, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm4, %ymm8, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm11, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm10, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm4, %ymm9, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm11, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm10, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm11, %ymm5
+vpaddw %ymm5, %ymm15, %ymm15
+vpmullw %ymm0, %ymm10, %ymm11
+vpmullw %ymm1, %ymm9, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm2, %ymm8, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm3, %ymm7, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm4, %ymm6, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm0, %ymm9, %ymm10
+vpmullw %ymm1, %ymm8, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm2, %ymm7, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm3, %ymm6, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm0, %ymm8, %ymm9
+vpmullw %ymm1, %ymm7, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm2, %ymm6, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm0, %ymm7, %ymm8
+vpmullw %ymm1, %ymm6, %ymm5
+vpaddw %ymm5, %ymm8, %ymm8
+vpmullw %ymm0, %ymm6, %ymm7
+vmovdqa 8896(%rsp), %ymm0
+vpsubw 9088(%rsp), %ymm0, %ymm0
+vpsubw %ymm0, %ymm12, %ymm6
+vpsubw 9280(%rsp), %ymm6, %ymm6
+vmovdqa %ymm6, 9088(%rsp)
+vpaddw %ymm7, %ymm0, %ymm0
+vpsubw 8704(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8896(%rsp)
+vmovdqa 8928(%rsp), %ymm1
+vpsubw 9120(%rsp), %ymm1, %ymm1
+vpsubw %ymm1, %ymm13, %ymm7
+vpsubw 9312(%rsp), %ymm7, %ymm7
+vmovdqa %ymm7, 9120(%rsp)
+vpaddw %ymm8, %ymm1, %ymm1
+vpsubw 8736(%rsp), %ymm1, %ymm1
+vmovdqa %ymm1, 8928(%rsp)
+vmovdqa 8960(%rsp), %ymm2
+vpsubw 9152(%rsp), %ymm2, %ymm2
+vpsubw %ymm2, %ymm14, %ymm8
+vpsubw 9344(%rsp), %ymm8, %ymm8
+vmovdqa %ymm8, 9152(%rsp)
+vpaddw %ymm9, %ymm2, %ymm2
+vpsubw 8768(%rsp), %ymm2, %ymm2
+vmovdqa %ymm2, 8960(%rsp)
+vmovdqa 8992(%rsp), %ymm3
+vpsubw 9184(%rsp), %ymm3, %ymm3
+vpsubw %ymm3, %ymm15, %ymm9
+vmovdqa %ymm9, 9184(%rsp)
+vpaddw %ymm10, %ymm3, %ymm3
+vpsubw 8800(%rsp), %ymm3, %ymm3
+vmovdqa %ymm3, 8992(%rsp)
+vmovdqa 9024(%rsp), %ymm4
+vpsubw 9216(%rsp), %ymm4, %ymm4
+vpaddw %ymm11, %ymm4, %ymm4
+vpsubw 8832(%rsp), %ymm4, %ymm4
+vmovdqa %ymm4, 9024(%rsp)
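+# One level up: add the staged half-sums pairwise again (6592(%rsp)+6944(%rsp),
+# 7296(%rsp)+7648(%rsp), ...) to build the operands for the outermost cross
+# product, which is computed next with the same vpmullw/vpaddw pattern into
+# 5888(%rsp) onward.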
+vmovdqa 6592(%rsp), %ymm0
+vmovdqa 7296(%rsp), %ymm6
+vpaddw 6944(%rsp), %ymm0, %ymm0
+vpaddw 7648(%rsp), %ymm6, %ymm6
+vmovdqa 6624(%rsp), %ymm1
+vmovdqa 7328(%rsp), %ymm7
+vpaddw 6976(%rsp), %ymm1, %ymm1
+vpaddw 7680(%rsp), %ymm7, %ymm7
+vmovdqa 6656(%rsp), %ymm2
+vmovdqa 7360(%rsp), %ymm8
+vpaddw 7008(%rsp), %ymm2, %ymm2
+vpaddw 7712(%rsp), %ymm8, %ymm8
+vmovdqa 6688(%rsp), %ymm3
+vmovdqa 7392(%rsp), %ymm9
+vpaddw 7040(%rsp), %ymm3, %ymm3
+vpaddw 7744(%rsp), %ymm9, %ymm9
+vmovdqa 6720(%rsp), %ymm4
+vmovdqa 7424(%rsp), %ymm10
+vpaddw 7072(%rsp), %ymm4, %ymm4
+vpaddw 7776(%rsp), %ymm10, %ymm10
+vmovdqa 6752(%rsp), %ymm5
+vmovdqa 7456(%rsp), %ymm11
+vpaddw 7104(%rsp), %ymm5, %ymm5
+vpaddw 7808(%rsp), %ymm11, %ymm11
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 5888(%rsp)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 5920(%rsp)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 5952(%rsp)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 5984(%rsp)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6016(%rsp)
+vpmullw %ymm0, %ymm11, %ymm13
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6048(%rsp)
+vpmullw %ymm1, %ymm11, %ymm12
+vpmullw %ymm2, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6080(%rsp)
+vpmullw %ymm2, %ymm11, %ymm13
+vpmullw %ymm3, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6112(%rsp)
+vpmullw %ymm3, %ymm11, %ymm12
+vpmullw %ymm4, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6144(%rsp)
+vpmullw %ymm4, %ymm11, %ymm13
+vpmullw %ymm5, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6176(%rsp)
+vpmullw %ymm5, %ymm11, %ymm12
+vmovdqa %ymm12, 6208(%rsp)
+vmovdqa 6784(%rsp), %ymm0
+vmovdqa 7488(%rsp), %ymm6
+vpaddw 7136(%rsp), %ymm0, %ymm0
+vpaddw 7840(%rsp), %ymm6, %ymm6
+vmovdqa 6816(%rsp), %ymm1
+vmovdqa 7520(%rsp), %ymm7
+vpaddw 7168(%rsp), %ymm1, %ymm1
+vpaddw 7872(%rsp), %ymm7, %ymm7
+vmovdqa 6848(%rsp), %ymm2
+vmovdqa 7552(%rsp), %ymm8
+vpaddw 7200(%rsp), %ymm2, %ymm2
+vpaddw 7904(%rsp), %ymm8, %ymm8
+vmovdqa 6880(%rsp), %ymm3
+vmovdqa 7584(%rsp), %ymm9
+vpaddw 7232(%rsp), %ymm3, %ymm3
+vpaddw 7936(%rsp), %ymm9, %ymm9
+vmovdqa 6912(%rsp), %ymm4
+vmovdqa 7616(%rsp), %ymm10
+vpaddw 7264(%rsp), %ymm4, %ymm4
+vpaddw 7968(%rsp), %ymm10, %ymm10
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 6272(%rsp)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6304(%rsp)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6336(%rsp)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6368(%rsp)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6400(%rsp)
+vpmullw %ymm1, %ymm10, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6432(%rsp)
+vpmullw %ymm2, %ymm10, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6464(%rsp)
+vpmullw %ymm3, %ymm10, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6496(%rsp)
+vpmullw %ymm4, %ymm10, %ymm12
+vmovdqa %ymm12, 6528(%rsp)
+vpaddw 6592(%rsp), %ymm0, %ymm0
+vpaddw 7296(%rsp), %ymm6, %ymm6
+vpaddw 6944(%rsp), %ymm0, %ymm0
+vpaddw 7648(%rsp), %ymm6, %ymm6
+vpaddw 6624(%rsp), %ymm1, %ymm1
+vpaddw 7328(%rsp), %ymm7, %ymm7
+vpaddw 6976(%rsp), %ymm1, %ymm1
+vpaddw 7680(%rsp), %ymm7, %ymm7
+vpaddw 6656(%rsp), %ymm2, %ymm2
+vpaddw 7360(%rsp), %ymm8, %ymm8
+vpaddw 7008(%rsp), %ymm2, %ymm2
+vpaddw 7712(%rsp), %ymm8, %ymm8
+vpaddw 6688(%rsp), %ymm3, %ymm3
+vpaddw 7392(%rsp), %ymm9, %ymm9
+vpaddw 7040(%rsp), %ymm3, %ymm3
+vpaddw 7744(%rsp), %ymm9, %ymm9
+vpaddw 6720(%rsp), %ymm4, %ymm4
+vpaddw 7424(%rsp), %ymm10, %ymm10
+vpaddw 7072(%rsp), %ymm4, %ymm4
+vpaddw 7776(%rsp), %ymm10, %ymm10
+vpmullw %ymm0, %ymm11, %ymm12
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpsubw 6048(%rsp), %ymm12, %ymm12
+vpsubw 6432(%rsp), %ymm12, %ymm12
+vmovdqa %ymm12, 6240(%rsp)
+vpmullw %ymm5, %ymm7, %ymm12
+vpmullw %ymm5, %ymm8, %ymm13
+vpmullw %ymm5, %ymm9, %ymm14
+vpmullw %ymm5, %ymm10, %ymm15
+vpmullw %ymm1, %ymm11, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm10, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm3, %ymm9, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm4, %ymm8, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm11, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm10, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm4, %ymm9, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm11, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm10, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm11, %ymm5
+vpaddw %ymm5, %ymm15, %ymm15
+vpmullw %ymm0, %ymm10, %ymm11
+vpmullw %ymm1, %ymm9, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm2, %ymm8, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm3, %ymm7, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm4, %ymm6, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm0, %ymm9, %ymm10
+vpmullw %ymm1, %ymm8, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm2, %ymm7, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm3, %ymm6, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm0, %ymm8, %ymm9
+vpmullw %ymm1, %ymm7, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm2, %ymm6, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm0, %ymm7, %ymm8
+vpmullw %ymm1, %ymm6, %ymm5
+vpaddw %ymm5, %ymm8, %ymm8
+vpmullw %ymm0, %ymm6, %ymm7
+vmovdqa 6080(%rsp), %ymm0
+vpsubw 6272(%rsp), %ymm0, %ymm0
+vpsubw %ymm0, %ymm12, %ymm6
+vpsubw 6464(%rsp), %ymm6, %ymm6
+vmovdqa %ymm6, 6272(%rsp)
+vpaddw %ymm7, %ymm0, %ymm0
+vpsubw 5888(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 6080(%rsp)
+vmovdqa 6112(%rsp), %ymm1
+vpsubw 6304(%rsp), %ymm1, %ymm1
+vpsubw %ymm1, %ymm13, %ymm7
+vpsubw 6496(%rsp), %ymm7, %ymm7
+vmovdqa %ymm7, 6304(%rsp)
+vpaddw %ymm8, %ymm1, %ymm1
+vpsubw 5920(%rsp), %ymm1, %ymm1
+vmovdqa %ymm1, 6112(%rsp)
+vmovdqa 6144(%rsp), %ymm2
+vpsubw 6336(%rsp), %ymm2, %ymm2
+vpsubw %ymm2, %ymm14, %ymm8
+vpsubw 6528(%rsp), %ymm8, %ymm8
+vmovdqa %ymm8, 6336(%rsp)
+vpaddw %ymm9, %ymm2, %ymm2
+vpsubw 5952(%rsp), %ymm2, %ymm2
+vmovdqa %ymm2, 6144(%rsp)
+vmovdqa 6176(%rsp), %ymm3
+vpsubw 6368(%rsp), %ymm3, %ymm3
+vpsubw %ymm3, %ymm15, %ymm9
+vmovdqa %ymm9, 6368(%rsp)
+vpaddw %ymm10, %ymm3, %ymm3
+vpsubw 5984(%rsp), %ymm3, %ymm3
+vmovdqa %ymm3, 6176(%rsp)
+vmovdqa 6208(%rsp), %ymm4
+vpsubw 6400(%rsp), %ymm4, %ymm4
+vpaddw %ymm11, %ymm4, %ymm4
+vpsubw 6016(%rsp), %ymm4, %ymm4
+vmovdqa %ymm4, 6208(%rsp)
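+# Recombination for this level: vpsubw/vpaddw fold the low, high, and
+# cross-term partial products (scratch at 5888(%rsp)-9344(%rsp)) back into
+# the accumulated result held in the %r10 buffer.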
+vmovdqa 8352(%rsp), %ymm0
+vpsubw 8704(%rsp), %ymm0, %ymm0
+vmovdqa 6240(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9056(%rsp), %ymm1, %ymm6
+vpsubw 8000(%rsp), %ymm0, %ymm0
+vpaddw 5888(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8352(%rsp)
+vmovdqa 8384(%rsp), %ymm0
+vpsubw 8736(%rsp), %ymm0, %ymm0
+vmovdqa 6272(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9088(%rsp), %ymm1, %ymm7
+vpsubw 8032(%rsp), %ymm0, %ymm0
+vpaddw 5920(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8384(%rsp)
+vmovdqa 8416(%rsp), %ymm0
+vpsubw 8768(%rsp), %ymm0, %ymm0
+vmovdqa 6304(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9120(%rsp), %ymm1, %ymm8
+vpsubw 8064(%rsp), %ymm0, %ymm0
+vpaddw 5952(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8416(%rsp)
+vmovdqa 8448(%rsp), %ymm0
+vpsubw 8800(%rsp), %ymm0, %ymm0
+vmovdqa 6336(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9152(%rsp), %ymm1, %ymm9
+vpsubw 8096(%rsp), %ymm0, %ymm0
+vpaddw 5984(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8448(%rsp)
+vmovdqa 8480(%rsp), %ymm0
+vpsubw 8832(%rsp), %ymm0, %ymm0
+vmovdqa 6368(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9184(%rsp), %ymm1, %ymm10
+vpsubw 8128(%rsp), %ymm0, %ymm0
+vpaddw 6016(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8480(%rsp)
+vmovdqa 8512(%rsp), %ymm0
+vpsubw 8864(%rsp), %ymm0, %ymm0
+vmovdqa 6400(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9216(%rsp), %ymm1, %ymm11
+vpsubw 8160(%rsp), %ymm0, %ymm0
+vpaddw 6048(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8512(%rsp)
+vmovdqa 8544(%rsp), %ymm0
+vpsubw 8896(%rsp), %ymm0, %ymm0
+vmovdqa 6432(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9248(%rsp), %ymm1, %ymm12
+vpsubw 8192(%rsp), %ymm0, %ymm0
+vpaddw 6080(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8544(%rsp)
+vmovdqa 8576(%rsp), %ymm0
+vpsubw 8928(%rsp), %ymm0, %ymm0
+vmovdqa 6464(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9280(%rsp), %ymm1, %ymm13
+vpsubw 8224(%rsp), %ymm0, %ymm0
+vpaddw 6112(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8576(%rsp)
+vmovdqa 8608(%rsp), %ymm0
+vpsubw 8960(%rsp), %ymm0, %ymm0
+vmovdqa 6496(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9312(%rsp), %ymm1, %ymm14
+vpsubw 8256(%rsp), %ymm0, %ymm0
+vpaddw 6144(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8608(%rsp)
+vmovdqa 8640(%rsp), %ymm0
+vpsubw 8992(%rsp), %ymm0, %ymm0
+vmovdqa 6528(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9344(%rsp), %ymm1, %ymm15
+vpsubw 8288(%rsp), %ymm0, %ymm0
+vpaddw 6176(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8640(%rsp)
+vmovdqa 6208(%rsp), %ymm0
+vpsubw 8320(%rsp), %ymm0, %ymm0
+vpsubw 9024(%rsp), %ymm0, %ymm0
+vpsubw 3488(%r10), %ymm0, %ymm0
+vpsubw 4896(%r10), %ymm0, %ymm0
+vmovdqa %ymm0, 4192(%r10)
+vmovdqa 3520(%r10), %ymm0
+vpsubw 4224(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm6, %ymm6
+vpsubw 4928(%r10), %ymm6, %ymm6
+vpsubw 2816(%r10), %ymm0, %ymm0
+vpaddw 8000(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3520(%r10)
+vmovdqa %ymm6, 4224(%r10)
+vmovdqa 3552(%r10), %ymm0
+vpsubw 4256(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm7, %ymm7
+vpsubw 4960(%r10), %ymm7, %ymm7
+vpsubw 2848(%r10), %ymm0, %ymm0
+vpaddw 8032(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3552(%r10)
+vmovdqa %ymm7, 4256(%r10)
+vmovdqa 3584(%r10), %ymm0
+vpsubw 4288(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm8, %ymm8
+vpsubw 4992(%r10), %ymm8, %ymm8
+vpsubw 2880(%r10), %ymm0, %ymm0
+vpaddw 8064(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3584(%r10)
+vmovdqa %ymm8, 4288(%r10)
+vmovdqa 3616(%r10), %ymm0
+vpsubw 4320(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm9, %ymm9
+vpsubw 5024(%r10), %ymm9, %ymm9
+vpsubw 2912(%r10), %ymm0, %ymm0
+vpaddw 8096(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3616(%r10)
+vmovdqa %ymm9, 4320(%r10)
+vmovdqa 3648(%r10), %ymm0
+vpsubw 4352(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm10, %ymm10
+vpsubw 5056(%r10), %ymm10, %ymm10
+vpsubw 2944(%r10), %ymm0, %ymm0
+vpaddw 8128(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3648(%r10)
+vmovdqa %ymm10, 4352(%r10)
+vmovdqa 3680(%r10), %ymm0
+vpsubw 4384(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm11, %ymm11
+vpsubw 5088(%r10), %ymm11, %ymm11
+vpsubw 2976(%r10), %ymm0, %ymm0
+vpaddw 8160(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3680(%r10)
+vmovdqa %ymm11, 4384(%r10)
+vmovdqa 3712(%r10), %ymm0
+vpsubw 4416(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm12, %ymm12
+vpsubw 5120(%r10), %ymm12, %ymm12
+vpsubw 3008(%r10), %ymm0, %ymm0
+vpaddw 8192(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3712(%r10)
+vmovdqa %ymm12, 4416(%r10)
+vmovdqa 3744(%r10), %ymm0
+vpsubw 4448(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm13, %ymm13
+vpsubw 5152(%r10), %ymm13, %ymm13
+vpsubw 3040(%r10), %ymm0, %ymm0
+vpaddw 8224(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3744(%r10)
+vmovdqa %ymm13, 4448(%r10)
+vmovdqa 3776(%r10), %ymm0
+vpsubw 4480(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm14, %ymm14
+vpsubw 5184(%r10), %ymm14, %ymm14
+vpsubw 3072(%r10), %ymm0, %ymm0
+vpaddw 8256(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3776(%r10)
+vmovdqa %ymm14, 4480(%r10)
+vmovdqa 3808(%r10), %ymm0
+vpsubw 4512(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm15, %ymm15
+vpsubw 5216(%r10), %ymm15, %ymm15
+vpsubw 3104(%r10), %ymm0, %ymm0
+vpaddw 8288(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3808(%r10)
+vmovdqa %ymm15, 4512(%r10)
+vmovdqa 3840(%r10), %ymm0
+vpsubw 4544(%r10), %ymm0, %ymm0
+vmovdqa 9024(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5248(%r10), %ymm1, %ymm1
+vpsubw 3136(%r10), %ymm0, %ymm0
+vpaddw 8320(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3840(%r10)
+vmovdqa %ymm1, 4544(%r10)
+vmovdqa 3872(%r10), %ymm0
+vpsubw 4576(%r10), %ymm0, %ymm0
+vmovdqa 9056(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5280(%r10), %ymm1, %ymm1
+vpsubw 3168(%r10), %ymm0, %ymm0
+vpaddw 8352(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3872(%r10)
+vmovdqa %ymm1, 4576(%r10)
+vmovdqa 3904(%r10), %ymm0
+vpsubw 4608(%r10), %ymm0, %ymm0
+vmovdqa 9088(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5312(%r10), %ymm1, %ymm1
+vpsubw 3200(%r10), %ymm0, %ymm0
+vpaddw 8384(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3904(%r10)
+vmovdqa %ymm1, 4608(%r10)
+vmovdqa 3936(%r10), %ymm0
+vpsubw 4640(%r10), %ymm0, %ymm0
+vmovdqa 9120(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5344(%r10), %ymm1, %ymm1
+vpsubw 3232(%r10), %ymm0, %ymm0
+vpaddw 8416(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3936(%r10)
+vmovdqa %ymm1, 4640(%r10)
+vmovdqa 3968(%r10), %ymm0
+vpsubw 4672(%r10), %ymm0, %ymm0
+vmovdqa 9152(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5376(%r10), %ymm1, %ymm1
+vpsubw 3264(%r10), %ymm0, %ymm0
+vpaddw 8448(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3968(%r10)
+vmovdqa %ymm1, 4672(%r10)
+vmovdqa 4000(%r10), %ymm0
+vpsubw 4704(%r10), %ymm0, %ymm0
+vmovdqa 9184(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5408(%r10), %ymm1, %ymm1
+vpsubw 3296(%r10), %ymm0, %ymm0
+vpaddw 8480(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 4000(%r10)
+vmovdqa %ymm1, 4704(%r10)
+vmovdqa 4032(%r10), %ymm0
+vpsubw 4736(%r10), %ymm0, %ymm0
+vmovdqa 9216(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5440(%r10), %ymm1, %ymm1
+vpsubw 3328(%r10), %ymm0, %ymm0
+vpaddw 8512(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 4032(%r10)
+vmovdqa %ymm1, 4736(%r10)
+vmovdqa 4064(%r10), %ymm0
+vpsubw 4768(%r10), %ymm0, %ymm0
+vmovdqa 9248(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5472(%r10), %ymm1, %ymm1
+vpsubw 3360(%r10), %ymm0, %ymm0
+vpaddw 8544(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 4064(%r10)
+vmovdqa %ymm1, 4768(%r10)
+vmovdqa 4096(%r10), %ymm0
+vpsubw 4800(%r10), %ymm0, %ymm0
+vmovdqa 9280(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5504(%r10), %ymm1, %ymm1
+vpsubw 3392(%r10), %ymm0, %ymm0
+vpaddw 8576(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 4096(%r10)
+vmovdqa %ymm1, 4800(%r10)
+vmovdqa 4128(%r10), %ymm0
+vpsubw 4832(%r10), %ymm0, %ymm0
+vmovdqa 9312(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5536(%r10), %ymm1, %ymm1
+vpsubw 3424(%r10), %ymm0, %ymm0
+vpaddw 8608(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 4128(%r10)
+vmovdqa %ymm1, 4832(%r10)
+vmovdqa 4160(%r10), %ymm0
+vpsubw 4864(%r10), %ymm0, %ymm0
+vmovdqa 9344(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5568(%r10), %ymm1, %ymm1
+vpsubw 3456(%r10), %ymm0, %ymm0
+vpaddw 8640(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 4160(%r10)
+vmovdqa %ymm1, 4864(%r10)
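+# Final stage: clear the word group at 5600(%r10), reserve a 32-byte spill
+# slot (0(%rsp) holds a temporary during the shuffles below), then transpose
+# the 16-bit result words with vpunpck{l,h}wd/dq/qdq plus vinserti128/vpermq
+# from the stride-32 scratch layout in %r10 into the output buffer at %r12,
+# which uses a 192-byte stride between rows.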
+vpxor %ymm1, %ymm1, %ymm1
+vmovdqa %ymm1, 5600(%r10)
+subq $32, %rsp
+vmovdqa 2816(%r10), %ymm0
+vmovdqa 2880(%r10), %ymm1
+vmovdqa 2944(%r10), %ymm2
+vmovdqa 3008(%r10), %ymm3
+vpunpcklwd 2848(%r10), %ymm0, %ymm4
+vpunpckhwd 2848(%r10), %ymm0, %ymm5
+vpunpcklwd 2912(%r10), %ymm1, %ymm6
+vpunpckhwd 2912(%r10), %ymm1, %ymm7
+vpunpcklwd 2976(%r10), %ymm2, %ymm8
+vpunpckhwd 2976(%r10), %ymm2, %ymm9
+vpunpcklwd 3040(%r10), %ymm3, %ymm10
+vpunpckhwd 3040(%r10), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 3072(%r10), %ymm0
+vmovdqa 3136(%r10), %ymm1
+vmovdqa 3200(%r10), %ymm2
+vmovdqa 3264(%r10), %ymm3
+vpunpcklwd 3104(%r10), %ymm0, %ymm12
+vpunpckhwd 3104(%r10), %ymm0, %ymm13
+vpunpcklwd 3168(%r10), %ymm1, %ymm14
+vpunpckhwd 3168(%r10), %ymm1, %ymm15
+vpunpcklwd 3232(%r10), %ymm2, %ymm0
+vpunpckhwd 3232(%r10), %ymm2, %ymm1
+vpunpcklwd 3296(%r10), %ymm3, %ymm2
+vpunpckhwd 3296(%r10), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 0(%r12)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 192(%r12)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 384(%r12)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 576(%r12)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 768(%r12)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 960(%r12)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 1152(%r12)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 1536(%r12)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 1728(%r12)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 1920(%r12)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 2112(%r12)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 2304(%r12)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 2496(%r12)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 2688(%r12)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 1344(%r12)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 2880(%r12)
+vmovdqa 3328(%r10), %ymm0
+vmovdqa 3392(%r10), %ymm1
+vmovdqa 3456(%r10), %ymm2
+vmovdqa 3520(%r10), %ymm3
+vpunpcklwd 3360(%r10), %ymm0, %ymm4
+vpunpckhwd 3360(%r10), %ymm0, %ymm5
+vpunpcklwd 3424(%r10), %ymm1, %ymm6
+vpunpckhwd 3424(%r10), %ymm1, %ymm7
+vpunpcklwd 3488(%r10), %ymm2, %ymm8
+vpunpckhwd 3488(%r10), %ymm2, %ymm9
+vpunpcklwd 3552(%r10), %ymm3, %ymm10
+vpunpckhwd 3552(%r10), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 3584(%r10), %ymm0
+vmovdqa 3648(%r10), %ymm1
+vmovdqa 3712(%r10), %ymm2
+vmovdqa 3776(%r10), %ymm3
+vpunpcklwd 3616(%r10), %ymm0, %ymm12
+vpunpckhwd 3616(%r10), %ymm0, %ymm13
+vpunpcklwd 3680(%r10), %ymm1, %ymm14
+vpunpckhwd 3680(%r10), %ymm1, %ymm15
+vpunpcklwd 3744(%r10), %ymm2, %ymm0
+vpunpckhwd 3744(%r10), %ymm2, %ymm1
+vpunpcklwd 3808(%r10), %ymm3, %ymm2
+vpunpckhwd 3808(%r10), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 32(%r12)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 224(%r12)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 416(%r12)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 608(%r12)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 800(%r12)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 992(%r12)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 1184(%r12)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 1568(%r12)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 1760(%r12)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 1952(%r12)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 2144(%r12)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 2336(%r12)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 2528(%r12)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 2720(%r12)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 1376(%r12)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 2912(%r12)
+vmovdqa 3840(%r10), %ymm0
+vmovdqa 3904(%r10), %ymm1
+vmovdqa 3968(%r10), %ymm2
+vmovdqa 4032(%r10), %ymm3
+vpunpcklwd 3872(%r10), %ymm0, %ymm4
+vpunpckhwd 3872(%r10), %ymm0, %ymm5
+vpunpcklwd 3936(%r10), %ymm1, %ymm6
+vpunpckhwd 3936(%r10), %ymm1, %ymm7
+vpunpcklwd 4000(%r10), %ymm2, %ymm8
+vpunpckhwd 4000(%r10), %ymm2, %ymm9
+vpunpcklwd 4064(%r10), %ymm3, %ymm10
+vpunpckhwd 4064(%r10), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 4096(%r10), %ymm0
+vmovdqa 4160(%r10), %ymm1
+vmovdqa 4224(%r10), %ymm2
+vmovdqa 4288(%r10), %ymm3
+vpunpcklwd 4128(%r10), %ymm0, %ymm12
+vpunpckhwd 4128(%r10), %ymm0, %ymm13
+vpunpcklwd 4192(%r10), %ymm1, %ymm14
+vpunpckhwd 4192(%r10), %ymm1, %ymm15
+vpunpcklwd 4256(%r10), %ymm2, %ymm0
+vpunpckhwd 4256(%r10), %ymm2, %ymm1
+vpunpcklwd 4320(%r10), %ymm3, %ymm2
+vpunpckhwd 4320(%r10), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 64(%r12)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 256(%r12)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 448(%r12)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 640(%r12)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 832(%r12)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 1024(%r12)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 1216(%r12)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 1600(%r12)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 1792(%r12)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 1984(%r12)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 2176(%r12)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 2368(%r12)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 2560(%r12)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 2752(%r12)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 1408(%r12)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 2944(%r12)
+vmovdqa 4224(%r10), %ymm0
+vmovdqa 4288(%r10), %ymm1
+vmovdqa 4352(%r10), %ymm2
+vmovdqa 4416(%r10), %ymm3
+vpunpcklwd 4256(%r10), %ymm0, %ymm4
+vpunpckhwd 4256(%r10), %ymm0, %ymm5
+vpunpcklwd 4320(%r10), %ymm1, %ymm6
+vpunpckhwd 4320(%r10), %ymm1, %ymm7
+vpunpcklwd 4384(%r10), %ymm2, %ymm8
+vpunpckhwd 4384(%r10), %ymm2, %ymm9
+vpunpcklwd 4448(%r10), %ymm3, %ymm10
+vpunpckhwd 4448(%r10), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 4480(%r10), %ymm0
+vmovdqa 4544(%r10), %ymm1
+vmovdqa 4608(%r10), %ymm2
+vmovdqa 4672(%r10), %ymm3
+vpunpcklwd 4512(%r10), %ymm0, %ymm12
+vpunpckhwd 4512(%r10), %ymm0, %ymm13
+vpunpcklwd 4576(%r10), %ymm1, %ymm14
+vpunpckhwd 4576(%r10), %ymm1, %ymm15
+vpunpcklwd 4640(%r10), %ymm2, %ymm0
+vpunpckhwd 4640(%r10), %ymm2, %ymm1
+vpunpcklwd 4704(%r10), %ymm3, %ymm2
+vpunpckhwd 4704(%r10), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 96(%r12)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 288(%r12)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 480(%r12)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 672(%r12)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 864(%r12)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 1056(%r12)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 1248(%r12)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 1632(%r12)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 1824(%r12)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 2016(%r12)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 2208(%r12)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 2400(%r12)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 2592(%r12)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 2784(%r12)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 1440(%r12)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 2976(%r12)
+vmovdqa 4736(%r10), %ymm0
+vmovdqa 4800(%r10), %ymm1
+vmovdqa 4864(%r10), %ymm2
+vmovdqa 4928(%r10), %ymm3
+vpunpcklwd 4768(%r10), %ymm0, %ymm4
+vpunpckhwd 4768(%r10), %ymm0, %ymm5
+vpunpcklwd 4832(%r10), %ymm1, %ymm6
+vpunpckhwd 4832(%r10), %ymm1, %ymm7
+vpunpcklwd 4896(%r10), %ymm2, %ymm8
+vpunpckhwd 4896(%r10), %ymm2, %ymm9
+vpunpcklwd 4960(%r10), %ymm3, %ymm10
+vpunpckhwd 4960(%r10), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 4992(%r10), %ymm0
+vmovdqa 5056(%r10), %ymm1
+vmovdqa 5120(%r10), %ymm2
+vmovdqa 5184(%r10), %ymm3
+vpunpcklwd 5024(%r10), %ymm0, %ymm12
+vpunpckhwd 5024(%r10), %ymm0, %ymm13
+vpunpcklwd 5088(%r10), %ymm1, %ymm14
+vpunpckhwd 5088(%r10), %ymm1, %ymm15
+vpunpcklwd 5152(%r10), %ymm2, %ymm0
+vpunpckhwd 5152(%r10), %ymm2, %ymm1
+vpunpcklwd 5216(%r10), %ymm3, %ymm2
+vpunpckhwd 5216(%r10), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 128(%r12)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 320(%r12)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 512(%r12)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 704(%r12)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 896(%r12)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 1088(%r12)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 1280(%r12)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 1664(%r12)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 1856(%r12)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 2048(%r12)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 2240(%r12)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 2432(%r12)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 2624(%r12)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 2816(%r12)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 1472(%r12)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 3008(%r12)
+vmovdqa 5248(%r10), %ymm0
+vmovdqa 5312(%r10), %ymm1
+vmovdqa 5376(%r10), %ymm2
+vmovdqa 5440(%r10), %ymm3
+vpunpcklwd 5280(%r10), %ymm0, %ymm4
+vpunpckhwd 5280(%r10), %ymm0, %ymm5
+vpunpcklwd 5344(%r10), %ymm1, %ymm6
+vpunpckhwd 5344(%r10), %ymm1, %ymm7
+vpunpcklwd 5408(%r10), %ymm2, %ymm8
+vpunpckhwd 5408(%r10), %ymm2, %ymm9
+vpunpcklwd 5472(%r10), %ymm3, %ymm10
+vpunpckhwd 5472(%r10), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 5504(%r10), %ymm0
+vmovdqa 5568(%r10), %ymm1
+vmovdqa 5632(%r10), %ymm2
+vmovdqa 5696(%r10), %ymm3
+vpunpcklwd 5536(%r10), %ymm0, %ymm12
+vpunpckhwd 5536(%r10), %ymm0, %ymm13
+vpunpcklwd 5600(%r10), %ymm1, %ymm14
+vpunpckhwd 5600(%r10), %ymm1, %ymm15
+vpunpcklwd 5664(%r10), %ymm2, %ymm0
+vpunpckhwd 5664(%r10), %ymm2, %ymm1
+vpunpcklwd 5728(%r10), %ymm3, %ymm2
+vpunpckhwd 5728(%r10), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 160(%r12)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 352(%r12)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 544(%r12)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 736(%r12)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 928(%r12)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 1120(%r12)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 1312(%r12)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 1696(%r12)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 1888(%r12)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 2080(%r12)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 2272(%r12)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 2464(%r12)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 2656(%r12)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 2848(%r12)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 1504(%r12)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 3040(%r12)
+addq $32, %rsp
+add $1536, %rax
+add $1536, %r11
+add $3072, %r12
+dec %ecx
+jnz karatsuba_loop_4eced63f144beffcb0247f9c6f67d165
+sub $12288, %r12
+add $9408, %rsp
+subq $2400, %rsp
+vpxor %ymm0, %ymm0, %ymm0
+vmovdqa %ymm0, 1792(%rsp)
+vmovdqa %ymm0, 1824(%rsp)
+vmovdqa %ymm0, 1856(%rsp)
+vmovdqa %ymm0, 1888(%rsp)
+vmovdqa %ymm0, 1920(%rsp)
+vmovdqa %ymm0, 1952(%rsp)
+vmovdqa %ymm0, 1984(%rsp)
+vmovdqa %ymm0, 2016(%rsp)
+vmovdqa %ymm0, 2048(%rsp)
+vmovdqa %ymm0, 2080(%rsp)
+vmovdqa %ymm0, 2112(%rsp)
+vmovdqa %ymm0, 2144(%rsp)
+vmovdqa %ymm0, 2176(%rsp)
+vmovdqa %ymm0, 2208(%rsp)
+vmovdqa %ymm0, 2240(%rsp)
+vmovdqa %ymm0, 2272(%rsp)
+vmovdqa %ymm0, 2304(%rsp)
+vmovdqa %ymm0, 2336(%rsp)
+vmovdqa %ymm0, 2368(%rsp)
+vmovdqa %ymm0, 2400(%rsp)
+vmovdqa %ymm0, 2432(%rsp)
+vmovdqa %ymm0, 2464(%rsp)
+vmovdqa %ymm0, 2496(%rsp)
+vmovdqa %ymm0, 2528(%rsp)
+vmovdqa %ymm0, 2560(%rsp)
+vmovdqa %ymm0, 2592(%rsp)
+vmovdqa %ymm0, 2624(%rsp)
+vmovdqa %ymm0, 2656(%rsp)
+vmovdqa %ymm0, 2688(%rsp)
+vmovdqa %ymm0, 2720(%rsp)
+vmovdqa %ymm0, 2752(%rsp)
+vmovdqa %ymm0, 2784(%rsp)
+vmovdqa const729(%rip), %ymm15
+vmovdqa const3_inv(%rip), %ymm14
+vmovdqa const5_inv(%rip), %ymm13
+vmovdqa const9(%rip), %ymm12
+vmovdqa 96(%r12), %ymm0
+vpsubw 192(%r12), %ymm0, %ymm0
+vmovdqa 480(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 288(%r12), %ymm1, %ymm1
+vpsubw 0(%r12), %ymm0, %ymm0
+vpaddw 384(%r12), %ymm0, %ymm0
+vmovdqa 672(%r12), %ymm2
+vpsubw 768(%r12), %ymm2, %ymm2
+vmovdqa 1056(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 864(%r12), %ymm3, %ymm3
+vpsubw 576(%r12), %ymm2, %ymm2
+vpaddw 960(%r12), %ymm2, %ymm2
+vmovdqa 1248(%r12), %ymm4
+vpsubw 1344(%r12), %ymm4, %ymm4
+vmovdqa 1632(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 1440(%r12), %ymm5, %ymm5
+vpsubw 1152(%r12), %ymm4, %ymm4
+vpaddw 1536(%r12), %ymm4, %ymm4
+vpsubw 576(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 0(%r12), %ymm1, %ymm1
+vpaddw 1152(%r12), %ymm1, %ymm1
+vmovdqa 288(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 1440(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 864(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 0(%r12), %ymm8
+vmovdqa 864(%r12), %ymm9
+vmovdqa %ymm8, 0(%rsp)
+vmovdqa %ymm0, 32(%rsp)
+vmovdqa %ymm1, 64(%rsp)
+vmovdqa %ymm7, 96(%rsp)
+vmovdqa %ymm5, 128(%rsp)
+vmovdqa %ymm2, 160(%rsp)
+vmovdqa %ymm3, 192(%rsp)
+vmovdqa %ymm9, 224(%rsp)
+vmovdqa 1824(%r12), %ymm0
+vpsubw 1920(%r12), %ymm0, %ymm0
+vmovdqa 2208(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 2016(%r12), %ymm1, %ymm1
+vpsubw 1728(%r12), %ymm0, %ymm0
+vpaddw 2112(%r12), %ymm0, %ymm0
+vmovdqa 2400(%r12), %ymm2
+vpsubw 2496(%r12), %ymm2, %ymm2
+vmovdqa 2784(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 2592(%r12), %ymm3, %ymm3
+vpsubw 2304(%r12), %ymm2, %ymm2
+vpaddw 2688(%r12), %ymm2, %ymm2
+vmovdqa 2976(%r12), %ymm4
+vpsubw 3072(%r12), %ymm4, %ymm4
+vmovdqa 3360(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 3168(%r12), %ymm5, %ymm5
+vpsubw 2880(%r12), %ymm4, %ymm4
+vpaddw 3264(%r12), %ymm4, %ymm4
+vpsubw 2304(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 1728(%r12), %ymm1, %ymm1
+vpaddw 2880(%r12), %ymm1, %ymm1
+vmovdqa 2016(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 3168(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 2592(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 1728(%r12), %ymm8
+vmovdqa 2592(%r12), %ymm9
+vmovdqa %ymm8, 256(%rsp)
+vmovdqa %ymm0, 288(%rsp)
+vmovdqa %ymm1, 320(%rsp)
+vmovdqa %ymm7, 352(%rsp)
+vmovdqa %ymm5, 384(%rsp)
+vmovdqa %ymm2, 416(%rsp)
+vmovdqa %ymm3, 448(%rsp)
+vmovdqa %ymm9, 480(%rsp)
+vmovdqa 3552(%r12), %ymm0
+vpsubw 3648(%r12), %ymm0, %ymm0
+vmovdqa 3936(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 3744(%r12), %ymm1, %ymm1
+vpsubw 3456(%r12), %ymm0, %ymm0
+vpaddw 3840(%r12), %ymm0, %ymm0
+vmovdqa 4128(%r12), %ymm2
+vpsubw 4224(%r12), %ymm2, %ymm2
+vmovdqa 4512(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 4320(%r12), %ymm3, %ymm3
+vpsubw 4032(%r12), %ymm2, %ymm2
+vpaddw 4416(%r12), %ymm2, %ymm2
+vmovdqa 4704(%r12), %ymm4
+vpsubw 4800(%r12), %ymm4, %ymm4
+vmovdqa 5088(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 4896(%r12), %ymm5, %ymm5
+vpsubw 4608(%r12), %ymm4, %ymm4
+vpaddw 4992(%r12), %ymm4, %ymm4
+vpsubw 4032(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 3456(%r12), %ymm1, %ymm1
+vpaddw 4608(%r12), %ymm1, %ymm1
+vmovdqa 3744(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 4896(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 4320(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 3456(%r12), %ymm8
+vmovdqa 4320(%r12), %ymm9
+vmovdqa %ymm8, 512(%rsp)
+vmovdqa %ymm0, 544(%rsp)
+vmovdqa %ymm1, 576(%rsp)
+vmovdqa %ymm7, 608(%rsp)
+vmovdqa %ymm5, 640(%rsp)
+vmovdqa %ymm2, 672(%rsp)
+vmovdqa %ymm3, 704(%rsp)
+vmovdqa %ymm9, 736(%rsp)
+vmovdqa 5280(%r12), %ymm0
+vpsubw 5376(%r12), %ymm0, %ymm0
+vmovdqa 5664(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5472(%r12), %ymm1, %ymm1
+vpsubw 5184(%r12), %ymm0, %ymm0
+vpaddw 5568(%r12), %ymm0, %ymm0
+vmovdqa 5856(%r12), %ymm2
+vpsubw 5952(%r12), %ymm2, %ymm2
+vmovdqa 6240(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 6048(%r12), %ymm3, %ymm3
+vpsubw 5760(%r12), %ymm2, %ymm2
+vpaddw 6144(%r12), %ymm2, %ymm2
+vmovdqa 6432(%r12), %ymm4
+vpsubw 6528(%r12), %ymm4, %ymm4
+vmovdqa 6816(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 6624(%r12), %ymm5, %ymm5
+vpsubw 6336(%r12), %ymm4, %ymm4
+vpaddw 6720(%r12), %ymm4, %ymm4
+vpsubw 5760(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 5184(%r12), %ymm1, %ymm1
+vpaddw 6336(%r12), %ymm1, %ymm1
+vmovdqa 5472(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 6624(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 6048(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 5184(%r12), %ymm8
+vmovdqa 6048(%r12), %ymm9
+vmovdqa %ymm8, 768(%rsp)
+vmovdqa %ymm0, 800(%rsp)
+vmovdqa %ymm1, 832(%rsp)
+vmovdqa %ymm7, 864(%rsp)
+vmovdqa %ymm5, 896(%rsp)
+vmovdqa %ymm2, 928(%rsp)
+vmovdqa %ymm3, 960(%rsp)
+vmovdqa %ymm9, 992(%rsp)
+vmovdqa 7008(%r12), %ymm0
+vpsubw 7104(%r12), %ymm0, %ymm0
+vmovdqa 7392(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 7200(%r12), %ymm1, %ymm1
+vpsubw 6912(%r12), %ymm0, %ymm0
+vpaddw 7296(%r12), %ymm0, %ymm0
+vmovdqa 7584(%r12), %ymm2
+vpsubw 7680(%r12), %ymm2, %ymm2
+vmovdqa 7968(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 7776(%r12), %ymm3, %ymm3
+vpsubw 7488(%r12), %ymm2, %ymm2
+vpaddw 7872(%r12), %ymm2, %ymm2
+vmovdqa 8160(%r12), %ymm4
+vpsubw 8256(%r12), %ymm4, %ymm4
+vmovdqa 8544(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 8352(%r12), %ymm5, %ymm5
+vpsubw 8064(%r12), %ymm4, %ymm4
+vpaddw 8448(%r12), %ymm4, %ymm4
+vpsubw 7488(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 6912(%r12), %ymm1, %ymm1
+vpaddw 8064(%r12), %ymm1, %ymm1
+vmovdqa 7200(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 8352(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 7776(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 6912(%r12), %ymm8
+vmovdqa 7776(%r12), %ymm9
+vmovdqa %ymm8, 1024(%rsp)
+vmovdqa %ymm0, 1056(%rsp)
+vmovdqa %ymm1, 1088(%rsp)
+vmovdqa %ymm7, 1120(%rsp)
+vmovdqa %ymm5, 1152(%rsp)
+vmovdqa %ymm2, 1184(%rsp)
+vmovdqa %ymm3, 1216(%rsp)
+vmovdqa %ymm9, 1248(%rsp)
+vmovdqa 8736(%r12), %ymm0
+vpsubw 8832(%r12), %ymm0, %ymm0
+vmovdqa 9120(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 8928(%r12), %ymm1, %ymm1
+vpsubw 8640(%r12), %ymm0, %ymm0
+vpaddw 9024(%r12), %ymm0, %ymm0
+vmovdqa 9312(%r12), %ymm2
+vpsubw 9408(%r12), %ymm2, %ymm2
+vmovdqa 9696(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 9504(%r12), %ymm3, %ymm3
+vpsubw 9216(%r12), %ymm2, %ymm2
+vpaddw 9600(%r12), %ymm2, %ymm2
+vmovdqa 9888(%r12), %ymm4
+vpsubw 9984(%r12), %ymm4, %ymm4
+vmovdqa 10272(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 10080(%r12), %ymm5, %ymm5
+vpsubw 9792(%r12), %ymm4, %ymm4
+vpaddw 10176(%r12), %ymm4, %ymm4
+vpsubw 9216(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 8640(%r12), %ymm1, %ymm1
+vpaddw 9792(%r12), %ymm1, %ymm1
+vmovdqa 8928(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 10080(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 9504(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 8640(%r12), %ymm8
+vmovdqa 9504(%r12), %ymm9
+vmovdqa %ymm8, 1280(%rsp)
+vmovdqa %ymm0, 1312(%rsp)
+vmovdqa %ymm1, 1344(%rsp)
+vmovdqa %ymm7, 1376(%rsp)
+vmovdqa %ymm5, 1408(%rsp)
+vmovdqa %ymm2, 1440(%rsp)
+vmovdqa %ymm3, 1472(%rsp)
+vmovdqa %ymm9, 1504(%rsp)
+vmovdqa 10464(%r12), %ymm0
+vpsubw 10560(%r12), %ymm0, %ymm0
+vmovdqa 10848(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 10656(%r12), %ymm1, %ymm1
+vpsubw 10368(%r12), %ymm0, %ymm0
+vpaddw 10752(%r12), %ymm0, %ymm0
+vmovdqa 11040(%r12), %ymm2
+vpsubw 11136(%r12), %ymm2, %ymm2
+vmovdqa 11424(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 11232(%r12), %ymm3, %ymm3
+vpsubw 10944(%r12), %ymm2, %ymm2
+vpaddw 11328(%r12), %ymm2, %ymm2
+vmovdqa 11616(%r12), %ymm4
+vpsubw 11712(%r12), %ymm4, %ymm4
+vmovdqa 12000(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 11808(%r12), %ymm5, %ymm5
+vpsubw 11520(%r12), %ymm4, %ymm4
+vpaddw 11904(%r12), %ymm4, %ymm4
+vpsubw 10944(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 10368(%r12), %ymm1, %ymm1
+vpaddw 11520(%r12), %ymm1, %ymm1
+vmovdqa 10656(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 11808(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 11232(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 10368(%r12), %ymm8
+vmovdqa 11232(%r12), %ymm9
+vmovdqa %ymm8, 1536(%rsp)
+vmovdqa %ymm0, 1568(%rsp)
+vmovdqa %ymm1, 1600(%rsp)
+vmovdqa %ymm7, 1632(%rsp)
+vmovdqa %ymm5, 1664(%rsp)
+vmovdqa %ymm2, 1696(%rsp)
+vmovdqa %ymm3, 1728(%rsp)
+vmovdqa %ymm9, 1760(%rsp)
+vmovdqa 0(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm10
+vpunpckhwd const0(%rip), %ymm11, %ymm9
+vpslld $1, %ymm10, %ymm10
+vpslld $1, %ymm9, %ymm9
+vmovdqa 256(%rsp), %ymm8
+vpunpcklwd const0(%rip), %ymm8, %ymm7
+vpunpckhwd const0(%rip), %ymm8, %ymm8
+vmovdqa 512(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm7, %ymm4
+vpaddd %ymm6, %ymm8, %ymm3
+vpsubd %ymm10, %ymm4, %ymm4
+vpsubd %ymm9, %ymm3, %ymm3
+vpsubd %ymm5, %ymm7, %ymm5
+vpsubd %ymm6, %ymm8, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1536(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm8
+vpunpckhwd const0(%rip), %ymm5, %ymm7
+vpslld $1, %ymm8, %ymm8
+vpslld $1, %ymm7, %ymm7
+vpsubd %ymm8, %ymm4, %ymm4
+vpsubd %ymm7, %ymm3, %ymm3
+vpsrld $1, %ymm4, %ymm4
+vpsrld $1, %ymm3, %ymm3
+vpand mask32_to_16(%rip), %ymm4, %ymm4
+vpand mask32_to_16(%rip), %ymm3, %ymm3
+vpackusdw %ymm3, %ymm4, %ymm3
+vmovdqa 768(%rsp), %ymm4
+vpaddw 1024(%rsp), %ymm4, %ymm7
+vpsubw 1024(%rsp), %ymm4, %ymm4
+vpsrlw $2, %ymm4, %ymm4
+vpsubw %ymm6, %ymm4, %ymm4
+vpmullw %ymm14, %ymm4, %ymm4
+vpsllw $1, %ymm11, %ymm8
+vpsubw %ymm8, %ymm7, %ymm8
+vpsllw $7, %ymm5, %ymm7
+vpsubw %ymm7, %ymm8, %ymm7
+vpsrlw $3, %ymm7, %ymm7
+vpsubw %ymm3, %ymm7, %ymm7
+vmovdqa 1280(%rsp), %ymm8
+vpsubw %ymm11, %ymm8, %ymm8
+vpmullw %ymm15, %ymm5, %ymm9
+vpsubw %ymm9, %ymm8, %ymm9
+vpmullw %ymm14, %ymm7, %ymm7
+vpsubw %ymm7, %ymm3, %ymm3
+vpmullw %ymm12, %ymm7, %ymm8
+vpaddw %ymm8, %ymm3, %ymm8
+vpmullw %ymm12, %ymm8, %ymm8
+vpsubw %ymm8, %ymm9, %ymm8
+vpmullw %ymm14, %ymm8, %ymm8
+vpsubw %ymm6, %ymm8, %ymm8
+vpsrlw $3, %ymm8, %ymm8
+vpsubw %ymm4, %ymm8, %ymm8
+vpsubw %ymm8, %ymm4, %ymm4
+vpsubw %ymm4, %ymm6, %ymm6
+vpmullw %ymm13, %ymm8, %ymm8
+vpsubw %ymm8, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm7, %ymm7
+vpand mask3_5_3_5(%rip), %ymm7, %ymm9
+vpand mask5_3_5_3(%rip), %ymm7, %ymm7
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm10
+vpor %ymm10, %ymm7, %ymm7
+vpaddw %ymm7, %ymm11, %ymm11
+vmovdqa %xmm9, 2048(%rsp)
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_3_5(%rip), %ymm8, %ymm9
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm10
+vpor %ymm10, %ymm8, %ymm8
+vpaddw %ymm8, %ymm6, %ymm6
+vmovdqa %xmm9, 2304(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_3_5(%rip), %ymm5, %ymm9
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm10
+vpor %ymm10, %ymm5, %ymm5
+vpaddw %ymm5, %ymm3, %ymm3
+vmovdqa %xmm9, 2560(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 0(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 352(%rdi)
+vpand mask_mod8192(%rip), %ymm3, %ymm3
+vmovdqu %ymm3, 704(%rdi)
+vpand mask_mod8192(%rip), %ymm4, %ymm4
+vmovdqu %ymm4, 1056(%rdi)
+vmovdqa 32(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm8
+vpunpckhwd const0(%rip), %ymm5, %ymm7
+vpslld $1, %ymm8, %ymm8
+vpslld $1, %ymm7, %ymm7
+vmovdqa 288(%rsp), %ymm4
+vpunpcklwd const0(%rip), %ymm4, %ymm3
+vpunpckhwd const0(%rip), %ymm4, %ymm4
+vmovdqa 544(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm3, %ymm9
+vpaddd %ymm6, %ymm4, %ymm10
+vpsubd %ymm8, %ymm9, %ymm9
+vpsubd %ymm7, %ymm10, %ymm10
+vpsubd %ymm11, %ymm3, %ymm11
+vpsubd %ymm6, %ymm4, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1568(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm4
+vpunpckhwd const0(%rip), %ymm11, %ymm3
+vpslld $1, %ymm4, %ymm4
+vpslld $1, %ymm3, %ymm3
+vpsubd %ymm4, %ymm9, %ymm9
+vpsubd %ymm3, %ymm10, %ymm10
+vpsrld $1, %ymm9, %ymm9
+vpsrld $1, %ymm10, %ymm10
+vpand mask32_to_16(%rip), %ymm9, %ymm9
+vpand mask32_to_16(%rip), %ymm10, %ymm10
+vpackusdw %ymm10, %ymm9, %ymm10
+vmovdqa 800(%rsp), %ymm9
+vpaddw 1056(%rsp), %ymm9, %ymm3
+vpsubw 1056(%rsp), %ymm9, %ymm9
+vpsrlw $2, %ymm9, %ymm9
+vpsubw %ymm6, %ymm9, %ymm9
+vpmullw %ymm14, %ymm9, %ymm9
+vpsllw $1, %ymm5, %ymm4
+vpsubw %ymm4, %ymm3, %ymm4
+vpsllw $7, %ymm11, %ymm3
+vpsubw %ymm3, %ymm4, %ymm3
+vpsrlw $3, %ymm3, %ymm3
+vpsubw %ymm10, %ymm3, %ymm3
+vmovdqa 1312(%rsp), %ymm4
+vpsubw %ymm5, %ymm4, %ymm4
+vpmullw %ymm15, %ymm11, %ymm7
+vpsubw %ymm7, %ymm4, %ymm7
+vpmullw %ymm14, %ymm3, %ymm3
+vpsubw %ymm3, %ymm10, %ymm10
+vpmullw %ymm12, %ymm3, %ymm4
+vpaddw %ymm4, %ymm10, %ymm4
+vpmullw %ymm12, %ymm4, %ymm4
+vpsubw %ymm4, %ymm7, %ymm4
+vpmullw %ymm14, %ymm4, %ymm4
+vpsubw %ymm6, %ymm4, %ymm4
+vpsrlw $3, %ymm4, %ymm4
+vpsubw %ymm9, %ymm4, %ymm4
+vpsubw %ymm4, %ymm9, %ymm9
+vpsubw %ymm9, %ymm6, %ymm6
+vpmullw %ymm13, %ymm4, %ymm4
+vpsubw %ymm4, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm3, %ymm3
+vpand mask3_5_3_5(%rip), %ymm3, %ymm7
+vpand mask5_3_5_3(%rip), %ymm3, %ymm3
+vpermq $206, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm8
+vpor %ymm8, %ymm3, %ymm3
+vpaddw %ymm3, %ymm5, %ymm5
+vmovdqa %xmm7, 2080(%rsp)
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_3_5(%rip), %ymm4, %ymm7
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $206, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm8
+vpor %ymm8, %ymm4, %ymm4
+vpaddw %ymm4, %ymm6, %ymm6
+vmovdqa %xmm7, 2336(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_3_5(%rip), %ymm11, %ymm7
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $206, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm8
+vpor %ymm8, %ymm11, %ymm11
+vpaddw %ymm11, %ymm10, %ymm10
+vmovdqa %xmm7, 2592(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %ymm5, 88(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 440(%rdi)
+vpand mask_mod8192(%rip), %ymm10, %ymm10
+vmovdqu %ymm10, 792(%rdi)
+vpand mask_mod8192(%rip), %ymm9, %ymm9
+vmovdqu %ymm9, 1144(%rdi)
+vmovdqa 64(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm4
+vpunpckhwd const0(%rip), %ymm11, %ymm3
+vpslld $1, %ymm4, %ymm4
+vpslld $1, %ymm3, %ymm3
+vmovdqa 320(%rsp), %ymm9
+vpunpcklwd const0(%rip), %ymm9, %ymm10
+vpunpckhwd const0(%rip), %ymm9, %ymm9
+vmovdqa 576(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm10, %ymm7
+vpaddd %ymm6, %ymm9, %ymm8
+vpsubd %ymm4, %ymm7, %ymm7
+vpsubd %ymm3, %ymm8, %ymm8
+vpsubd %ymm5, %ymm10, %ymm5
+vpsubd %ymm6, %ymm9, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1600(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm9
+vpunpckhwd const0(%rip), %ymm5, %ymm10
+vpslld $1, %ymm9, %ymm9
+vpslld $1, %ymm10, %ymm10
+vpsubd %ymm9, %ymm7, %ymm7
+vpsubd %ymm10, %ymm8, %ymm8
+vpsrld $1, %ymm7, %ymm7
+vpsrld $1, %ymm8, %ymm8
+vpand mask32_to_16(%rip), %ymm7, %ymm7
+vpand mask32_to_16(%rip), %ymm8, %ymm8
+vpackusdw %ymm8, %ymm7, %ymm8
+vmovdqa 832(%rsp), %ymm7
+vpaddw 1088(%rsp), %ymm7, %ymm10
+vpsubw 1088(%rsp), %ymm7, %ymm7
+vpsrlw $2, %ymm7, %ymm7
+vpsubw %ymm6, %ymm7, %ymm7
+vpmullw %ymm14, %ymm7, %ymm7
+vpsllw $1, %ymm11, %ymm9
+vpsubw %ymm9, %ymm10, %ymm9
+vpsllw $7, %ymm5, %ymm10
+vpsubw %ymm10, %ymm9, %ymm10
+vpsrlw $3, %ymm10, %ymm10
+vpsubw %ymm8, %ymm10, %ymm10
+vmovdqa 1344(%rsp), %ymm9
+vpsubw %ymm11, %ymm9, %ymm9
+vpmullw %ymm15, %ymm5, %ymm3
+vpsubw %ymm3, %ymm9, %ymm3
+vpmullw %ymm14, %ymm10, %ymm10
+vpsubw %ymm10, %ymm8, %ymm8
+vpmullw %ymm12, %ymm10, %ymm9
+vpaddw %ymm9, %ymm8, %ymm9
+vpmullw %ymm12, %ymm9, %ymm9
+vpsubw %ymm9, %ymm3, %ymm9
+vpmullw %ymm14, %ymm9, %ymm9
+vpsubw %ymm6, %ymm9, %ymm9
+vpsrlw $3, %ymm9, %ymm9
+vpsubw %ymm7, %ymm9, %ymm9
+vpsubw %ymm9, %ymm7, %ymm7
+vpsubw %ymm7, %ymm6, %ymm6
+vpmullw %ymm13, %ymm9, %ymm9
+vpsubw %ymm9, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm10, %ymm10
+vpand mask3_5_3_5(%rip), %ymm10, %ymm3
+vpand mask5_3_5_3(%rip), %ymm10, %ymm10
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm4
+vpor %ymm4, %ymm10, %ymm10
+vpaddw %ymm10, %ymm11, %ymm11
+vmovdqa %xmm3, 2112(%rsp)
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_3_5(%rip), %ymm9, %ymm3
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm4
+vpor %ymm4, %ymm9, %ymm9
+vpaddw %ymm9, %ymm6, %ymm6
+vmovdqa %xmm3, 2368(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_3_5(%rip), %ymm5, %ymm3
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm4
+vpor %ymm4, %ymm5, %ymm5
+vpaddw %ymm5, %ymm8, %ymm8
+vmovdqa %xmm3, 2624(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 176(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 528(%rdi)
+vpand mask_mod8192(%rip), %ymm8, %ymm8
+vmovdqu %ymm8, 880(%rdi)
+vpand mask_mod8192(%rip), %ymm7, %ymm7
+vmovdqu %ymm7, 1232(%rdi)
+vmovdqa 96(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm9
+vpunpckhwd const0(%rip), %ymm5, %ymm10
+vpslld $1, %ymm9, %ymm9
+vpslld $1, %ymm10, %ymm10
+vmovdqa 352(%rsp), %ymm7
+vpunpcklwd const0(%rip), %ymm7, %ymm8
+vpunpckhwd const0(%rip), %ymm7, %ymm7
+vmovdqa 608(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm8, %ymm3
+vpaddd %ymm6, %ymm7, %ymm4
+vpsubd %ymm9, %ymm3, %ymm3
+vpsubd %ymm10, %ymm4, %ymm4
+vpsubd %ymm11, %ymm8, %ymm11
+vpsubd %ymm6, %ymm7, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1632(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm7
+vpunpckhwd const0(%rip), %ymm11, %ymm8
+vpslld $1, %ymm7, %ymm7
+vpslld $1, %ymm8, %ymm8
+vpsubd %ymm7, %ymm3, %ymm3
+vpsubd %ymm8, %ymm4, %ymm4
+vpsrld $1, %ymm3, %ymm3
+vpsrld $1, %ymm4, %ymm4
+vpand mask32_to_16(%rip), %ymm3, %ymm3
+vpand mask32_to_16(%rip), %ymm4, %ymm4
+vpackusdw %ymm4, %ymm3, %ymm4
+vmovdqa 864(%rsp), %ymm3
+vpaddw 1120(%rsp), %ymm3, %ymm8
+vpsubw 1120(%rsp), %ymm3, %ymm3
+vpsrlw $2, %ymm3, %ymm3
+vpsubw %ymm6, %ymm3, %ymm3
+vpmullw %ymm14, %ymm3, %ymm3
+vpsllw $1, %ymm5, %ymm7
+vpsubw %ymm7, %ymm8, %ymm7
+vpsllw $7, %ymm11, %ymm8
+vpsubw %ymm8, %ymm7, %ymm8
+vpsrlw $3, %ymm8, %ymm8
+vpsubw %ymm4, %ymm8, %ymm8
+vmovdqa 1376(%rsp), %ymm7
+vpsubw %ymm5, %ymm7, %ymm7
+vpmullw %ymm15, %ymm11, %ymm10
+vpsubw %ymm10, %ymm7, %ymm10
+vpmullw %ymm14, %ymm8, %ymm8
+vpsubw %ymm8, %ymm4, %ymm4
+vpmullw %ymm12, %ymm8, %ymm7
+vpaddw %ymm7, %ymm4, %ymm7
+vpmullw %ymm12, %ymm7, %ymm7
+vpsubw %ymm7, %ymm10, %ymm7
+vpmullw %ymm14, %ymm7, %ymm7
+vpsubw %ymm6, %ymm7, %ymm7
+vpsrlw $3, %ymm7, %ymm7
+vpsubw %ymm3, %ymm7, %ymm7
+vpsubw %ymm7, %ymm3, %ymm3
+vpsubw %ymm3, %ymm6, %ymm6
+vpmullw %ymm13, %ymm7, %ymm7
+vpsubw %ymm7, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_3_5(%rip), %ymm8, %ymm10
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $206, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm9
+vpor %ymm9, %ymm8, %ymm8
+vpaddw %ymm8, %ymm5, %ymm5
+vmovdqa %xmm10, 2144(%rsp)
+vpshufb shuf48_16(%rip), %ymm7, %ymm7
+vpand mask3_5_3_5(%rip), %ymm7, %ymm10
+vpand mask5_3_5_3(%rip), %ymm7, %ymm7
+vpermq $206, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm9
+vpor %ymm9, %ymm7, %ymm7
+vpaddw %ymm7, %ymm6, %ymm6
+vmovdqa %xmm10, 2400(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_3_5(%rip), %ymm11, %ymm10
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $206, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm9
+vpor %ymm9, %ymm11, %ymm11
+vpaddw %ymm11, %ymm4, %ymm4
+vmovdqa %xmm10, 2656(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %ymm5, 264(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 616(%rdi)
+vpand mask_mod8192(%rip), %ymm4, %ymm4
+vmovdqu %ymm4, 968(%rdi)
+vpand mask_mod8192(%rip), %ymm3, %ymm3
+vmovdqu %ymm3, 1320(%rdi)
+vmovdqa 128(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm7
+vpunpckhwd const0(%rip), %ymm11, %ymm8
+vpslld $1, %ymm7, %ymm7
+vpslld $1, %ymm8, %ymm8
+vmovdqa 384(%rsp), %ymm3
+vpunpcklwd const0(%rip), %ymm3, %ymm4
+vpunpckhwd const0(%rip), %ymm3, %ymm3
+vmovdqa 640(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm4, %ymm10
+vpaddd %ymm6, %ymm3, %ymm9
+vpsubd %ymm7, %ymm10, %ymm10
+vpsubd %ymm8, %ymm9, %ymm9
+vpsubd %ymm5, %ymm4, %ymm5
+vpsubd %ymm6, %ymm3, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1664(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm3
+vpunpckhwd const0(%rip), %ymm5, %ymm4
+vpslld $1, %ymm3, %ymm3
+vpslld $1, %ymm4, %ymm4
+vpsubd %ymm3, %ymm10, %ymm10
+vpsubd %ymm4, %ymm9, %ymm9
+vpsrld $1, %ymm10, %ymm10
+vpsrld $1, %ymm9, %ymm9
+vpand mask32_to_16(%rip), %ymm10, %ymm10
+vpand mask32_to_16(%rip), %ymm9, %ymm9
+vpackusdw %ymm9, %ymm10, %ymm9
+vmovdqa 896(%rsp), %ymm10
+vpaddw 1152(%rsp), %ymm10, %ymm4
+vpsubw 1152(%rsp), %ymm10, %ymm10
+vpsrlw $2, %ymm10, %ymm10
+vpsubw %ymm6, %ymm10, %ymm10
+vpmullw %ymm14, %ymm10, %ymm10
+vpsllw $1, %ymm11, %ymm3
+vpsubw %ymm3, %ymm4, %ymm3
+vpsllw $7, %ymm5, %ymm4
+vpsubw %ymm4, %ymm3, %ymm4
+vpsrlw $3, %ymm4, %ymm4
+vpsubw %ymm9, %ymm4, %ymm4
+vmovdqa 1408(%rsp), %ymm3
+vpsubw %ymm11, %ymm3, %ymm3
+vpmullw %ymm15, %ymm5, %ymm8
+vpsubw %ymm8, %ymm3, %ymm8
+vpmullw %ymm14, %ymm4, %ymm4
+vpsubw %ymm4, %ymm9, %ymm9
+vpmullw %ymm12, %ymm4, %ymm3
+vpaddw %ymm3, %ymm9, %ymm3
+vpmullw %ymm12, %ymm3, %ymm3
+vpsubw %ymm3, %ymm8, %ymm3
+vpmullw %ymm14, %ymm3, %ymm3
+vpsubw %ymm6, %ymm3, %ymm3
+vpsrlw $3, %ymm3, %ymm3
+vpsubw %ymm10, %ymm3, %ymm3
+vpsubw %ymm3, %ymm10, %ymm10
+vpsubw %ymm10, %ymm6, %ymm6
+vpmullw %ymm13, %ymm3, %ymm3
+vpsubw %ymm3, %ymm6, %ymm6
+vmovdqu 352(%rdi), %ymm8
+vmovdqu 704(%rdi), %ymm7
+vmovdqu 1056(%rdi), %ymm2
+vpaddw %ymm11, %ymm8, %ymm11
+vpaddw %ymm6, %ymm7, %ymm6
+vpaddw %ymm9, %ymm2, %ymm9
+vpshufb shuf48_16(%rip), %ymm10, %ymm10
+vpand mask3_5_3_5(%rip), %ymm10, %ymm2
+vpand mask5_3_5_3(%rip), %ymm10, %ymm10
+vpermq $206, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm7
+vpor %ymm7, %ymm10, %ymm10
+vmovdqu 0(%rdi), %ymm7
+vpaddw %ymm10, %ymm7, %ymm7
+vpand mask_mod8192(%rip), %ymm7, %ymm7
+vmovdqu %ymm7, 0(%rdi)
+vmovdqa %xmm2, 1920(%rsp)
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_3_5(%rip), %ymm4, %ymm2
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $206, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm7
+vpor %ymm7, %ymm4, %ymm4
+vpaddw %ymm4, %ymm11, %ymm11
+vmovdqa %xmm2, 2176(%rsp)
+vpshufb shuf48_16(%rip), %ymm3, %ymm3
+vpand mask3_5_3_5(%rip), %ymm3, %ymm2
+vpand mask5_3_5_3(%rip), %ymm3, %ymm3
+vpermq $206, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm7
+vpor %ymm7, %ymm3, %ymm3
+vpaddw %ymm3, %ymm6, %ymm6
+vmovdqa %xmm2, 2432(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_3_5(%rip), %ymm5, %ymm2
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $206, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm7
+vpor %ymm7, %ymm5, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vmovdqa %xmm2, 2688(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 352(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 704(%rdi)
+vpand mask_mod8192(%rip), %ymm9, %ymm9
+vmovdqu %ymm9, 1056(%rdi)
+vmovdqa 160(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm3
+vpunpckhwd const0(%rip), %ymm5, %ymm4
+vpslld $1, %ymm3, %ymm3
+vpslld $1, %ymm4, %ymm4
+vmovdqa 416(%rsp), %ymm10
+vpunpcklwd const0(%rip), %ymm10, %ymm9
+vpunpckhwd const0(%rip), %ymm10, %ymm10
+vmovdqa 672(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm9, %ymm2
+vpaddd %ymm6, %ymm10, %ymm7
+vpsubd %ymm3, %ymm2, %ymm2
+vpsubd %ymm4, %ymm7, %ymm7
+vpsubd %ymm11, %ymm9, %ymm11
+vpsubd %ymm6, %ymm10, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1696(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm10
+vpunpckhwd const0(%rip), %ymm11, %ymm9
+vpslld $1, %ymm10, %ymm10
+vpslld $1, %ymm9, %ymm9
+vpsubd %ymm10, %ymm2, %ymm2
+vpsubd %ymm9, %ymm7, %ymm7
+vpsrld $1, %ymm2, %ymm2
+vpsrld $1, %ymm7, %ymm7
+vpand mask32_to_16(%rip), %ymm2, %ymm2
+vpand mask32_to_16(%rip), %ymm7, %ymm7
+vpackusdw %ymm7, %ymm2, %ymm7
+vmovdqa 928(%rsp), %ymm2
+vpaddw 1184(%rsp), %ymm2, %ymm9
+vpsubw 1184(%rsp), %ymm2, %ymm2
+vpsrlw $2, %ymm2, %ymm2
+vpsubw %ymm6, %ymm2, %ymm2
+vpmullw %ymm14, %ymm2, %ymm2
+vpsllw $1, %ymm5, %ymm10
+vpsubw %ymm10, %ymm9, %ymm10
+vpsllw $7, %ymm11, %ymm9
+vpsubw %ymm9, %ymm10, %ymm9
+vpsrlw $3, %ymm9, %ymm9
+vpsubw %ymm7, %ymm9, %ymm9
+vmovdqa 1440(%rsp), %ymm10
+vpsubw %ymm5, %ymm10, %ymm10
+vpmullw %ymm15, %ymm11, %ymm4
+vpsubw %ymm4, %ymm10, %ymm4
+vpmullw %ymm14, %ymm9, %ymm9
+vpsubw %ymm9, %ymm7, %ymm7
+vpmullw %ymm12, %ymm9, %ymm10
+vpaddw %ymm10, %ymm7, %ymm10
+vpmullw %ymm12, %ymm10, %ymm10
+vpsubw %ymm10, %ymm4, %ymm10
+vpmullw %ymm14, %ymm10, %ymm10
+vpsubw %ymm6, %ymm10, %ymm10
+vpsrlw $3, %ymm10, %ymm10
+vpsubw %ymm2, %ymm10, %ymm10
+vpsubw %ymm10, %ymm2, %ymm2
+vpsubw %ymm2, %ymm6, %ymm6
+vpmullw %ymm13, %ymm10, %ymm10
+vpsubw %ymm10, %ymm6, %ymm6
+vmovdqu 440(%rdi), %ymm4
+vmovdqu 792(%rdi), %ymm3
+vmovdqu 1144(%rdi), %ymm8
+vpaddw %ymm5, %ymm4, %ymm5
+vpaddw %ymm6, %ymm3, %ymm6
+vpaddw %ymm7, %ymm8, %ymm7
+vpshufb shuf48_16(%rip), %ymm2, %ymm2
+vpand mask3_5_3_5(%rip), %ymm2, %ymm8
+vpand mask5_3_5_3(%rip), %ymm2, %ymm2
+vpermq $206, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm3
+vpor %ymm3, %ymm2, %ymm2
+vmovdqu 88(%rdi), %ymm3
+vpaddw %ymm2, %ymm3, %ymm3
+vpand mask_mod8192(%rip), %ymm3, %ymm3
+vmovdqu %ymm3, 88(%rdi)
+vmovdqa %xmm8, 1952(%rsp)
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_3_5(%rip), %ymm9, %ymm8
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $206, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm3
+vpor %ymm3, %ymm9, %ymm9
+vpaddw %ymm9, %ymm5, %ymm5
+vmovdqa %xmm8, 2208(%rsp)
+vpshufb shuf48_16(%rip), %ymm10, %ymm10
+vpand mask3_5_3_5(%rip), %ymm10, %ymm8
+vpand mask5_3_5_3(%rip), %ymm10, %ymm10
+vpermq $206, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm3
+vpor %ymm3, %ymm10, %ymm10
+vpaddw %ymm10, %ymm6, %ymm6
+vmovdqa %xmm8, 2464(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_3_5(%rip), %ymm11, %ymm8
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $206, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm3
+vpor %ymm3, %ymm11, %ymm11
+vpaddw %ymm11, %ymm7, %ymm7
+vmovdqa %xmm8, 2720(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %ymm5, 440(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 792(%rdi)
+vpand mask_mod8192(%rip), %ymm7, %ymm7
+vmovdqu %ymm7, 1144(%rdi)
+vmovdqa 192(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm10
+vpunpckhwd const0(%rip), %ymm11, %ymm9
+vpslld $1, %ymm10, %ymm10
+vpslld $1, %ymm9, %ymm9
+vmovdqa 448(%rsp), %ymm2
+vpunpcklwd const0(%rip), %ymm2, %ymm7
+vpunpckhwd const0(%rip), %ymm2, %ymm2
+vmovdqa 704(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm7, %ymm8
+vpaddd %ymm6, %ymm2, %ymm3
+vpsubd %ymm10, %ymm8, %ymm8
+vpsubd %ymm9, %ymm3, %ymm3
+vpsubd %ymm5, %ymm7, %ymm5
+vpsubd %ymm6, %ymm2, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1728(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm2
+vpunpckhwd const0(%rip), %ymm5, %ymm7
+vpslld $1, %ymm2, %ymm2
+vpslld $1, %ymm7, %ymm7
+vpsubd %ymm2, %ymm8, %ymm8
+vpsubd %ymm7, %ymm3, %ymm3
+vpsrld $1, %ymm8, %ymm8
+vpsrld $1, %ymm3, %ymm3
+vpand mask32_to_16(%rip), %ymm8, %ymm8
+vpand mask32_to_16(%rip), %ymm3, %ymm3
+vpackusdw %ymm3, %ymm8, %ymm3
+vmovdqa 960(%rsp), %ymm8
+vpaddw 1216(%rsp), %ymm8, %ymm7
+vpsubw 1216(%rsp), %ymm8, %ymm8
+vpsrlw $2, %ymm8, %ymm8
+vpsubw %ymm6, %ymm8, %ymm8
+vpmullw %ymm14, %ymm8, %ymm8
+vpsllw $1, %ymm11, %ymm2
+vpsubw %ymm2, %ymm7, %ymm2
+vpsllw $7, %ymm5, %ymm7
+vpsubw %ymm7, %ymm2, %ymm7
+vpsrlw $3, %ymm7, %ymm7
+vpsubw %ymm3, %ymm7, %ymm7
+vmovdqa 1472(%rsp), %ymm2
+vpsubw %ymm11, %ymm2, %ymm2
+vpmullw %ymm15, %ymm5, %ymm9
+vpsubw %ymm9, %ymm2, %ymm9
+vpmullw %ymm14, %ymm7, %ymm7
+vpsubw %ymm7, %ymm3, %ymm3
+vpmullw %ymm12, %ymm7, %ymm2
+vpaddw %ymm2, %ymm3, %ymm2
+vpmullw %ymm12, %ymm2, %ymm2
+vpsubw %ymm2, %ymm9, %ymm2
+vpmullw %ymm14, %ymm2, %ymm2
+vpsubw %ymm6, %ymm2, %ymm2
+vpsrlw $3, %ymm2, %ymm2
+vpsubw %ymm8, %ymm2, %ymm2
+vpsubw %ymm2, %ymm8, %ymm8
+vpsubw %ymm8, %ymm6, %ymm6
+vpmullw %ymm13, %ymm2, %ymm2
+vpsubw %ymm2, %ymm6, %ymm6
+vmovdqu 528(%rdi), %ymm9
+vmovdqu 880(%rdi), %ymm10
+vmovdqu 1232(%rdi), %ymm4
+vpaddw %ymm11, %ymm9, %ymm11
+vpaddw %ymm6, %ymm10, %ymm6
+vpaddw %ymm3, %ymm4, %ymm3
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_3_5(%rip), %ymm8, %ymm4
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $206, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm10
+vpor %ymm10, %ymm8, %ymm8
+vmovdqu 176(%rdi), %ymm10
+vpaddw %ymm8, %ymm10, %ymm10
+vpand mask_mod8192(%rip), %ymm10, %ymm10
+vmovdqu %ymm10, 176(%rdi)
+vmovdqa %xmm4, 1984(%rsp)
+vpshufb shuf48_16(%rip), %ymm7, %ymm7
+vpand mask3_5_3_5(%rip), %ymm7, %ymm4
+vpand mask5_3_5_3(%rip), %ymm7, %ymm7
+vpermq $206, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm10
+vpor %ymm10, %ymm7, %ymm7
+vpaddw %ymm7, %ymm11, %ymm11
+vmovdqa %xmm4, 2240(%rsp)
+vpshufb shuf48_16(%rip), %ymm2, %ymm2
+vpand mask3_5_3_5(%rip), %ymm2, %ymm4
+vpand mask5_3_5_3(%rip), %ymm2, %ymm2
+vpermq $206, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm10
+vpor %ymm10, %ymm2, %ymm2
+vpaddw %ymm2, %ymm6, %ymm6
+vmovdqa %xmm4, 2496(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_3_5(%rip), %ymm5, %ymm4
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $206, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm10
+vpor %ymm10, %ymm5, %ymm5
+vpaddw %ymm5, %ymm3, %ymm3
+vmovdqa %xmm4, 2752(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 528(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 880(%rdi)
+vpand mask_mod8192(%rip), %ymm3, %ymm3
+vmovdqu %ymm3, 1232(%rdi)
+vmovdqa 224(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm2
+vpunpckhwd const0(%rip), %ymm5, %ymm7
+vpslld $1, %ymm2, %ymm2
+vpslld $1, %ymm7, %ymm7
+vmovdqa 480(%rsp), %ymm8
+vpunpcklwd const0(%rip), %ymm8, %ymm3
+vpunpckhwd const0(%rip), %ymm8, %ymm8
+vmovdqa 736(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm3, %ymm4
+vpaddd %ymm6, %ymm8, %ymm10
+vpsubd %ymm2, %ymm4, %ymm4
+vpsubd %ymm7, %ymm10, %ymm10
+vpsubd %ymm11, %ymm3, %ymm11
+vpsubd %ymm6, %ymm8, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1760(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm8
+vpunpckhwd const0(%rip), %ymm11, %ymm3
+vpslld $1, %ymm8, %ymm8
+vpslld $1, %ymm3, %ymm3
+vpsubd %ymm8, %ymm4, %ymm4
+vpsubd %ymm3, %ymm10, %ymm10
+vpsrld $1, %ymm4, %ymm4
+vpsrld $1, %ymm10, %ymm10
+vpand mask32_to_16(%rip), %ymm4, %ymm4
+vpand mask32_to_16(%rip), %ymm10, %ymm10
+vpackusdw %ymm10, %ymm4, %ymm10
+vmovdqa 992(%rsp), %ymm4
+vpaddw 1248(%rsp), %ymm4, %ymm3
+vpsubw 1248(%rsp), %ymm4, %ymm4
+vpsrlw $2, %ymm4, %ymm4
+vpsubw %ymm6, %ymm4, %ymm4
+vpmullw %ymm14, %ymm4, %ymm4
+vpsllw $1, %ymm5, %ymm8
+vpsubw %ymm8, %ymm3, %ymm8
+vpsllw $7, %ymm11, %ymm3
+vpsubw %ymm3, %ymm8, %ymm3
+vpsrlw $3, %ymm3, %ymm3
+vpsubw %ymm10, %ymm3, %ymm3
+vmovdqa 1504(%rsp), %ymm8
+vpsubw %ymm5, %ymm8, %ymm8
+vpmullw %ymm15, %ymm11, %ymm7
+vpsubw %ymm7, %ymm8, %ymm7
+vpmullw %ymm14, %ymm3, %ymm3
+vpsubw %ymm3, %ymm10, %ymm10
+vpmullw %ymm12, %ymm3, %ymm8
+vpaddw %ymm8, %ymm10, %ymm8
+vpmullw %ymm12, %ymm8, %ymm8
+vpsubw %ymm8, %ymm7, %ymm8
+vpmullw %ymm14, %ymm8, %ymm8
+vpsubw %ymm6, %ymm8, %ymm8
+vpsrlw $3, %ymm8, %ymm8
+vpsubw %ymm4, %ymm8, %ymm8
+vpsubw %ymm8, %ymm4, %ymm4
+vpsubw %ymm4, %ymm6, %ymm6
+vpmullw %ymm13, %ymm8, %ymm8
+vpsubw %ymm8, %ymm6, %ymm6
+vmovdqu 616(%rdi), %ymm7
+vmovdqu 968(%rdi), %ymm2
+vmovdqu 1320(%rdi), %ymm9
+vpaddw %ymm5, %ymm7, %ymm5
+vpaddw %ymm6, %ymm2, %ymm6
+vpaddw %ymm10, %ymm9, %ymm10
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_3_5(%rip), %ymm4, %ymm9
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm2
+vpor %ymm2, %ymm4, %ymm4
+vmovdqu 264(%rdi), %ymm2
+vpaddw %ymm4, %ymm2, %ymm2
+vpand mask_mod8192(%rip), %ymm2, %ymm2
+vmovdqu %ymm2, 264(%rdi)
+vmovdqa %xmm9, 2016(%rsp)
+vpshufb shuf48_16(%rip), %ymm3, %ymm3
+vpand mask3_5_3_5(%rip), %ymm3, %ymm9
+vpand mask5_3_5_3(%rip), %ymm3, %ymm3
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm2
+vpor %ymm2, %ymm3, %ymm3
+vpaddw %ymm3, %ymm5, %ymm5
+vmovdqa %xmm9, 2272(%rsp)
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_3_5(%rip), %ymm8, %ymm9
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm2
+vpor %ymm2, %ymm8, %ymm8
+vpaddw %ymm8, %ymm6, %ymm6
+vmovdqa %xmm9, 2528(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_3_5(%rip), %ymm11, %ymm9
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm2
+vpor %ymm2, %ymm11, %ymm11
+vpaddw %ymm11, %ymm10, %ymm10
+vmovdqa %xmm9, 2784(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %ymm5, 616(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 968(%rdi)
+vpand mask_mod8192(%rip), %ymm10, %ymm10
+vmovdqu %ymm10, 1320(%rdi)
+vmovdqa 128(%r12), %ymm0
+vpsubw 224(%r12), %ymm0, %ymm0
+vmovdqa 512(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 320(%r12), %ymm1, %ymm1
+vpsubw 32(%r12), %ymm0, %ymm0
+vpaddw 416(%r12), %ymm0, %ymm0
+vmovdqa 704(%r12), %ymm2
+vpsubw 800(%r12), %ymm2, %ymm2
+vmovdqa 1088(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 896(%r12), %ymm3, %ymm3
+vpsubw 608(%r12), %ymm2, %ymm2
+vpaddw 992(%r12), %ymm2, %ymm2
+vmovdqa 1280(%r12), %ymm4
+vpsubw 1376(%r12), %ymm4, %ymm4
+vmovdqa 1664(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 1472(%r12), %ymm5, %ymm5
+vpsubw 1184(%r12), %ymm4, %ymm4
+vpaddw 1568(%r12), %ymm4, %ymm4
+vpsubw 608(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 32(%r12), %ymm1, %ymm1
+vpaddw 1184(%r12), %ymm1, %ymm1
+vmovdqa 320(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 1472(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 896(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 32(%r12), %ymm8
+vmovdqa 896(%r12), %ymm9
+vmovdqa %ymm8, 0(%rsp)
+vmovdqa %ymm0, 32(%rsp)
+vmovdqa %ymm1, 64(%rsp)
+vmovdqa %ymm7, 96(%rsp)
+vmovdqa %ymm5, 128(%rsp)
+vmovdqa %ymm2, 160(%rsp)
+vmovdqa %ymm3, 192(%rsp)
+vmovdqa %ymm9, 224(%rsp)
+vmovdqa 1856(%r12), %ymm0
+vpsubw 1952(%r12), %ymm0, %ymm0
+vmovdqa 2240(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 2048(%r12), %ymm1, %ymm1
+vpsubw 1760(%r12), %ymm0, %ymm0
+vpaddw 2144(%r12), %ymm0, %ymm0
+vmovdqa 2432(%r12), %ymm2
+vpsubw 2528(%r12), %ymm2, %ymm2
+vmovdqa 2816(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 2624(%r12), %ymm3, %ymm3
+vpsubw 2336(%r12), %ymm2, %ymm2
+vpaddw 2720(%r12), %ymm2, %ymm2
+vmovdqa 3008(%r12), %ymm4
+vpsubw 3104(%r12), %ymm4, %ymm4
+vmovdqa 3392(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 3200(%r12), %ymm5, %ymm5
+vpsubw 2912(%r12), %ymm4, %ymm4
+vpaddw 3296(%r12), %ymm4, %ymm4
+vpsubw 2336(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 1760(%r12), %ymm1, %ymm1
+vpaddw 2912(%r12), %ymm1, %ymm1
+vmovdqa 2048(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 3200(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 2624(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 1760(%r12), %ymm8
+vmovdqa 2624(%r12), %ymm9
+vmovdqa %ymm8, 256(%rsp)
+vmovdqa %ymm0, 288(%rsp)
+vmovdqa %ymm1, 320(%rsp)
+vmovdqa %ymm7, 352(%rsp)
+vmovdqa %ymm5, 384(%rsp)
+vmovdqa %ymm2, 416(%rsp)
+vmovdqa %ymm3, 448(%rsp)
+vmovdqa %ymm9, 480(%rsp)
+vmovdqa 3584(%r12), %ymm0
+vpsubw 3680(%r12), %ymm0, %ymm0
+vmovdqa 3968(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 3776(%r12), %ymm1, %ymm1
+vpsubw 3488(%r12), %ymm0, %ymm0
+vpaddw 3872(%r12), %ymm0, %ymm0
+vmovdqa 4160(%r12), %ymm2
+vpsubw 4256(%r12), %ymm2, %ymm2
+vmovdqa 4544(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 4352(%r12), %ymm3, %ymm3
+vpsubw 4064(%r12), %ymm2, %ymm2
+vpaddw 4448(%r12), %ymm2, %ymm2
+vmovdqa 4736(%r12), %ymm4
+vpsubw 4832(%r12), %ymm4, %ymm4
+vmovdqa 5120(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 4928(%r12), %ymm5, %ymm5
+vpsubw 4640(%r12), %ymm4, %ymm4
+vpaddw 5024(%r12), %ymm4, %ymm4
+vpsubw 4064(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 3488(%r12), %ymm1, %ymm1
+vpaddw 4640(%r12), %ymm1, %ymm1
+vmovdqa 3776(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 4928(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 4352(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 3488(%r12), %ymm8
+vmovdqa 4352(%r12), %ymm9
+vmovdqa %ymm8, 512(%rsp)
+vmovdqa %ymm0, 544(%rsp)
+vmovdqa %ymm1, 576(%rsp)
+vmovdqa %ymm7, 608(%rsp)
+vmovdqa %ymm5, 640(%rsp)
+vmovdqa %ymm2, 672(%rsp)
+vmovdqa %ymm3, 704(%rsp)
+vmovdqa %ymm9, 736(%rsp)
+vmovdqa 5312(%r12), %ymm0
+vpsubw 5408(%r12), %ymm0, %ymm0
+vmovdqa 5696(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5504(%r12), %ymm1, %ymm1
+vpsubw 5216(%r12), %ymm0, %ymm0
+vpaddw 5600(%r12), %ymm0, %ymm0
+vmovdqa 5888(%r12), %ymm2
+vpsubw 5984(%r12), %ymm2, %ymm2
+vmovdqa 6272(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 6080(%r12), %ymm3, %ymm3
+vpsubw 5792(%r12), %ymm2, %ymm2
+vpaddw 6176(%r12), %ymm2, %ymm2
+vmovdqa 6464(%r12), %ymm4
+vpsubw 6560(%r12), %ymm4, %ymm4
+vmovdqa 6848(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 6656(%r12), %ymm5, %ymm5
+vpsubw 6368(%r12), %ymm4, %ymm4
+vpaddw 6752(%r12), %ymm4, %ymm4
+vpsubw 5792(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 5216(%r12), %ymm1, %ymm1
+vpaddw 6368(%r12), %ymm1, %ymm1
+vmovdqa 5504(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 6656(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 6080(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 5216(%r12), %ymm8
+vmovdqa 6080(%r12), %ymm9
+vmovdqa %ymm8, 768(%rsp)
+vmovdqa %ymm0, 800(%rsp)
+vmovdqa %ymm1, 832(%rsp)
+vmovdqa %ymm7, 864(%rsp)
+vmovdqa %ymm5, 896(%rsp)
+vmovdqa %ymm2, 928(%rsp)
+vmovdqa %ymm3, 960(%rsp)
+vmovdqa %ymm9, 992(%rsp)
+vmovdqa 7040(%r12), %ymm0
+vpsubw 7136(%r12), %ymm0, %ymm0
+vmovdqa 7424(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 7232(%r12), %ymm1, %ymm1
+vpsubw 6944(%r12), %ymm0, %ymm0
+vpaddw 7328(%r12), %ymm0, %ymm0
+vmovdqa 7616(%r12), %ymm2
+vpsubw 7712(%r12), %ymm2, %ymm2
+vmovdqa 8000(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 7808(%r12), %ymm3, %ymm3
+vpsubw 7520(%r12), %ymm2, %ymm2
+vpaddw 7904(%r12), %ymm2, %ymm2
+vmovdqa 8192(%r12), %ymm4
+vpsubw 8288(%r12), %ymm4, %ymm4
+vmovdqa 8576(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 8384(%r12), %ymm5, %ymm5
+vpsubw 8096(%r12), %ymm4, %ymm4
+vpaddw 8480(%r12), %ymm4, %ymm4
+vpsubw 7520(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 6944(%r12), %ymm1, %ymm1
+vpaddw 8096(%r12), %ymm1, %ymm1
+vmovdqa 7232(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 8384(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 7808(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 6944(%r12), %ymm8
+vmovdqa 7808(%r12), %ymm9
+vmovdqa %ymm8, 1024(%rsp)
+vmovdqa %ymm0, 1056(%rsp)
+vmovdqa %ymm1, 1088(%rsp)
+vmovdqa %ymm7, 1120(%rsp)
+vmovdqa %ymm5, 1152(%rsp)
+vmovdqa %ymm2, 1184(%rsp)
+vmovdqa %ymm3, 1216(%rsp)
+vmovdqa %ymm9, 1248(%rsp)
+vmovdqa 8768(%r12), %ymm0
+vpsubw 8864(%r12), %ymm0, %ymm0
+vmovdqa 9152(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 8960(%r12), %ymm1, %ymm1
+vpsubw 8672(%r12), %ymm0, %ymm0
+vpaddw 9056(%r12), %ymm0, %ymm0
+vmovdqa 9344(%r12), %ymm2
+vpsubw 9440(%r12), %ymm2, %ymm2
+vmovdqa 9728(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 9536(%r12), %ymm3, %ymm3
+vpsubw 9248(%r12), %ymm2, %ymm2
+vpaddw 9632(%r12), %ymm2, %ymm2
+vmovdqa 9920(%r12), %ymm4
+vpsubw 10016(%r12), %ymm4, %ymm4
+vmovdqa 10304(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 10112(%r12), %ymm5, %ymm5
+vpsubw 9824(%r12), %ymm4, %ymm4
+vpaddw 10208(%r12), %ymm4, %ymm4
+vpsubw 9248(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 8672(%r12), %ymm1, %ymm1
+vpaddw 9824(%r12), %ymm1, %ymm1
+vmovdqa 8960(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 10112(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 9536(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 8672(%r12), %ymm8
+vmovdqa 9536(%r12), %ymm9
+vmovdqa %ymm8, 1280(%rsp)
+vmovdqa %ymm0, 1312(%rsp)
+vmovdqa %ymm1, 1344(%rsp)
+vmovdqa %ymm7, 1376(%rsp)
+vmovdqa %ymm5, 1408(%rsp)
+vmovdqa %ymm2, 1440(%rsp)
+vmovdqa %ymm3, 1472(%rsp)
+vmovdqa %ymm9, 1504(%rsp)
+vmovdqa 10496(%r12), %ymm0
+vpsubw 10592(%r12), %ymm0, %ymm0
+vmovdqa 10880(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 10688(%r12), %ymm1, %ymm1
+vpsubw 10400(%r12), %ymm0, %ymm0
+vpaddw 10784(%r12), %ymm0, %ymm0
+vmovdqa 11072(%r12), %ymm2
+vpsubw 11168(%r12), %ymm2, %ymm2
+vmovdqa 11456(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 11264(%r12), %ymm3, %ymm3
+vpsubw 10976(%r12), %ymm2, %ymm2
+vpaddw 11360(%r12), %ymm2, %ymm2
+vmovdqa 11648(%r12), %ymm4
+vpsubw 11744(%r12), %ymm4, %ymm4
+vmovdqa 12032(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 11840(%r12), %ymm5, %ymm5
+vpsubw 11552(%r12), %ymm4, %ymm4
+vpaddw 11936(%r12), %ymm4, %ymm4
+vpsubw 10976(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 10400(%r12), %ymm1, %ymm1
+vpaddw 11552(%r12), %ymm1, %ymm1
+vmovdqa 10688(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 11840(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 11264(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 10400(%r12), %ymm8
+vmovdqa 11264(%r12), %ymm9
+vmovdqa %ymm8, 1536(%rsp)
+vmovdqa %ymm0, 1568(%rsp)
+vmovdqa %ymm1, 1600(%rsp)
+vmovdqa %ymm7, 1632(%rsp)
+vmovdqa %ymm5, 1664(%rsp)
+vmovdqa %ymm2, 1696(%rsp)
+vmovdqa %ymm3, 1728(%rsp)
+vmovdqa %ymm9, 1760(%rsp)
+vmovdqa 0(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm8
+vpunpckhwd const0(%rip), %ymm11, %ymm3
+vpslld $1, %ymm8, %ymm8
+vpslld $1, %ymm3, %ymm3
+vmovdqa 256(%rsp), %ymm4
+vpunpcklwd const0(%rip), %ymm4, %ymm10
+vpunpckhwd const0(%rip), %ymm4, %ymm4
+vmovdqa 512(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm10, %ymm9
+vpaddd %ymm6, %ymm4, %ymm2
+vpsubd %ymm8, %ymm9, %ymm9
+vpsubd %ymm3, %ymm2, %ymm2
+vpsubd %ymm5, %ymm10, %ymm5
+vpsubd %ymm6, %ymm4, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1536(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm4
+vpunpckhwd const0(%rip), %ymm5, %ymm10
+vpslld $1, %ymm4, %ymm4
+vpslld $1, %ymm10, %ymm10
+vpsubd %ymm4, %ymm9, %ymm9
+vpsubd %ymm10, %ymm2, %ymm2
+vpsrld $1, %ymm9, %ymm9
+vpsrld $1, %ymm2, %ymm2
+vpand mask32_to_16(%rip), %ymm9, %ymm9
+vpand mask32_to_16(%rip), %ymm2, %ymm2
+vpackusdw %ymm2, %ymm9, %ymm2
+vmovdqa 768(%rsp), %ymm9
+vpaddw 1024(%rsp), %ymm9, %ymm10
+vpsubw 1024(%rsp), %ymm9, %ymm9
+vpsrlw $2, %ymm9, %ymm9
+vpsubw %ymm6, %ymm9, %ymm9
+vpmullw %ymm14, %ymm9, %ymm9
+vpsllw $1, %ymm11, %ymm4
+vpsubw %ymm4, %ymm10, %ymm4
+vpsllw $7, %ymm5, %ymm10
+vpsubw %ymm10, %ymm4, %ymm10
+vpsrlw $3, %ymm10, %ymm10
+vpsubw %ymm2, %ymm10, %ymm10
+vmovdqa 1280(%rsp), %ymm4
+vpsubw %ymm11, %ymm4, %ymm4
+vpmullw %ymm15, %ymm5, %ymm3
+vpsubw %ymm3, %ymm4, %ymm3
+vpmullw %ymm14, %ymm10, %ymm10
+vpsubw %ymm10, %ymm2, %ymm2
+vpmullw %ymm12, %ymm10, %ymm4
+vpaddw %ymm4, %ymm2, %ymm4
+vpmullw %ymm12, %ymm4, %ymm4
+vpsubw %ymm4, %ymm3, %ymm4
+vpmullw %ymm14, %ymm4, %ymm4
+vpsubw %ymm6, %ymm4, %ymm4
+vpsrlw $3, %ymm4, %ymm4
+vpsubw %ymm9, %ymm4, %ymm4
+vpsubw %ymm4, %ymm9, %ymm9
+vpsubw %ymm9, %ymm6, %ymm6
+vpmullw %ymm13, %ymm4, %ymm4
+vpsubw %ymm4, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm10, %ymm10
+vpand mask3_5_3_5(%rip), %ymm10, %ymm3
+vpand mask5_3_5_3(%rip), %ymm10, %ymm10
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm8
+vpor %ymm8, %ymm10, %ymm10
+vpaddw 2048(%rsp), %ymm11, %ymm11
+vpaddw %ymm10, %ymm11, %ymm11
+vmovdqa %xmm3, 2048(%rsp)
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_3_5(%rip), %ymm4, %ymm3
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm8
+vpor %ymm8, %ymm4, %ymm4
+vpaddw 2304(%rsp), %ymm6, %ymm6
+vpaddw %ymm4, %ymm6, %ymm6
+vmovdqa %xmm3, 2304(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_3_5(%rip), %ymm5, %ymm3
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm8
+vpor %ymm8, %ymm5, %ymm5
+vpaddw 2560(%rsp), %ymm2, %ymm2
+vpaddw %ymm5, %ymm2, %ymm2
+vmovdqa %xmm3, 2560(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 32(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 384(%rdi)
+vpand mask_mod8192(%rip), %ymm2, %ymm2
+vmovdqu %ymm2, 736(%rdi)
+vpand mask_mod8192(%rip), %ymm9, %ymm9
+vmovdqu %ymm9, 1088(%rdi)
+vmovdqa 32(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm4
+vpunpckhwd const0(%rip), %ymm5, %ymm10
+vpslld $1, %ymm4, %ymm4
+vpslld $1, %ymm10, %ymm10
+vmovdqa 288(%rsp), %ymm9
+vpunpcklwd const0(%rip), %ymm9, %ymm2
+vpunpckhwd const0(%rip), %ymm9, %ymm9
+vmovdqa 544(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm2, %ymm3
+vpaddd %ymm6, %ymm9, %ymm8
+vpsubd %ymm4, %ymm3, %ymm3
+vpsubd %ymm10, %ymm8, %ymm8
+vpsubd %ymm11, %ymm2, %ymm11
+vpsubd %ymm6, %ymm9, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1568(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm9
+vpunpckhwd const0(%rip), %ymm11, %ymm2
+vpslld $1, %ymm9, %ymm9
+vpslld $1, %ymm2, %ymm2
+vpsubd %ymm9, %ymm3, %ymm3
+vpsubd %ymm2, %ymm8, %ymm8
+vpsrld $1, %ymm3, %ymm3
+vpsrld $1, %ymm8, %ymm8
+vpand mask32_to_16(%rip), %ymm3, %ymm3
+vpand mask32_to_16(%rip), %ymm8, %ymm8
+vpackusdw %ymm8, %ymm3, %ymm8
+vmovdqa 800(%rsp), %ymm3
+vpaddw 1056(%rsp), %ymm3, %ymm2
+vpsubw 1056(%rsp), %ymm3, %ymm3
+vpsrlw $2, %ymm3, %ymm3
+vpsubw %ymm6, %ymm3, %ymm3
+vpmullw %ymm14, %ymm3, %ymm3
+vpsllw $1, %ymm5, %ymm9
+vpsubw %ymm9, %ymm2, %ymm9
+vpsllw $7, %ymm11, %ymm2
+vpsubw %ymm2, %ymm9, %ymm2
+vpsrlw $3, %ymm2, %ymm2
+vpsubw %ymm8, %ymm2, %ymm2
+vmovdqa 1312(%rsp), %ymm9
+vpsubw %ymm5, %ymm9, %ymm9
+vpmullw %ymm15, %ymm11, %ymm10
+vpsubw %ymm10, %ymm9, %ymm10
+vpmullw %ymm14, %ymm2, %ymm2
+vpsubw %ymm2, %ymm8, %ymm8
+vpmullw %ymm12, %ymm2, %ymm9
+vpaddw %ymm9, %ymm8, %ymm9
+vpmullw %ymm12, %ymm9, %ymm9
+vpsubw %ymm9, %ymm10, %ymm9
+vpmullw %ymm14, %ymm9, %ymm9
+vpsubw %ymm6, %ymm9, %ymm9
+vpsrlw $3, %ymm9, %ymm9
+vpsubw %ymm3, %ymm9, %ymm9
+vpsubw %ymm9, %ymm3, %ymm3
+vpsubw %ymm3, %ymm6, %ymm6
+vpmullw %ymm13, %ymm9, %ymm9
+vpsubw %ymm9, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm2, %ymm2
+vpand mask3_5_3_5(%rip), %ymm2, %ymm10
+vpand mask5_3_5_3(%rip), %ymm2, %ymm2
+vpermq $206, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm4
+vpor %ymm4, %ymm2, %ymm2
+vpaddw 2080(%rsp), %ymm5, %ymm5
+vpaddw %ymm2, %ymm5, %ymm5
+vmovdqa %xmm10, 2080(%rsp)
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_3_5(%rip), %ymm9, %ymm10
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $206, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm4
+vpor %ymm4, %ymm9, %ymm9
+vpaddw 2336(%rsp), %ymm6, %ymm6
+vpaddw %ymm9, %ymm6, %ymm6
+vmovdqa %xmm10, 2336(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_3_5(%rip), %ymm11, %ymm10
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $206, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm4
+vpor %ymm4, %ymm11, %ymm11
+vpaddw 2592(%rsp), %ymm8, %ymm8
+vpaddw %ymm11, %ymm8, %ymm8
+vmovdqa %xmm10, 2592(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %ymm5, 120(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 472(%rdi)
+vpand mask_mod8192(%rip), %ymm8, %ymm8
+vmovdqu %ymm8, 824(%rdi)
+vpand mask_mod8192(%rip), %ymm3, %ymm3
+vmovdqu %ymm3, 1176(%rdi)
+vmovdqa 64(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm9
+vpunpckhwd const0(%rip), %ymm11, %ymm2
+vpslld $1, %ymm9, %ymm9
+vpslld $1, %ymm2, %ymm2
+vmovdqa 320(%rsp), %ymm3
+vpunpcklwd const0(%rip), %ymm3, %ymm8
+vpunpckhwd const0(%rip), %ymm3, %ymm3
+vmovdqa 576(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm8, %ymm10
+vpaddd %ymm6, %ymm3, %ymm4
+vpsubd %ymm9, %ymm10, %ymm10
+vpsubd %ymm2, %ymm4, %ymm4
+vpsubd %ymm5, %ymm8, %ymm5
+vpsubd %ymm6, %ymm3, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1600(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm3
+vpunpckhwd const0(%rip), %ymm5, %ymm8
+vpslld $1, %ymm3, %ymm3
+vpslld $1, %ymm8, %ymm8
+vpsubd %ymm3, %ymm10, %ymm10
+vpsubd %ymm8, %ymm4, %ymm4
+vpsrld $1, %ymm10, %ymm10
+vpsrld $1, %ymm4, %ymm4
+vpand mask32_to_16(%rip), %ymm10, %ymm10
+vpand mask32_to_16(%rip), %ymm4, %ymm4
+vpackusdw %ymm4, %ymm10, %ymm4
+vmovdqa 832(%rsp), %ymm10
+vpaddw 1088(%rsp), %ymm10, %ymm8
+vpsubw 1088(%rsp), %ymm10, %ymm10
+vpsrlw $2, %ymm10, %ymm10
+vpsubw %ymm6, %ymm10, %ymm10
+vpmullw %ymm14, %ymm10, %ymm10
+vpsllw $1, %ymm11, %ymm3
+vpsubw %ymm3, %ymm8, %ymm3
+vpsllw $7, %ymm5, %ymm8
+vpsubw %ymm8, %ymm3, %ymm8
+vpsrlw $3, %ymm8, %ymm8
+vpsubw %ymm4, %ymm8, %ymm8
+vmovdqa 1344(%rsp), %ymm3
+vpsubw %ymm11, %ymm3, %ymm3
+vpmullw %ymm15, %ymm5, %ymm2
+vpsubw %ymm2, %ymm3, %ymm2
+vpmullw %ymm14, %ymm8, %ymm8
+vpsubw %ymm8, %ymm4, %ymm4
+vpmullw %ymm12, %ymm8, %ymm3
+vpaddw %ymm3, %ymm4, %ymm3
+vpmullw %ymm12, %ymm3, %ymm3
+vpsubw %ymm3, %ymm2, %ymm3
+vpmullw %ymm14, %ymm3, %ymm3
+vpsubw %ymm6, %ymm3, %ymm3
+vpsrlw $3, %ymm3, %ymm3
+vpsubw %ymm10, %ymm3, %ymm3
+vpsubw %ymm3, %ymm10, %ymm10
+vpsubw %ymm10, %ymm6, %ymm6
+vpmullw %ymm13, %ymm3, %ymm3
+vpsubw %ymm3, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_3_5(%rip), %ymm8, %ymm2
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $206, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm9
+vpor %ymm9, %ymm8, %ymm8
+vpaddw 2112(%rsp), %ymm11, %ymm11
+vpaddw %ymm8, %ymm11, %ymm11
+vmovdqa %xmm2, 2112(%rsp)
+vpshufb shuf48_16(%rip), %ymm3, %ymm3
+vpand mask3_5_3_5(%rip), %ymm3, %ymm2
+vpand mask5_3_5_3(%rip), %ymm3, %ymm3
+vpermq $206, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm9
+vpor %ymm9, %ymm3, %ymm3
+vpaddw 2368(%rsp), %ymm6, %ymm6
+vpaddw %ymm3, %ymm6, %ymm6
+vmovdqa %xmm2, 2368(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_3_5(%rip), %ymm5, %ymm2
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $206, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm9
+vpor %ymm9, %ymm5, %ymm5
+vpaddw 2624(%rsp), %ymm4, %ymm4
+vpaddw %ymm5, %ymm4, %ymm4
+vmovdqa %xmm2, 2624(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 208(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 560(%rdi)
+vpand mask_mod8192(%rip), %ymm4, %ymm4
+vmovdqu %ymm4, 912(%rdi)
+vpand mask_mod8192(%rip), %ymm10, %ymm10
+vmovdqu %ymm10, 1264(%rdi)
+vmovdqa 96(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm3
+vpunpckhwd const0(%rip), %ymm5, %ymm8
+vpslld $1, %ymm3, %ymm3
+vpslld $1, %ymm8, %ymm8
+vmovdqa 352(%rsp), %ymm10
+vpunpcklwd const0(%rip), %ymm10, %ymm4
+vpunpckhwd const0(%rip), %ymm10, %ymm10
+vmovdqa 608(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm4, %ymm2
+vpaddd %ymm6, %ymm10, %ymm9
+vpsubd %ymm3, %ymm2, %ymm2
+vpsubd %ymm8, %ymm9, %ymm9
+vpsubd %ymm11, %ymm4, %ymm11
+vpsubd %ymm6, %ymm10, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1632(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm10
+vpunpckhwd const0(%rip), %ymm11, %ymm4
+vpslld $1, %ymm10, %ymm10
+vpslld $1, %ymm4, %ymm4
+vpsubd %ymm10, %ymm2, %ymm2
+vpsubd %ymm4, %ymm9, %ymm9
+vpsrld $1, %ymm2, %ymm2
+vpsrld $1, %ymm9, %ymm9
+vpand mask32_to_16(%rip), %ymm2, %ymm2
+vpand mask32_to_16(%rip), %ymm9, %ymm9
+vpackusdw %ymm9, %ymm2, %ymm9
+vmovdqa 864(%rsp), %ymm2
+vpaddw 1120(%rsp), %ymm2, %ymm4
+vpsubw 1120(%rsp), %ymm2, %ymm2
+vpsrlw $2, %ymm2, %ymm2
+vpsubw %ymm6, %ymm2, %ymm2
+vpmullw %ymm14, %ymm2, %ymm2
+vpsllw $1, %ymm5, %ymm10
+vpsubw %ymm10, %ymm4, %ymm10
+vpsllw $7, %ymm11, %ymm4
+vpsubw %ymm4, %ymm10, %ymm4
+vpsrlw $3, %ymm4, %ymm4
+vpsubw %ymm9, %ymm4, %ymm4
+vmovdqa 1376(%rsp), %ymm10
+vpsubw %ymm5, %ymm10, %ymm10
+vpmullw %ymm15, %ymm11, %ymm8
+vpsubw %ymm8, %ymm10, %ymm8
+vpmullw %ymm14, %ymm4, %ymm4
+vpsubw %ymm4, %ymm9, %ymm9
+vpmullw %ymm12, %ymm4, %ymm10
+vpaddw %ymm10, %ymm9, %ymm10
+vpmullw %ymm12, %ymm10, %ymm10
+vpsubw %ymm10, %ymm8, %ymm10
+vpmullw %ymm14, %ymm10, %ymm10
+vpsubw %ymm6, %ymm10, %ymm10
+vpsrlw $3, %ymm10, %ymm10
+vpsubw %ymm2, %ymm10, %ymm10
+vpsubw %ymm10, %ymm2, %ymm2
+vpsubw %ymm2, %ymm6, %ymm6
+vpmullw %ymm13, %ymm10, %ymm10
+vpsubw %ymm10, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_3_5(%rip), %ymm4, %ymm8
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $206, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm3
+vpor %ymm3, %ymm4, %ymm4
+vpaddw 2144(%rsp), %ymm5, %ymm5
+vpaddw %ymm4, %ymm5, %ymm5
+vmovdqa %xmm8, 2144(%rsp)
+vpshufb shuf48_16(%rip), %ymm10, %ymm10
+vpand mask3_5_3_5(%rip), %ymm10, %ymm8
+vpand mask5_3_5_3(%rip), %ymm10, %ymm10
+vpermq $206, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm3
+vpor %ymm3, %ymm10, %ymm10
+vpaddw 2400(%rsp), %ymm6, %ymm6
+vpaddw %ymm10, %ymm6, %ymm6
+vmovdqa %xmm8, 2400(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_3_5(%rip), %ymm11, %ymm8
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $206, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm3
+vpor %ymm3, %ymm11, %ymm11
+vpaddw 2656(%rsp), %ymm9, %ymm9
+vpaddw %ymm11, %ymm9, %ymm9
+vmovdqa %xmm8, 2656(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %ymm5, 296(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 648(%rdi)
+vpand mask_mod8192(%rip), %ymm9, %ymm9
+vmovdqu %ymm9, 1000(%rdi)
+vpand mask_mod8192(%rip), %ymm2, %ymm2
+vmovdqu %ymm2, 1352(%rdi)
+vmovdqa 128(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm10
+vpunpckhwd const0(%rip), %ymm11, %ymm4
+vpslld $1, %ymm10, %ymm10
+vpslld $1, %ymm4, %ymm4
+vmovdqa 384(%rsp), %ymm2
+vpunpcklwd const0(%rip), %ymm2, %ymm9
+vpunpckhwd const0(%rip), %ymm2, %ymm2
+vmovdqa 640(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm9, %ymm8
+vpaddd %ymm6, %ymm2, %ymm3
+vpsubd %ymm10, %ymm8, %ymm8
+vpsubd %ymm4, %ymm3, %ymm3
+vpsubd %ymm5, %ymm9, %ymm5
+vpsubd %ymm6, %ymm2, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1664(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm2
+vpunpckhwd const0(%rip), %ymm5, %ymm9
+vpslld $1, %ymm2, %ymm2
+vpslld $1, %ymm9, %ymm9
+vpsubd %ymm2, %ymm8, %ymm8
+vpsubd %ymm9, %ymm3, %ymm3
+vpsrld $1, %ymm8, %ymm8
+vpsrld $1, %ymm3, %ymm3
+vpand mask32_to_16(%rip), %ymm8, %ymm8
+vpand mask32_to_16(%rip), %ymm3, %ymm3
+vpackusdw %ymm3, %ymm8, %ymm3
+vmovdqa 896(%rsp), %ymm8
+vpaddw 1152(%rsp), %ymm8, %ymm9
+vpsubw 1152(%rsp), %ymm8, %ymm8
+vpsrlw $2, %ymm8, %ymm8
+vpsubw %ymm6, %ymm8, %ymm8
+vpmullw %ymm14, %ymm8, %ymm8
+vpsllw $1, %ymm11, %ymm2
+vpsubw %ymm2, %ymm9, %ymm2
+vpsllw $7, %ymm5, %ymm9
+vpsubw %ymm9, %ymm2, %ymm9
+vpsrlw $3, %ymm9, %ymm9
+vpsubw %ymm3, %ymm9, %ymm9
+vmovdqa 1408(%rsp), %ymm2
+vpsubw %ymm11, %ymm2, %ymm2
+vpmullw %ymm15, %ymm5, %ymm4
+vpsubw %ymm4, %ymm2, %ymm4
+vpmullw %ymm14, %ymm9, %ymm9
+vpsubw %ymm9, %ymm3, %ymm3
+vpmullw %ymm12, %ymm9, %ymm2
+vpaddw %ymm2, %ymm3, %ymm2
+vpmullw %ymm12, %ymm2, %ymm2
+vpsubw %ymm2, %ymm4, %ymm2
+vpmullw %ymm14, %ymm2, %ymm2
+vpsubw %ymm6, %ymm2, %ymm2
+vpsrlw $3, %ymm2, %ymm2
+vpsubw %ymm8, %ymm2, %ymm2
+vpsubw %ymm2, %ymm8, %ymm8
+vpsubw %ymm8, %ymm6, %ymm6
+vpmullw %ymm13, %ymm2, %ymm2
+vpsubw %ymm2, %ymm6, %ymm6
+vmovdqu 384(%rdi), %ymm4
+vmovdqu 736(%rdi), %ymm10
+vmovdqu 1088(%rdi), %ymm7
+vpaddw %ymm11, %ymm4, %ymm11
+vpaddw %ymm6, %ymm10, %ymm6
+vpaddw %ymm3, %ymm7, %ymm3
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_3_5(%rip), %ymm8, %ymm7
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $206, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm10
+vpor %ymm10, %ymm8, %ymm8
+vmovdqu 32(%rdi), %ymm10
+vpaddw 1920(%rsp), %ymm10, %ymm10
+vpaddw %ymm8, %ymm10, %ymm10
+vpand mask_mod8192(%rip), %ymm10, %ymm10
+vmovdqu %ymm10, 32(%rdi)
+vmovdqa %xmm7, 1920(%rsp)
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_3_5(%rip), %ymm9, %ymm7
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $206, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm10
+vpor %ymm10, %ymm9, %ymm9
+vpaddw 2176(%rsp), %ymm11, %ymm11
+vpaddw %ymm9, %ymm11, %ymm11
+vmovdqa %xmm7, 2176(%rsp)
+vpshufb shuf48_16(%rip), %ymm2, %ymm2
+vpand mask3_5_3_5(%rip), %ymm2, %ymm7
+vpand mask5_3_5_3(%rip), %ymm2, %ymm2
+vpermq $206, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm10
+vpor %ymm10, %ymm2, %ymm2
+vpaddw 2432(%rsp), %ymm6, %ymm6
+vpaddw %ymm2, %ymm6, %ymm6
+vmovdqa %xmm7, 2432(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_3_5(%rip), %ymm5, %ymm7
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $206, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm10
+vpor %ymm10, %ymm5, %ymm5
+vpaddw 2688(%rsp), %ymm3, %ymm3
+vpaddw %ymm5, %ymm3, %ymm3
+vmovdqa %xmm7, 2688(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 384(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 736(%rdi)
+vpand mask_mod8192(%rip), %ymm3, %ymm3
+vmovdqu %ymm3, 1088(%rdi)
+vmovdqa 160(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm2
+vpunpckhwd const0(%rip), %ymm5, %ymm9
+vpslld $1, %ymm2, %ymm2
+vpslld $1, %ymm9, %ymm9
+vmovdqa 416(%rsp), %ymm8
+vpunpcklwd const0(%rip), %ymm8, %ymm3
+vpunpckhwd const0(%rip), %ymm8, %ymm8
+vmovdqa 672(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm3, %ymm7
+vpaddd %ymm6, %ymm8, %ymm10
+vpsubd %ymm2, %ymm7, %ymm7
+vpsubd %ymm9, %ymm10, %ymm10
+vpsubd %ymm11, %ymm3, %ymm11
+vpsubd %ymm6, %ymm8, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1696(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm8
+vpunpckhwd const0(%rip), %ymm11, %ymm3
+vpslld $1, %ymm8, %ymm8
+vpslld $1, %ymm3, %ymm3
+vpsubd %ymm8, %ymm7, %ymm7
+vpsubd %ymm3, %ymm10, %ymm10
+vpsrld $1, %ymm7, %ymm7
+vpsrld $1, %ymm10, %ymm10
+vpand mask32_to_16(%rip), %ymm7, %ymm7
+vpand mask32_to_16(%rip), %ymm10, %ymm10
+vpackusdw %ymm10, %ymm7, %ymm10
+vmovdqa 928(%rsp), %ymm7
+vpaddw 1184(%rsp), %ymm7, %ymm3
+vpsubw 1184(%rsp), %ymm7, %ymm7
+vpsrlw $2, %ymm7, %ymm7
+vpsubw %ymm6, %ymm7, %ymm7
+vpmullw %ymm14, %ymm7, %ymm7
+vpsllw $1, %ymm5, %ymm8
+vpsubw %ymm8, %ymm3, %ymm8
+vpsllw $7, %ymm11, %ymm3
+vpsubw %ymm3, %ymm8, %ymm3
+vpsrlw $3, %ymm3, %ymm3
+vpsubw %ymm10, %ymm3, %ymm3
+vmovdqa 1440(%rsp), %ymm8
+vpsubw %ymm5, %ymm8, %ymm8
+vpmullw %ymm15, %ymm11, %ymm9
+vpsubw %ymm9, %ymm8, %ymm9
+vpmullw %ymm14, %ymm3, %ymm3
+vpsubw %ymm3, %ymm10, %ymm10
+vpmullw %ymm12, %ymm3, %ymm8
+vpaddw %ymm8, %ymm10, %ymm8
+vpmullw %ymm12, %ymm8, %ymm8
+vpsubw %ymm8, %ymm9, %ymm8
+vpmullw %ymm14, %ymm8, %ymm8
+vpsubw %ymm6, %ymm8, %ymm8
+vpsrlw $3, %ymm8, %ymm8
+vpsubw %ymm7, %ymm8, %ymm8
+vpsubw %ymm8, %ymm7, %ymm7
+vpsubw %ymm7, %ymm6, %ymm6
+vpmullw %ymm13, %ymm8, %ymm8
+vpsubw %ymm8, %ymm6, %ymm6
+vmovdqu 472(%rdi), %ymm9
+vmovdqu 824(%rdi), %ymm2
+vmovdqu 1176(%rdi), %ymm4
+vpaddw %ymm5, %ymm9, %ymm5
+vpaddw %ymm6, %ymm2, %ymm6
+vpaddw %ymm10, %ymm4, %ymm10
+vpshufb shuf48_16(%rip), %ymm7, %ymm7
+vpand mask3_5_3_5(%rip), %ymm7, %ymm4
+vpand mask5_3_5_3(%rip), %ymm7, %ymm7
+vpermq $206, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm2
+vpor %ymm2, %ymm7, %ymm7
+vmovdqu 120(%rdi), %ymm2
+vpaddw 1952(%rsp), %ymm2, %ymm2
+vpaddw %ymm7, %ymm2, %ymm2
+vpand mask_mod8192(%rip), %ymm2, %ymm2
+vmovdqu %ymm2, 120(%rdi)
+vmovdqa %xmm4, 1952(%rsp)
+vpshufb shuf48_16(%rip), %ymm3, %ymm3
+vpand mask3_5_3_5(%rip), %ymm3, %ymm4
+vpand mask5_3_5_3(%rip), %ymm3, %ymm3
+vpermq $206, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm2
+vpor %ymm2, %ymm3, %ymm3
+vpaddw 2208(%rsp), %ymm5, %ymm5
+vpaddw %ymm3, %ymm5, %ymm5
+vmovdqa %xmm4, 2208(%rsp)
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_3_5(%rip), %ymm8, %ymm4
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $206, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm2
+vpor %ymm2, %ymm8, %ymm8
+vpaddw 2464(%rsp), %ymm6, %ymm6
+vpaddw %ymm8, %ymm6, %ymm6
+vmovdqa %xmm4, 2464(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_3_5(%rip), %ymm11, %ymm4
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $206, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm2
+vpor %ymm2, %ymm11, %ymm11
+vpaddw 2720(%rsp), %ymm10, %ymm10
+vpaddw %ymm11, %ymm10, %ymm10
+vmovdqa %xmm4, 2720(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %ymm5, 472(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 824(%rdi)
+vpand mask_mod8192(%rip), %ymm10, %ymm10
+vmovdqu %ymm10, 1176(%rdi)
+vmovdqa 192(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm8
+vpunpckhwd const0(%rip), %ymm11, %ymm3
+vpslld $1, %ymm8, %ymm8
+vpslld $1, %ymm3, %ymm3
+vmovdqa 448(%rsp), %ymm7
+vpunpcklwd const0(%rip), %ymm7, %ymm10
+vpunpckhwd const0(%rip), %ymm7, %ymm7
+vmovdqa 704(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm10, %ymm4
+vpaddd %ymm6, %ymm7, %ymm2
+vpsubd %ymm8, %ymm4, %ymm4
+vpsubd %ymm3, %ymm2, %ymm2
+vpsubd %ymm5, %ymm10, %ymm5
+vpsubd %ymm6, %ymm7, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1728(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm7
+vpunpckhwd const0(%rip), %ymm5, %ymm10
+vpslld $1, %ymm7, %ymm7
+vpslld $1, %ymm10, %ymm10
+vpsubd %ymm7, %ymm4, %ymm4
+vpsubd %ymm10, %ymm2, %ymm2
+vpsrld $1, %ymm4, %ymm4
+vpsrld $1, %ymm2, %ymm2
+vpand mask32_to_16(%rip), %ymm4, %ymm4
+vpand mask32_to_16(%rip), %ymm2, %ymm2
+vpackusdw %ymm2, %ymm4, %ymm2
+vmovdqa 960(%rsp), %ymm4
+vpaddw 1216(%rsp), %ymm4, %ymm10
+vpsubw 1216(%rsp), %ymm4, %ymm4
+vpsrlw $2, %ymm4, %ymm4
+vpsubw %ymm6, %ymm4, %ymm4
+vpmullw %ymm14, %ymm4, %ymm4
+vpsllw $1, %ymm11, %ymm7
+vpsubw %ymm7, %ymm10, %ymm7
+vpsllw $7, %ymm5, %ymm10
+vpsubw %ymm10, %ymm7, %ymm10
+vpsrlw $3, %ymm10, %ymm10
+vpsubw %ymm2, %ymm10, %ymm10
+vmovdqa 1472(%rsp), %ymm7
+vpsubw %ymm11, %ymm7, %ymm7
+vpmullw %ymm15, %ymm5, %ymm3
+vpsubw %ymm3, %ymm7, %ymm3
+vpmullw %ymm14, %ymm10, %ymm10
+vpsubw %ymm10, %ymm2, %ymm2
+vpmullw %ymm12, %ymm10, %ymm7
+vpaddw %ymm7, %ymm2, %ymm7
+vpmullw %ymm12, %ymm7, %ymm7
+vpsubw %ymm7, %ymm3, %ymm7
+vpmullw %ymm14, %ymm7, %ymm7
+vpsubw %ymm6, %ymm7, %ymm7
+vpsrlw $3, %ymm7, %ymm7
+vpsubw %ymm4, %ymm7, %ymm7
+vpsubw %ymm7, %ymm4, %ymm4
+vpsubw %ymm4, %ymm6, %ymm6
+vpmullw %ymm13, %ymm7, %ymm7
+vpsubw %ymm7, %ymm6, %ymm6
+vmovdqu 560(%rdi), %ymm3
+vmovdqu 912(%rdi), %ymm8
+vmovdqu 1264(%rdi), %ymm9
+vpaddw %ymm11, %ymm3, %ymm11
+vpaddw %ymm6, %ymm8, %ymm6
+vpaddw %ymm2, %ymm9, %ymm2
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_3_5(%rip), %ymm4, %ymm9
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm8
+vpor %ymm8, %ymm4, %ymm4
+vmovdqu 208(%rdi), %ymm8
+vpaddw 1984(%rsp), %ymm8, %ymm8
+vpaddw %ymm4, %ymm8, %ymm8
+vpand mask_mod8192(%rip), %ymm8, %ymm8
+vmovdqu %ymm8, 208(%rdi)
+vmovdqa %xmm9, 1984(%rsp)
+vpshufb shuf48_16(%rip), %ymm10, %ymm10
+vpand mask3_5_3_5(%rip), %ymm10, %ymm9
+vpand mask5_3_5_3(%rip), %ymm10, %ymm10
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm8
+vpor %ymm8, %ymm10, %ymm10
+vpaddw 2240(%rsp), %ymm11, %ymm11
+vpaddw %ymm10, %ymm11, %ymm11
+vmovdqa %xmm9, 2240(%rsp)
+vpshufb shuf48_16(%rip), %ymm7, %ymm7
+vpand mask3_5_3_5(%rip), %ymm7, %ymm9
+vpand mask5_3_5_3(%rip), %ymm7, %ymm7
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm8
+vpor %ymm8, %ymm7, %ymm7
+vpaddw 2496(%rsp), %ymm6, %ymm6
+vpaddw %ymm7, %ymm6, %ymm6
+vmovdqa %xmm9, 2496(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_3_5(%rip), %ymm5, %ymm9
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm8
+vpor %ymm8, %ymm5, %ymm5
+vpaddw 2752(%rsp), %ymm2, %ymm2
+vpaddw %ymm5, %ymm2, %ymm2
+vmovdqa %xmm9, 2752(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 560(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 912(%rdi)
+vpand mask_mod8192(%rip), %ymm2, %ymm2
+vmovdqu %ymm2, 1264(%rdi)
+vmovdqa 224(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm7
+vpunpckhwd const0(%rip), %ymm5, %ymm10
+vpslld $1, %ymm7, %ymm7
+vpslld $1, %ymm10, %ymm10
+vmovdqa 480(%rsp), %ymm4
+vpunpcklwd const0(%rip), %ymm4, %ymm2
+vpunpckhwd const0(%rip), %ymm4, %ymm4
+vmovdqa 736(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm2, %ymm9
+vpaddd %ymm6, %ymm4, %ymm8
+vpsubd %ymm7, %ymm9, %ymm9
+vpsubd %ymm10, %ymm8, %ymm8
+vpsubd %ymm11, %ymm2, %ymm11
+vpsubd %ymm6, %ymm4, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1760(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm4
+vpunpckhwd const0(%rip), %ymm11, %ymm2
+vpslld $1, %ymm4, %ymm4
+vpslld $1, %ymm2, %ymm2
+vpsubd %ymm4, %ymm9, %ymm9
+vpsubd %ymm2, %ymm8, %ymm8
+vpsrld $1, %ymm9, %ymm9
+vpsrld $1, %ymm8, %ymm8
+vpand mask32_to_16(%rip), %ymm9, %ymm9
+vpand mask32_to_16(%rip), %ymm8, %ymm8
+vpackusdw %ymm8, %ymm9, %ymm8
+vmovdqa 992(%rsp), %ymm9
+vpaddw 1248(%rsp), %ymm9, %ymm2
+vpsubw 1248(%rsp), %ymm9, %ymm9
+vpsrlw $2, %ymm9, %ymm9
+vpsubw %ymm6, %ymm9, %ymm9
+vpmullw %ymm14, %ymm9, %ymm9
+vpsllw $1, %ymm5, %ymm4
+vpsubw %ymm4, %ymm2, %ymm4
+vpsllw $7, %ymm11, %ymm2
+vpsubw %ymm2, %ymm4, %ymm2
+vpsrlw $3, %ymm2, %ymm2
+vpsubw %ymm8, %ymm2, %ymm2
+vmovdqa 1504(%rsp), %ymm4
+vpsubw %ymm5, %ymm4, %ymm4
+vpmullw %ymm15, %ymm11, %ymm10
+vpsubw %ymm10, %ymm4, %ymm10
+vpmullw %ymm14, %ymm2, %ymm2
+vpsubw %ymm2, %ymm8, %ymm8
+vpmullw %ymm12, %ymm2, %ymm4
+vpaddw %ymm4, %ymm8, %ymm4
+vpmullw %ymm12, %ymm4, %ymm4
+vpsubw %ymm4, %ymm10, %ymm4
+vpmullw %ymm14, %ymm4, %ymm4
+vpsubw %ymm6, %ymm4, %ymm4
+vpsrlw $3, %ymm4, %ymm4
+vpsubw %ymm9, %ymm4, %ymm4
+vpsubw %ymm4, %ymm9, %ymm9
+vpsubw %ymm9, %ymm6, %ymm6
+vpmullw %ymm13, %ymm4, %ymm4
+vpsubw %ymm4, %ymm6, %ymm6
+vmovdqu 648(%rdi), %ymm10
+vmovdqu 1000(%rdi), %ymm7
+vmovdqu 1352(%rdi), %ymm3
+vpaddw %ymm5, %ymm10, %ymm5
+vpaddw %ymm6, %ymm7, %ymm6
+vpaddw %ymm8, %ymm3, %ymm8
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_3_5(%rip), %ymm9, %ymm3
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm7
+vpor %ymm7, %ymm9, %ymm9
+vmovdqu 296(%rdi), %ymm7
+vpaddw 2016(%rsp), %ymm7, %ymm7
+vpaddw %ymm9, %ymm7, %ymm7
+vpand mask_mod8192(%rip), %ymm7, %ymm7
+vmovdqu %ymm7, 296(%rdi)
+vmovdqa %xmm3, 2016(%rsp)
+vpshufb shuf48_16(%rip), %ymm2, %ymm2
+vpand mask3_5_3_5(%rip), %ymm2, %ymm3
+vpand mask5_3_5_3(%rip), %ymm2, %ymm2
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm7
+vpor %ymm7, %ymm2, %ymm2
+vpaddw 2272(%rsp), %ymm5, %ymm5
+vpaddw %ymm2, %ymm5, %ymm5
+vmovdqa %xmm3, 2272(%rsp)
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_3_5(%rip), %ymm4, %ymm3
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm7
+vpor %ymm7, %ymm4, %ymm4
+vpaddw 2528(%rsp), %ymm6, %ymm6
+vpaddw %ymm4, %ymm6, %ymm6
+vmovdqa %xmm3, 2528(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_3_5(%rip), %ymm11, %ymm3
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm7
+vpor %ymm7, %ymm11, %ymm11
+vpaddw 2784(%rsp), %ymm8, %ymm8
+vpaddw %ymm11, %ymm8, %ymm8
+vmovdqa %xmm3, 2784(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %ymm5, 648(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 1000(%rdi)
+vpand mask_mod8192(%rip), %ymm8, %ymm8
+vmovdqu %ymm8, 1352(%rdi)
+vmovdqa 160(%r12), %ymm0
+vpsubw 256(%r12), %ymm0, %ymm0
+vmovdqa 544(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 352(%r12), %ymm1, %ymm1
+vpsubw 64(%r12), %ymm0, %ymm0
+vpaddw 448(%r12), %ymm0, %ymm0
+vmovdqa 736(%r12), %ymm2
+vpsubw 832(%r12), %ymm2, %ymm2
+vmovdqa 1120(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 928(%r12), %ymm3, %ymm3
+vpsubw 640(%r12), %ymm2, %ymm2
+vpaddw 1024(%r12), %ymm2, %ymm2
+vmovdqa 1312(%r12), %ymm4
+vpsubw 1408(%r12), %ymm4, %ymm4
+vmovdqa 1696(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 1504(%r12), %ymm5, %ymm5
+vpsubw 1216(%r12), %ymm4, %ymm4
+vpaddw 1600(%r12), %ymm4, %ymm4
+vpsubw 640(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 64(%r12), %ymm1, %ymm1
+vpaddw 1216(%r12), %ymm1, %ymm1
+vmovdqa 352(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 1504(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 928(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 64(%r12), %ymm8
+vmovdqa 928(%r12), %ymm9
+vmovdqa %ymm8, 0(%rsp)
+vmovdqa %ymm0, 32(%rsp)
+vmovdqa %ymm1, 64(%rsp)
+vmovdqa %ymm7, 96(%rsp)
+vmovdqa %ymm5, 128(%rsp)
+vmovdqa %ymm2, 160(%rsp)
+vmovdqa %ymm3, 192(%rsp)
+vmovdqa %ymm9, 224(%rsp)
+vmovdqa 1888(%r12), %ymm0
+vpsubw 1984(%r12), %ymm0, %ymm0
+vmovdqa 2272(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 2080(%r12), %ymm1, %ymm1
+vpsubw 1792(%r12), %ymm0, %ymm0
+vpaddw 2176(%r12), %ymm0, %ymm0
+vmovdqa 2464(%r12), %ymm2
+vpsubw 2560(%r12), %ymm2, %ymm2
+vmovdqa 2848(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 2656(%r12), %ymm3, %ymm3
+vpsubw 2368(%r12), %ymm2, %ymm2
+vpaddw 2752(%r12), %ymm2, %ymm2
+vmovdqa 3040(%r12), %ymm4
+vpsubw 3136(%r12), %ymm4, %ymm4
+vmovdqa 3424(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 3232(%r12), %ymm5, %ymm5
+vpsubw 2944(%r12), %ymm4, %ymm4
+vpaddw 3328(%r12), %ymm4, %ymm4
+vpsubw 2368(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 1792(%r12), %ymm1, %ymm1
+vpaddw 2944(%r12), %ymm1, %ymm1
+vmovdqa 2080(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 3232(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 2656(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 1792(%r12), %ymm8
+vmovdqa 2656(%r12), %ymm9
+vmovdqa %ymm8, 256(%rsp)
+vmovdqa %ymm0, 288(%rsp)
+vmovdqa %ymm1, 320(%rsp)
+vmovdqa %ymm7, 352(%rsp)
+vmovdqa %ymm5, 384(%rsp)
+vmovdqa %ymm2, 416(%rsp)
+vmovdqa %ymm3, 448(%rsp)
+vmovdqa %ymm9, 480(%rsp)
+vmovdqa 3616(%r12), %ymm0
+vpsubw 3712(%r12), %ymm0, %ymm0
+vmovdqa 4000(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 3808(%r12), %ymm1, %ymm1
+vpsubw 3520(%r12), %ymm0, %ymm0
+vpaddw 3904(%r12), %ymm0, %ymm0
+vmovdqa 4192(%r12), %ymm2
+vpsubw 4288(%r12), %ymm2, %ymm2
+vmovdqa 4576(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 4384(%r12), %ymm3, %ymm3
+vpsubw 4096(%r12), %ymm2, %ymm2
+vpaddw 4480(%r12), %ymm2, %ymm2
+vmovdqa 4768(%r12), %ymm4
+vpsubw 4864(%r12), %ymm4, %ymm4
+vmovdqa 5152(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 4960(%r12), %ymm5, %ymm5
+vpsubw 4672(%r12), %ymm4, %ymm4
+vpaddw 5056(%r12), %ymm4, %ymm4
+vpsubw 4096(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 3520(%r12), %ymm1, %ymm1
+vpaddw 4672(%r12), %ymm1, %ymm1
+vmovdqa 3808(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 4960(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 4384(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 3520(%r12), %ymm8
+vmovdqa 4384(%r12), %ymm9
+vmovdqa %ymm8, 512(%rsp)
+vmovdqa %ymm0, 544(%rsp)
+vmovdqa %ymm1, 576(%rsp)
+vmovdqa %ymm7, 608(%rsp)
+vmovdqa %ymm5, 640(%rsp)
+vmovdqa %ymm2, 672(%rsp)
+vmovdqa %ymm3, 704(%rsp)
+vmovdqa %ymm9, 736(%rsp)
+vmovdqa 5344(%r12), %ymm0
+vpsubw 5440(%r12), %ymm0, %ymm0
+vmovdqa 5728(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5536(%r12), %ymm1, %ymm1
+vpsubw 5248(%r12), %ymm0, %ymm0
+vpaddw 5632(%r12), %ymm0, %ymm0
+vmovdqa 5920(%r12), %ymm2
+vpsubw 6016(%r12), %ymm2, %ymm2
+vmovdqa 6304(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 6112(%r12), %ymm3, %ymm3
+vpsubw 5824(%r12), %ymm2, %ymm2
+vpaddw 6208(%r12), %ymm2, %ymm2
+vmovdqa 6496(%r12), %ymm4
+vpsubw 6592(%r12), %ymm4, %ymm4
+vmovdqa 6880(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 6688(%r12), %ymm5, %ymm5
+vpsubw 6400(%r12), %ymm4, %ymm4
+vpaddw 6784(%r12), %ymm4, %ymm4
+vpsubw 5824(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 5248(%r12), %ymm1, %ymm1
+vpaddw 6400(%r12), %ymm1, %ymm1
+vmovdqa 5536(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 6688(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 6112(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 5248(%r12), %ymm8
+vmovdqa 6112(%r12), %ymm9
+vmovdqa %ymm8, 768(%rsp)
+vmovdqa %ymm0, 800(%rsp)
+vmovdqa %ymm1, 832(%rsp)
+vmovdqa %ymm7, 864(%rsp)
+vmovdqa %ymm5, 896(%rsp)
+vmovdqa %ymm2, 928(%rsp)
+vmovdqa %ymm3, 960(%rsp)
+vmovdqa %ymm9, 992(%rsp)
+vmovdqa 7072(%r12), %ymm0
+vpsubw 7168(%r12), %ymm0, %ymm0
+vmovdqa 7456(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 7264(%r12), %ymm1, %ymm1
+vpsubw 6976(%r12), %ymm0, %ymm0
+vpaddw 7360(%r12), %ymm0, %ymm0
+vmovdqa 7648(%r12), %ymm2
+vpsubw 7744(%r12), %ymm2, %ymm2
+vmovdqa 8032(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 7840(%r12), %ymm3, %ymm3
+vpsubw 7552(%r12), %ymm2, %ymm2
+vpaddw 7936(%r12), %ymm2, %ymm2
+vmovdqa 8224(%r12), %ymm4
+vpsubw 8320(%r12), %ymm4, %ymm4
+vmovdqa 8608(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 8416(%r12), %ymm5, %ymm5
+vpsubw 8128(%r12), %ymm4, %ymm4
+vpaddw 8512(%r12), %ymm4, %ymm4
+vpsubw 7552(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 6976(%r12), %ymm1, %ymm1
+vpaddw 8128(%r12), %ymm1, %ymm1
+vmovdqa 7264(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 8416(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 7840(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 6976(%r12), %ymm8
+vmovdqa 7840(%r12), %ymm9
+vmovdqa %ymm8, 1024(%rsp)
+vmovdqa %ymm0, 1056(%rsp)
+vmovdqa %ymm1, 1088(%rsp)
+vmovdqa %ymm7, 1120(%rsp)
+vmovdqa %ymm5, 1152(%rsp)
+vmovdqa %ymm2, 1184(%rsp)
+vmovdqa %ymm3, 1216(%rsp)
+vmovdqa %ymm9, 1248(%rsp)
+vmovdqa 8800(%r12), %ymm0
+vpsubw 8896(%r12), %ymm0, %ymm0
+vmovdqa 9184(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 8992(%r12), %ymm1, %ymm1
+vpsubw 8704(%r12), %ymm0, %ymm0
+vpaddw 9088(%r12), %ymm0, %ymm0
+vmovdqa 9376(%r12), %ymm2
+vpsubw 9472(%r12), %ymm2, %ymm2
+vmovdqa 9760(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 9568(%r12), %ymm3, %ymm3
+vpsubw 9280(%r12), %ymm2, %ymm2
+vpaddw 9664(%r12), %ymm2, %ymm2
+vmovdqa 9952(%r12), %ymm4
+vpsubw 10048(%r12), %ymm4, %ymm4
+vmovdqa 10336(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 10144(%r12), %ymm5, %ymm5
+vpsubw 9856(%r12), %ymm4, %ymm4
+vpaddw 10240(%r12), %ymm4, %ymm4
+vpsubw 9280(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 8704(%r12), %ymm1, %ymm1
+vpaddw 9856(%r12), %ymm1, %ymm1
+vmovdqa 8992(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 10144(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 9568(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 8704(%r12), %ymm8
+vmovdqa 9568(%r12), %ymm9
+vmovdqa %ymm8, 1280(%rsp)
+vmovdqa %ymm0, 1312(%rsp)
+vmovdqa %ymm1, 1344(%rsp)
+vmovdqa %ymm7, 1376(%rsp)
+vmovdqa %ymm5, 1408(%rsp)
+vmovdqa %ymm2, 1440(%rsp)
+vmovdqa %ymm3, 1472(%rsp)
+vmovdqa %ymm9, 1504(%rsp)
+vmovdqa 10528(%r12), %ymm0
+vpsubw 10624(%r12), %ymm0, %ymm0
+vmovdqa 10912(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 10720(%r12), %ymm1, %ymm1
+vpsubw 10432(%r12), %ymm0, %ymm0
+vpaddw 10816(%r12), %ymm0, %ymm0
+vmovdqa 11104(%r12), %ymm2
+vpsubw 11200(%r12), %ymm2, %ymm2
+vmovdqa 11488(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 11296(%r12), %ymm3, %ymm3
+vpsubw 11008(%r12), %ymm2, %ymm2
+vpaddw 11392(%r12), %ymm2, %ymm2
+vmovdqa 11680(%r12), %ymm4
+vpsubw 11776(%r12), %ymm4, %ymm4
+vmovdqa 12064(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 11872(%r12), %ymm5, %ymm5
+vpsubw 11584(%r12), %ymm4, %ymm4
+vpaddw 11968(%r12), %ymm4, %ymm4
+vpsubw 11008(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 10432(%r12), %ymm1, %ymm1
+vpaddw 11584(%r12), %ymm1, %ymm1
+vmovdqa 10720(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 11872(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 11296(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 10432(%r12), %ymm8
+vmovdqa 11296(%r12), %ymm9
+vmovdqa %ymm8, 1536(%rsp)
+vmovdqa %ymm0, 1568(%rsp)
+vmovdqa %ymm1, 1600(%rsp)
+vmovdqa %ymm7, 1632(%rsp)
+vmovdqa %ymm5, 1664(%rsp)
+vmovdqa %ymm2, 1696(%rsp)
+vmovdqa %ymm3, 1728(%rsp)
+vmovdqa %ymm9, 1760(%rsp)
+vmovdqa 0(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm4
+vpunpckhwd const0(%rip), %ymm11, %ymm2
+vpslld $1, %ymm4, %ymm4
+vpslld $1, %ymm2, %ymm2
+vmovdqa 256(%rsp), %ymm9
+vpunpcklwd const0(%rip), %ymm9, %ymm8
+vpunpckhwd const0(%rip), %ymm9, %ymm9
+vmovdqa 512(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm8, %ymm3
+vpaddd %ymm6, %ymm9, %ymm7
+vpsubd %ymm4, %ymm3, %ymm3
+vpsubd %ymm2, %ymm7, %ymm7
+vpsubd %ymm5, %ymm8, %ymm5
+vpsubd %ymm6, %ymm9, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1536(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm9
+vpunpckhwd const0(%rip), %ymm5, %ymm8
+vpslld $1, %ymm9, %ymm9
+vpslld $1, %ymm8, %ymm8
+vpsubd %ymm9, %ymm3, %ymm3
+vpsubd %ymm8, %ymm7, %ymm7
+vpsrld $1, %ymm3, %ymm3
+vpsrld $1, %ymm7, %ymm7
+vpand mask32_to_16(%rip), %ymm3, %ymm3
+vpand mask32_to_16(%rip), %ymm7, %ymm7
+vpackusdw %ymm7, %ymm3, %ymm7
+vmovdqa 768(%rsp), %ymm3
+vpaddw 1024(%rsp), %ymm3, %ymm8
+vpsubw 1024(%rsp), %ymm3, %ymm3
+vpsrlw $2, %ymm3, %ymm3
+vpsubw %ymm6, %ymm3, %ymm3
+vpmullw %ymm14, %ymm3, %ymm3
+vpsllw $1, %ymm11, %ymm9
+vpsubw %ymm9, %ymm8, %ymm9
+vpsllw $7, %ymm5, %ymm8
+vpsubw %ymm8, %ymm9, %ymm8
+vpsrlw $3, %ymm8, %ymm8
+vpsubw %ymm7, %ymm8, %ymm8
+vmovdqa 1280(%rsp), %ymm9
+vpsubw %ymm11, %ymm9, %ymm9
+vpmullw %ymm15, %ymm5, %ymm2
+vpsubw %ymm2, %ymm9, %ymm2
+vpmullw %ymm14, %ymm8, %ymm8
+vpsubw %ymm8, %ymm7, %ymm7
+vpmullw %ymm12, %ymm8, %ymm9
+vpaddw %ymm9, %ymm7, %ymm9
+vpmullw %ymm12, %ymm9, %ymm9
+vpsubw %ymm9, %ymm2, %ymm9
+vpmullw %ymm14, %ymm9, %ymm9
+vpsubw %ymm6, %ymm9, %ymm9
+vpsrlw $3, %ymm9, %ymm9
+vpsubw %ymm3, %ymm9, %ymm9
+vpsubw %ymm9, %ymm3, %ymm3
+vpsubw %ymm3, %ymm6, %ymm6
+vpmullw %ymm13, %ymm9, %ymm9
+vpsubw %ymm9, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_4_3_1(%rip), %ymm8, %ymm2
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $139, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm4
+vpor %ymm4, %ymm8, %ymm8
+vpaddw 2048(%rsp), %ymm11, %ymm11
+vpaddw %ymm8, %ymm11, %ymm11
+vmovdqa %xmm2, 2048(%rsp)
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_4_3_1(%rip), %ymm9, %ymm2
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $139, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm4
+vpor %ymm4, %ymm9, %ymm9
+vpaddw 2304(%rsp), %ymm6, %ymm6
+vpaddw %ymm9, %ymm6, %ymm6
+vmovdqa %xmm2, 2304(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_4_3_1(%rip), %ymm5, %ymm2
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $139, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm4
+vpor %ymm4, %ymm5, %ymm5
+vpaddw 2560(%rsp), %ymm7, %ymm7
+vpaddw %ymm5, %ymm7, %ymm7
+vmovdqa %xmm2, 2560(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %xmm11, 64(%rdi)
+vextracti128 $1, %ymm11, %xmm11
+vmovq %xmm11, 80(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %xmm6, 416(%rdi)
+vextracti128 $1, %ymm6, %xmm6
+vmovq %xmm6, 432(%rdi)
+vpand mask_mod8192(%rip), %ymm7, %ymm7
+vmovdqu %xmm7, 768(%rdi)
+vextracti128 $1, %ymm7, %xmm7
+vmovq %xmm7, 784(%rdi)
+vpand mask_mod8192(%rip), %ymm3, %ymm3
+vmovdqu %xmm3, 1120(%rdi)
+vextracti128 $1, %ymm3, %xmm3
+vmovq %xmm3, 1136(%rdi)
+vmovdqa 32(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm9
+vpunpckhwd const0(%rip), %ymm5, %ymm8
+vpslld $1, %ymm9, %ymm9
+vpslld $1, %ymm8, %ymm8
+vmovdqa 288(%rsp), %ymm3
+vpunpcklwd const0(%rip), %ymm3, %ymm7
+vpunpckhwd const0(%rip), %ymm3, %ymm3
+vmovdqa 544(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm7, %ymm2
+vpaddd %ymm6, %ymm3, %ymm4
+vpsubd %ymm9, %ymm2, %ymm2
+vpsubd %ymm8, %ymm4, %ymm4
+vpsubd %ymm11, %ymm7, %ymm11
+vpsubd %ymm6, %ymm3, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1568(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm3
+vpunpckhwd const0(%rip), %ymm11, %ymm7
+vpslld $1, %ymm3, %ymm3
+vpslld $1, %ymm7, %ymm7
+vpsubd %ymm3, %ymm2, %ymm2
+vpsubd %ymm7, %ymm4, %ymm4
+vpsrld $1, %ymm2, %ymm2
+vpsrld $1, %ymm4, %ymm4
+vpand mask32_to_16(%rip), %ymm2, %ymm2
+vpand mask32_to_16(%rip), %ymm4, %ymm4
+vpackusdw %ymm4, %ymm2, %ymm4
+vmovdqa 800(%rsp), %ymm2
+vpaddw 1056(%rsp), %ymm2, %ymm7
+vpsubw 1056(%rsp), %ymm2, %ymm2
+vpsrlw $2, %ymm2, %ymm2
+vpsubw %ymm6, %ymm2, %ymm2
+vpmullw %ymm14, %ymm2, %ymm2
+vpsllw $1, %ymm5, %ymm3
+vpsubw %ymm3, %ymm7, %ymm3
+vpsllw $7, %ymm11, %ymm7
+vpsubw %ymm7, %ymm3, %ymm7
+vpsrlw $3, %ymm7, %ymm7
+vpsubw %ymm4, %ymm7, %ymm7
+vmovdqa 1312(%rsp), %ymm3
+vpsubw %ymm5, %ymm3, %ymm3
+vpmullw %ymm15, %ymm11, %ymm8
+vpsubw %ymm8, %ymm3, %ymm8
+vpmullw %ymm14, %ymm7, %ymm7
+vpsubw %ymm7, %ymm4, %ymm4
+vpmullw %ymm12, %ymm7, %ymm3
+vpaddw %ymm3, %ymm4, %ymm3
+vpmullw %ymm12, %ymm3, %ymm3
+vpsubw %ymm3, %ymm8, %ymm3
+vpmullw %ymm14, %ymm3, %ymm3
+vpsubw %ymm6, %ymm3, %ymm3
+vpsrlw $3, %ymm3, %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw %ymm3, %ymm2, %ymm2
+vpsubw %ymm2, %ymm6, %ymm6
+vpmullw %ymm13, %ymm3, %ymm3
+vpsubw %ymm3, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm7, %ymm7
+vpand mask3_5_4_3_1(%rip), %ymm7, %ymm8
+vpand mask5_3_5_3(%rip), %ymm7, %ymm7
+vpermq $139, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm9
+vpor %ymm9, %ymm7, %ymm7
+vpaddw 2080(%rsp), %ymm5, %ymm5
+vpaddw %ymm7, %ymm5, %ymm5
+vmovdqa %xmm8, 2080(%rsp)
+vpshufb shuf48_16(%rip), %ymm3, %ymm3
+vpand mask3_5_4_3_1(%rip), %ymm3, %ymm8
+vpand mask5_3_5_3(%rip), %ymm3, %ymm3
+vpermq $139, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm9
+vpor %ymm9, %ymm3, %ymm3
+vpaddw 2336(%rsp), %ymm6, %ymm6
+vpaddw %ymm3, %ymm6, %ymm6
+vmovdqa %xmm8, 2336(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_4_3_1(%rip), %ymm11, %ymm8
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $139, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm9
+vpor %ymm9, %ymm11, %ymm11
+vpaddw 2592(%rsp), %ymm4, %ymm4
+vpaddw %ymm11, %ymm4, %ymm4
+vmovdqa %xmm8, 2592(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %xmm5, 152(%rdi)
+vextracti128 $1, %ymm5, %xmm5
+vmovq %xmm5, 168(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %xmm6, 504(%rdi)
+vextracti128 $1, %ymm6, %xmm6
+vmovq %xmm6, 520(%rdi)
+vpand mask_mod8192(%rip), %ymm4, %ymm4
+vmovdqu %xmm4, 856(%rdi)
+vextracti128 $1, %ymm4, %xmm4
+vmovq %xmm4, 872(%rdi)
+vpand mask_mod8192(%rip), %ymm2, %ymm2
+vmovdqu %xmm2, 1208(%rdi)
+vextracti128 $1, %ymm2, %xmm2
+vmovq %xmm2, 1224(%rdi)
+vmovdqa 64(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm3
+vpunpckhwd const0(%rip), %ymm11, %ymm7
+vpslld $1, %ymm3, %ymm3
+vpslld $1, %ymm7, %ymm7
+vmovdqa 320(%rsp), %ymm2
+vpunpcklwd const0(%rip), %ymm2, %ymm4
+vpunpckhwd const0(%rip), %ymm2, %ymm2
+vmovdqa 576(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm4, %ymm8
+vpaddd %ymm6, %ymm2, %ymm9
+vpsubd %ymm3, %ymm8, %ymm8
+vpsubd %ymm7, %ymm9, %ymm9
+vpsubd %ymm5, %ymm4, %ymm5
+vpsubd %ymm6, %ymm2, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1600(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm2
+vpunpckhwd const0(%rip), %ymm5, %ymm4
+vpslld $1, %ymm2, %ymm2
+vpslld $1, %ymm4, %ymm4
+vpsubd %ymm2, %ymm8, %ymm8
+vpsubd %ymm4, %ymm9, %ymm9
+vpsrld $1, %ymm8, %ymm8
+vpsrld $1, %ymm9, %ymm9
+vpand mask32_to_16(%rip), %ymm8, %ymm8
+vpand mask32_to_16(%rip), %ymm9, %ymm9
+vpackusdw %ymm9, %ymm8, %ymm9
+vmovdqa 832(%rsp), %ymm8
+vpaddw 1088(%rsp), %ymm8, %ymm4
+vpsubw 1088(%rsp), %ymm8, %ymm8
+vpsrlw $2, %ymm8, %ymm8
+vpsubw %ymm6, %ymm8, %ymm8
+vpmullw %ymm14, %ymm8, %ymm8
+vpsllw $1, %ymm11, %ymm2
+vpsubw %ymm2, %ymm4, %ymm2
+vpsllw $7, %ymm5, %ymm4
+vpsubw %ymm4, %ymm2, %ymm4
+vpsrlw $3, %ymm4, %ymm4
+vpsubw %ymm9, %ymm4, %ymm4
+vmovdqa 1344(%rsp), %ymm2
+vpsubw %ymm11, %ymm2, %ymm2
+vpmullw %ymm15, %ymm5, %ymm7
+vpsubw %ymm7, %ymm2, %ymm7
+vpmullw %ymm14, %ymm4, %ymm4
+vpsubw %ymm4, %ymm9, %ymm9
+vpmullw %ymm12, %ymm4, %ymm2
+vpaddw %ymm2, %ymm9, %ymm2
+vpmullw %ymm12, %ymm2, %ymm2
+vpsubw %ymm2, %ymm7, %ymm2
+vpmullw %ymm14, %ymm2, %ymm2
+vpsubw %ymm6, %ymm2, %ymm2
+vpsrlw $3, %ymm2, %ymm2
+vpsubw %ymm8, %ymm2, %ymm2
+vpsubw %ymm2, %ymm8, %ymm8
+vpsubw %ymm8, %ymm6, %ymm6
+vpmullw %ymm13, %ymm2, %ymm2
+vpsubw %ymm2, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_4_3_1(%rip), %ymm4, %ymm7
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $139, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm3
+vpor %ymm3, %ymm4, %ymm4
+vpaddw 2112(%rsp), %ymm11, %ymm11
+vpaddw %ymm4, %ymm11, %ymm11
+vmovdqa %xmm7, 2112(%rsp)
+vpshufb shuf48_16(%rip), %ymm2, %ymm2
+vpand mask3_5_4_3_1(%rip), %ymm2, %ymm7
+vpand mask5_3_5_3(%rip), %ymm2, %ymm2
+vpermq $139, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm3
+vpor %ymm3, %ymm2, %ymm2
+vpaddw 2368(%rsp), %ymm6, %ymm6
+vpaddw %ymm2, %ymm6, %ymm6
+vmovdqa %xmm7, 2368(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_4_3_1(%rip), %ymm5, %ymm7
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $139, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm3
+vpor %ymm3, %ymm5, %ymm5
+vpaddw 2624(%rsp), %ymm9, %ymm9
+vpaddw %ymm5, %ymm9, %ymm9
+vmovdqa %xmm7, 2624(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %xmm11, 240(%rdi)
+vextracti128 $1, %ymm11, %xmm11
+vmovq %xmm11, 256(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %xmm6, 592(%rdi)
+vextracti128 $1, %ymm6, %xmm6
+vmovq %xmm6, 608(%rdi)
+vpand mask_mod8192(%rip), %ymm9, %ymm9
+vmovdqu %xmm9, 944(%rdi)
+vextracti128 $1, %ymm9, %xmm9
+vmovq %xmm9, 960(%rdi)
+vpand mask_mod8192(%rip), %ymm8, %ymm8
+vmovdqu %xmm8, 1296(%rdi)
+vextracti128 $1, %ymm8, %xmm8
+vmovq %xmm8, 1312(%rdi)
+vmovdqa 96(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm2
+vpunpckhwd const0(%rip), %ymm5, %ymm4
+vpslld $1, %ymm2, %ymm2
+vpslld $1, %ymm4, %ymm4
+vmovdqa 352(%rsp), %ymm8
+vpunpcklwd const0(%rip), %ymm8, %ymm9
+vpunpckhwd const0(%rip), %ymm8, %ymm8
+vmovdqa 608(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm9, %ymm7
+vpaddd %ymm6, %ymm8, %ymm3
+vpsubd %ymm2, %ymm7, %ymm7
+vpsubd %ymm4, %ymm3, %ymm3
+vpsubd %ymm11, %ymm9, %ymm11
+vpsubd %ymm6, %ymm8, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1632(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm8
+vpunpckhwd const0(%rip), %ymm11, %ymm9
+vpslld $1, %ymm8, %ymm8
+vpslld $1, %ymm9, %ymm9
+vpsubd %ymm8, %ymm7, %ymm7
+vpsubd %ymm9, %ymm3, %ymm3
+vpsrld $1, %ymm7, %ymm7
+vpsrld $1, %ymm3, %ymm3
+vpand mask32_to_16(%rip), %ymm7, %ymm7
+vpand mask32_to_16(%rip), %ymm3, %ymm3
+vpackusdw %ymm3, %ymm7, %ymm3
+vmovdqa 864(%rsp), %ymm7
+vpaddw 1120(%rsp), %ymm7, %ymm9
+vpsubw 1120(%rsp), %ymm7, %ymm7
+vpsrlw $2, %ymm7, %ymm7
+vpsubw %ymm6, %ymm7, %ymm7
+vpmullw %ymm14, %ymm7, %ymm7
+vpsllw $1, %ymm5, %ymm8
+vpsubw %ymm8, %ymm9, %ymm8
+vpsllw $7, %ymm11, %ymm9
+vpsubw %ymm9, %ymm8, %ymm9
+vpsrlw $3, %ymm9, %ymm9
+vpsubw %ymm3, %ymm9, %ymm9
+vmovdqa 1376(%rsp), %ymm8
+vpsubw %ymm5, %ymm8, %ymm8
+vpmullw %ymm15, %ymm11, %ymm4
+vpsubw %ymm4, %ymm8, %ymm4
+vpmullw %ymm14, %ymm9, %ymm9
+vpsubw %ymm9, %ymm3, %ymm3
+vpmullw %ymm12, %ymm9, %ymm8
+vpaddw %ymm8, %ymm3, %ymm8
+vpmullw %ymm12, %ymm8, %ymm8
+vpsubw %ymm8, %ymm4, %ymm8
+vpmullw %ymm14, %ymm8, %ymm8
+vpsubw %ymm6, %ymm8, %ymm8
+vpsrlw $3, %ymm8, %ymm8
+vpsubw %ymm7, %ymm8, %ymm8
+vpsubw %ymm8, %ymm7, %ymm7
+vpsubw %ymm7, %ymm6, %ymm6
+vpmullw %ymm13, %ymm8, %ymm8
+vpsubw %ymm8, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_4_3_1(%rip), %ymm9, %ymm4
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $139, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm2
+vpor %ymm2, %ymm9, %ymm9
+vpaddw 2144(%rsp), %ymm5, %ymm5
+vpaddw %ymm9, %ymm5, %ymm5
+vmovdqa %xmm4, 2144(%rsp)
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_4_3_1(%rip), %ymm8, %ymm4
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $139, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm2
+vpor %ymm2, %ymm8, %ymm8
+vpaddw 2400(%rsp), %ymm6, %ymm6
+vpaddw %ymm8, %ymm6, %ymm6
+vmovdqa %xmm4, 2400(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_4_3_1(%rip), %ymm11, %ymm4
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $139, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm2
+vpor %ymm2, %ymm11, %ymm11
+vpaddw 2656(%rsp), %ymm3, %ymm3
+vpaddw %ymm11, %ymm3, %ymm3
+vmovdqa %xmm4, 2656(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %xmm5, 328(%rdi)
+vextracti128 $1, %ymm5, %xmm5
+vmovq %xmm5, 344(%rdi)
+vpshufb shufmin1_mask3(%rip), %ymm5, %ymm5
+vmovdqa %xmm5, 1792(%rsp)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %xmm6, 680(%rdi)
+vextracti128 $1, %ymm6, %xmm6
+vmovq %xmm6, 696(%rdi)
+vpshufb shufmin1_mask3(%rip), %ymm6, %ymm6
+vmovdqa %xmm6, 1824(%rsp)
+vpand mask_mod8192(%rip), %ymm3, %ymm3
+vmovdqu %xmm3, 1032(%rdi)
+vextracti128 $1, %ymm3, %xmm3
+vmovq %xmm3, 1048(%rdi)
+vpshufb shufmin1_mask3(%rip), %ymm3, %ymm3
+vmovdqa %xmm3, 1856(%rsp)
+vpand mask_mod8192(%rip), %ymm7, %ymm7
+vmovdqu %xmm7, 1384(%rdi)
+vextracti128 $1, %ymm7, %xmm7
+vpextrw $0, %xmm7, 1400(%rdi)
+vpshufb shufmin1_mask3(%rip), %ymm7, %ymm7
+vmovdqa %xmm7, 1888(%rsp)
+vmovdqa 128(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm8
+vpunpckhwd const0(%rip), %ymm11, %ymm9
+vpslld $1, %ymm8, %ymm8
+vpslld $1, %ymm9, %ymm9
+vmovdqa 384(%rsp), %ymm7
+vpunpcklwd const0(%rip), %ymm7, %ymm3
+vpunpckhwd const0(%rip), %ymm7, %ymm7
+vmovdqa 640(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm3, %ymm4
+vpaddd %ymm6, %ymm7, %ymm2
+vpsubd %ymm8, %ymm4, %ymm4
+vpsubd %ymm9, %ymm2, %ymm2
+vpsubd %ymm5, %ymm3, %ymm5
+vpsubd %ymm6, %ymm7, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1664(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm7
+vpunpckhwd const0(%rip), %ymm5, %ymm3
+vpslld $1, %ymm7, %ymm7
+vpslld $1, %ymm3, %ymm3
+vpsubd %ymm7, %ymm4, %ymm4
+vpsubd %ymm3, %ymm2, %ymm2
+vpsrld $1, %ymm4, %ymm4
+vpsrld $1, %ymm2, %ymm2
+vpand mask32_to_16(%rip), %ymm4, %ymm4
+vpand mask32_to_16(%rip), %ymm2, %ymm2
+vpackusdw %ymm2, %ymm4, %ymm2
+vmovdqa 896(%rsp), %ymm4
+vpaddw 1152(%rsp), %ymm4, %ymm3
+vpsubw 1152(%rsp), %ymm4, %ymm4
+vpsrlw $2, %ymm4, %ymm4
+vpsubw %ymm6, %ymm4, %ymm4
+vpmullw %ymm14, %ymm4, %ymm4
+vpsllw $1, %ymm11, %ymm7
+vpsubw %ymm7, %ymm3, %ymm7
+vpsllw $7, %ymm5, %ymm3
+vpsubw %ymm3, %ymm7, %ymm3
+vpsrlw $3, %ymm3, %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vmovdqa 1408(%rsp), %ymm7
+vpsubw %ymm11, %ymm7, %ymm7
+vpmullw %ymm15, %ymm5, %ymm9
+vpsubw %ymm9, %ymm7, %ymm9
+vpmullw %ymm14, %ymm3, %ymm3
+vpsubw %ymm3, %ymm2, %ymm2
+vpmullw %ymm12, %ymm3, %ymm7
+vpaddw %ymm7, %ymm2, %ymm7
+vpmullw %ymm12, %ymm7, %ymm7
+vpsubw %ymm7, %ymm9, %ymm7
+vpmullw %ymm14, %ymm7, %ymm7
+vpsubw %ymm6, %ymm7, %ymm7
+vpsrlw $3, %ymm7, %ymm7
+vpsubw %ymm4, %ymm7, %ymm7
+vpsubw %ymm7, %ymm4, %ymm4
+vpsubw %ymm4, %ymm6, %ymm6
+vpmullw %ymm13, %ymm7, %ymm7
+vpsubw %ymm7, %ymm6, %ymm6
+vmovdqu 416(%rdi), %ymm9
+vmovdqu 768(%rdi), %ymm8
+vmovdqu 1120(%rdi), %ymm10
+vpaddw %ymm11, %ymm9, %ymm11
+vpaddw %ymm6, %ymm8, %ymm6
+vpaddw %ymm2, %ymm10, %ymm2
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_4_3_1(%rip), %ymm4, %ymm10
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $139, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm8
+vpor %ymm8, %ymm4, %ymm4
+vmovdqu 64(%rdi), %ymm8
+vpaddw 1920(%rsp), %ymm8, %ymm8
+vpaddw %ymm4, %ymm8, %ymm8
+vpand mask_mod8192(%rip), %ymm8, %ymm8
+vmovdqu %xmm8, 64(%rdi)
+vextracti128 $1, %ymm8, %xmm8
+vmovq %xmm8, 80(%rdi)
+vmovdqa %xmm10, 1920(%rsp)
+vpshufb shuf48_16(%rip), %ymm3, %ymm3
+vpand mask3_5_4_3_1(%rip), %ymm3, %ymm10
+vpand mask5_3_5_3(%rip), %ymm3, %ymm3
+vpermq $139, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm8
+vpor %ymm8, %ymm3, %ymm3
+vpaddw 2176(%rsp), %ymm11, %ymm11
+vpaddw %ymm3, %ymm11, %ymm11
+vmovdqa %xmm10, 2176(%rsp)
+vpshufb shuf48_16(%rip), %ymm7, %ymm7
+vpand mask3_5_4_3_1(%rip), %ymm7, %ymm10
+vpand mask5_3_5_3(%rip), %ymm7, %ymm7
+vpermq $139, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm8
+vpor %ymm8, %ymm7, %ymm7
+vpaddw 2432(%rsp), %ymm6, %ymm6
+vpaddw %ymm7, %ymm6, %ymm6
+vmovdqa %xmm10, 2432(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_4_3_1(%rip), %ymm5, %ymm10
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $139, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm8
+vpor %ymm8, %ymm5, %ymm5
+vpaddw 2688(%rsp), %ymm2, %ymm2
+vpaddw %ymm5, %ymm2, %ymm2
+vmovdqa %xmm10, 2688(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %xmm11, 416(%rdi)
+vextracti128 $1, %ymm11, %xmm11
+vmovq %xmm11, 432(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %xmm6, 768(%rdi)
+vextracti128 $1, %ymm6, %xmm6
+vmovq %xmm6, 784(%rdi)
+vpand mask_mod8192(%rip), %ymm2, %ymm2
+vmovdqu %xmm2, 1120(%rdi)
+vextracti128 $1, %ymm2, %xmm2
+vmovq %xmm2, 1136(%rdi)
+vmovdqa 160(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm7
+vpunpckhwd const0(%rip), %ymm5, %ymm3
+vpslld $1, %ymm7, %ymm7
+vpslld $1, %ymm3, %ymm3
+vmovdqa 416(%rsp), %ymm4
+vpunpcklwd const0(%rip), %ymm4, %ymm2
+vpunpckhwd const0(%rip), %ymm4, %ymm4
+vmovdqa 672(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm2, %ymm10
+vpaddd %ymm6, %ymm4, %ymm8
+vpsubd %ymm7, %ymm10, %ymm10
+vpsubd %ymm3, %ymm8, %ymm8
+vpsubd %ymm11, %ymm2, %ymm11
+vpsubd %ymm6, %ymm4, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1696(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm4
+vpunpckhwd const0(%rip), %ymm11, %ymm2
+vpslld $1, %ymm4, %ymm4
+vpslld $1, %ymm2, %ymm2
+vpsubd %ymm4, %ymm10, %ymm10
+vpsubd %ymm2, %ymm8, %ymm8
+vpsrld $1, %ymm10, %ymm10
+vpsrld $1, %ymm8, %ymm8
+vpand mask32_to_16(%rip), %ymm10, %ymm10
+vpand mask32_to_16(%rip), %ymm8, %ymm8
+vpackusdw %ymm8, %ymm10, %ymm8
+vmovdqa 928(%rsp), %ymm10
+vpaddw 1184(%rsp), %ymm10, %ymm2
+vpsubw 1184(%rsp), %ymm10, %ymm10
+vpsrlw $2, %ymm10, %ymm10
+vpsubw %ymm6, %ymm10, %ymm10
+vpmullw %ymm14, %ymm10, %ymm10
+vpsllw $1, %ymm5, %ymm4
+vpsubw %ymm4, %ymm2, %ymm4
+vpsllw $7, %ymm11, %ymm2
+vpsubw %ymm2, %ymm4, %ymm2
+vpsrlw $3, %ymm2, %ymm2
+vpsubw %ymm8, %ymm2, %ymm2
+vmovdqa 1440(%rsp), %ymm4
+vpsubw %ymm5, %ymm4, %ymm4
+vpmullw %ymm15, %ymm11, %ymm3
+vpsubw %ymm3, %ymm4, %ymm3
+vpmullw %ymm14, %ymm2, %ymm2
+vpsubw %ymm2, %ymm8, %ymm8
+vpmullw %ymm12, %ymm2, %ymm4
+vpaddw %ymm4, %ymm8, %ymm4
+vpmullw %ymm12, %ymm4, %ymm4
+vpsubw %ymm4, %ymm3, %ymm4
+vpmullw %ymm14, %ymm4, %ymm4
+vpsubw %ymm6, %ymm4, %ymm4
+vpsrlw $3, %ymm4, %ymm4
+vpsubw %ymm10, %ymm4, %ymm4
+vpsubw %ymm4, %ymm10, %ymm10
+vpsubw %ymm10, %ymm6, %ymm6
+vpmullw %ymm13, %ymm4, %ymm4
+vpsubw %ymm4, %ymm6, %ymm6
+vmovdqu 504(%rdi), %ymm3
+vmovdqu 856(%rdi), %ymm7
+vmovdqu 1208(%rdi), %ymm9
+vpaddw %ymm5, %ymm3, %ymm5
+vpaddw %ymm6, %ymm7, %ymm6
+vpaddw %ymm8, %ymm9, %ymm8
+vpshufb shuf48_16(%rip), %ymm10, %ymm10
+vpand mask3_5_4_3_1(%rip), %ymm10, %ymm9
+vpand mask5_3_5_3(%rip), %ymm10, %ymm10
+vpermq $139, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm7
+vpor %ymm7, %ymm10, %ymm10
+vmovdqu 152(%rdi), %ymm7
+vpaddw 1952(%rsp), %ymm7, %ymm7
+vpaddw %ymm10, %ymm7, %ymm7
+vpand mask_mod8192(%rip), %ymm7, %ymm7
+vmovdqu %xmm7, 152(%rdi)
+vextracti128 $1, %ymm7, %xmm7
+vmovq %xmm7, 168(%rdi)
+vmovdqa %xmm9, 1952(%rsp)
+vpshufb shuf48_16(%rip), %ymm2, %ymm2
+vpand mask3_5_4_3_1(%rip), %ymm2, %ymm9
+vpand mask5_3_5_3(%rip), %ymm2, %ymm2
+vpermq $139, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm7
+vpor %ymm7, %ymm2, %ymm2
+vpaddw 2208(%rsp), %ymm5, %ymm5
+vpaddw %ymm2, %ymm5, %ymm5
+vmovdqa %xmm9, 2208(%rsp)
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_4_3_1(%rip), %ymm4, %ymm9
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $139, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm7
+vpor %ymm7, %ymm4, %ymm4
+vpaddw 2464(%rsp), %ymm6, %ymm6
+vpaddw %ymm4, %ymm6, %ymm6
+vmovdqa %xmm9, 2464(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_4_3_1(%rip), %ymm11, %ymm9
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $139, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm7
+vpor %ymm7, %ymm11, %ymm11
+vpaddw 2720(%rsp), %ymm8, %ymm8
+vpaddw %ymm11, %ymm8, %ymm8
+vmovdqa %xmm9, 2720(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %xmm5, 504(%rdi)
+vextracti128 $1, %ymm5, %xmm5
+vmovq %xmm5, 520(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %xmm6, 856(%rdi)
+vextracti128 $1, %ymm6, %xmm6
+vmovq %xmm6, 872(%rdi)
+vpand mask_mod8192(%rip), %ymm8, %ymm8
+vmovdqu %xmm8, 1208(%rdi)
+vextracti128 $1, %ymm8, %xmm8
+vmovq %xmm8, 1224(%rdi)
+vmovdqa 192(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm4
+vpunpckhwd const0(%rip), %ymm11, %ymm2
+vpslld $1, %ymm4, %ymm4
+vpslld $1, %ymm2, %ymm2
+vmovdqa 448(%rsp), %ymm10
+vpunpcklwd const0(%rip), %ymm10, %ymm8
+vpunpckhwd const0(%rip), %ymm10, %ymm10
+vmovdqa 704(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm8, %ymm9
+vpaddd %ymm6, %ymm10, %ymm7
+vpsubd %ymm4, %ymm9, %ymm9
+vpsubd %ymm2, %ymm7, %ymm7
+vpsubd %ymm5, %ymm8, %ymm5
+vpsubd %ymm6, %ymm10, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1728(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm10
+vpunpckhwd const0(%rip), %ymm5, %ymm8
+vpslld $1, %ymm10, %ymm10
+vpslld $1, %ymm8, %ymm8
+vpsubd %ymm10, %ymm9, %ymm9
+vpsubd %ymm8, %ymm7, %ymm7
+vpsrld $1, %ymm9, %ymm9
+vpsrld $1, %ymm7, %ymm7
+vpand mask32_to_16(%rip), %ymm9, %ymm9
+vpand mask32_to_16(%rip), %ymm7, %ymm7
+vpackusdw %ymm7, %ymm9, %ymm7
+vmovdqa 960(%rsp), %ymm9
+vpaddw 1216(%rsp), %ymm9, %ymm8
+vpsubw 1216(%rsp), %ymm9, %ymm9
+vpsrlw $2, %ymm9, %ymm9
+vpsubw %ymm6, %ymm9, %ymm9
+vpmullw %ymm14, %ymm9, %ymm9
+vpsllw $1, %ymm11, %ymm10
+vpsubw %ymm10, %ymm8, %ymm10
+vpsllw $7, %ymm5, %ymm8
+vpsubw %ymm8, %ymm10, %ymm8
+vpsrlw $3, %ymm8, %ymm8
+vpsubw %ymm7, %ymm8, %ymm8
+vmovdqa 1472(%rsp), %ymm10
+vpsubw %ymm11, %ymm10, %ymm10
+vpmullw %ymm15, %ymm5, %ymm2
+vpsubw %ymm2, %ymm10, %ymm2
+vpmullw %ymm14, %ymm8, %ymm8
+vpsubw %ymm8, %ymm7, %ymm7
+vpmullw %ymm12, %ymm8, %ymm10
+vpaddw %ymm10, %ymm7, %ymm10
+vpmullw %ymm12, %ymm10, %ymm10
+vpsubw %ymm10, %ymm2, %ymm10
+vpmullw %ymm14, %ymm10, %ymm10
+vpsubw %ymm6, %ymm10, %ymm10
+vpsrlw $3, %ymm10, %ymm10
+vpsubw %ymm9, %ymm10, %ymm10
+vpsubw %ymm10, %ymm9, %ymm9
+vpsubw %ymm9, %ymm6, %ymm6
+vpmullw %ymm13, %ymm10, %ymm10
+vpsubw %ymm10, %ymm6, %ymm6
+vmovdqu 592(%rdi), %ymm2
+vmovdqu 944(%rdi), %ymm4
+vmovdqu 1296(%rdi), %ymm3
+vpaddw %ymm11, %ymm2, %ymm11
+vpaddw %ymm6, %ymm4, %ymm6
+vpaddw %ymm7, %ymm3, %ymm7
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_4_3_1(%rip), %ymm9, %ymm3
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $139, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm4
+vpor %ymm4, %ymm9, %ymm9
+vmovdqu 240(%rdi), %ymm4
+vpaddw 1984(%rsp), %ymm4, %ymm4
+vpaddw %ymm9, %ymm4, %ymm4
+vpand mask_mod8192(%rip), %ymm4, %ymm4
+vmovdqu %xmm4, 240(%rdi)
+vextracti128 $1, %ymm4, %xmm4
+vmovq %xmm4, 256(%rdi)
+vmovdqa %xmm3, 1984(%rsp)
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_4_3_1(%rip), %ymm8, %ymm3
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $139, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm4
+vpor %ymm4, %ymm8, %ymm8
+vpaddw 2240(%rsp), %ymm11, %ymm11
+vpaddw %ymm8, %ymm11, %ymm11
+vmovdqa %xmm3, 2240(%rsp)
+vpshufb shuf48_16(%rip), %ymm10, %ymm10
+vpand mask3_5_4_3_1(%rip), %ymm10, %ymm3
+vpand mask5_3_5_3(%rip), %ymm10, %ymm10
+vpermq $139, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm4
+vpor %ymm4, %ymm10, %ymm10
+vpaddw 2496(%rsp), %ymm6, %ymm6
+vpaddw %ymm10, %ymm6, %ymm6
+vmovdqa %xmm3, 2496(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_4_3_1(%rip), %ymm5, %ymm3
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $139, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm4
+vpor %ymm4, %ymm5, %ymm5
+vpaddw 2752(%rsp), %ymm7, %ymm7
+vpaddw %ymm5, %ymm7, %ymm7
+vmovdqa %xmm3, 2752(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %xmm11, 592(%rdi)
+vextracti128 $1, %ymm11, %xmm11
+vmovq %xmm11, 608(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %xmm6, 944(%rdi)
+vextracti128 $1, %ymm6, %xmm6
+vmovq %xmm6, 960(%rdi)
+vpand mask_mod8192(%rip), %ymm7, %ymm7
+vmovdqu %xmm7, 1296(%rdi)
+vextracti128 $1, %ymm7, %xmm7
+vmovq %xmm7, 1312(%rdi)
+vmovdqa 224(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm10
+vpunpckhwd const0(%rip), %ymm5, %ymm8
+vpslld $1, %ymm10, %ymm10
+vpslld $1, %ymm8, %ymm8
+vmovdqa 480(%rsp), %ymm9
+vpunpcklwd const0(%rip), %ymm9, %ymm7
+vpunpckhwd const0(%rip), %ymm9, %ymm9
+vmovdqa 736(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm7, %ymm3
+vpaddd %ymm6, %ymm9, %ymm4
+vpsubd %ymm10, %ymm3, %ymm3
+vpsubd %ymm8, %ymm4, %ymm4
+vpsubd %ymm11, %ymm7, %ymm11
+vpsubd %ymm6, %ymm9, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1760(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm9
+vpunpckhwd const0(%rip), %ymm11, %ymm7
+vpslld $1, %ymm9, %ymm9
+vpslld $1, %ymm7, %ymm7
+vpsubd %ymm9, %ymm3, %ymm3
+vpsubd %ymm7, %ymm4, %ymm4
+vpsrld $1, %ymm3, %ymm3
+vpsrld $1, %ymm4, %ymm4
+vpand mask32_to_16(%rip), %ymm3, %ymm3
+vpand mask32_to_16(%rip), %ymm4, %ymm4
+vpackusdw %ymm4, %ymm3, %ymm4
+vmovdqa 992(%rsp), %ymm3
+vpaddw 1248(%rsp), %ymm3, %ymm7
+vpsubw 1248(%rsp), %ymm3, %ymm3
+vpsrlw $2, %ymm3, %ymm3
+vpsubw %ymm6, %ymm3, %ymm3
+vpmullw %ymm14, %ymm3, %ymm3
+vpsllw $1, %ymm5, %ymm9
+vpsubw %ymm9, %ymm7, %ymm9
+vpsllw $7, %ymm11, %ymm7
+vpsubw %ymm7, %ymm9, %ymm7
+vpsrlw $3, %ymm7, %ymm7
+vpsubw %ymm4, %ymm7, %ymm7
+vmovdqa 1504(%rsp), %ymm9
+vpsubw %ymm5, %ymm9, %ymm9
+vpmullw %ymm15, %ymm11, %ymm8
+vpsubw %ymm8, %ymm9, %ymm8
+vpmullw %ymm14, %ymm7, %ymm7
+vpsubw %ymm7, %ymm4, %ymm4
+vpmullw %ymm12, %ymm7, %ymm9
+vpaddw %ymm9, %ymm4, %ymm9
+vpmullw %ymm12, %ymm9, %ymm9
+vpsubw %ymm9, %ymm8, %ymm9
+vpmullw %ymm14, %ymm9, %ymm9
+vpsubw %ymm6, %ymm9, %ymm9
+vpsrlw $3, %ymm9, %ymm9
+vpsubw %ymm3, %ymm9, %ymm9
+vpsubw %ymm9, %ymm3, %ymm3
+vpsubw %ymm3, %ymm6, %ymm6
+vpmullw %ymm13, %ymm9, %ymm9
+vpsubw %ymm9, %ymm6, %ymm6
+vextracti128 $1, %ymm4, %xmm8
+vpshufb shufmin1_mask3(%rip), %ymm8, %ymm8
+vmovdqa %ymm8, 2816(%rsp)
+vextracti128 $1, %ymm3, %xmm8
+vpshufb shufmin1_mask3(%rip), %ymm8, %ymm8
+vmovdqa %ymm8, 2848(%rsp)
+vextracti128 $1, %ymm7, %xmm8
+vpshufb shufmin1_mask3(%rip), %ymm8, %ymm8
+vmovdqa %ymm8, 2880(%rsp)
+vmovdqu 680(%rdi), %ymm8
+vmovdqu 1032(%rdi), %ymm10
+vmovdqu 1384(%rdi), %ymm2
+vpaddw %ymm5, %ymm8, %ymm5
+vpaddw %ymm6, %ymm10, %ymm6
+vpaddw %ymm4, %ymm2, %ymm4
+vpshufb shuf48_16(%rip), %ymm3, %ymm3
+vpand mask3_5_4_3_1(%rip), %ymm3, %ymm2
+vpand mask5_3_5_3(%rip), %ymm3, %ymm3
+vpermq $139, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm10
+vpor %ymm10, %ymm3, %ymm3
+vmovdqu 328(%rdi), %ymm10
+vpaddw 2016(%rsp), %ymm10, %ymm10
+vpaddw %ymm3, %ymm10, %ymm10
+vpand mask_mod8192(%rip), %ymm10, %ymm10
+vmovdqu %xmm10, 328(%rdi)
+vextracti128 $1, %ymm10, %xmm10
+vmovq %xmm10, 344(%rdi)
+vpshufb shufmin1_mask3(%rip), %ymm10, %ymm10
+vmovdqa %xmm10, 1792(%rsp)
+vmovdqa %xmm2, 2016(%rsp)
+vpshufb shuf48_16(%rip), %ymm7, %ymm7
+vpand mask3_5_4_3_1(%rip), %ymm7, %ymm2
+vpand mask5_3_5_3(%rip), %ymm7, %ymm7
+vpermq $139, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm10
+vpor %ymm10, %ymm7, %ymm7
+vpaddw 2272(%rsp), %ymm5, %ymm5
+vpaddw %ymm7, %ymm5, %ymm5
+vmovdqa %xmm2, 2272(%rsp)
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_4_3_1(%rip), %ymm9, %ymm2
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $139, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm10
+vpor %ymm10, %ymm9, %ymm9
+vpaddw 2528(%rsp), %ymm6, %ymm6
+vpaddw %ymm9, %ymm6, %ymm6
+vmovdqa %xmm2, 2528(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_4_3_1(%rip), %ymm11, %ymm2
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $139, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm10
+vpor %ymm10, %ymm11, %ymm11
+vpaddw 2784(%rsp), %ymm4, %ymm4
+vpaddw %ymm11, %ymm4, %ymm4
+vmovdqa %xmm2, 2784(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %xmm5, 680(%rdi)
+vextracti128 $1, %ymm5, %xmm5
+vmovq %xmm5, 696(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %xmm6, 1032(%rdi)
+vextracti128 $1, %ymm6, %xmm6
+vmovq %xmm6, 1048(%rdi)
+vpand mask_mod8192(%rip), %ymm4, %ymm4
+vmovdqu %xmm4, 1384(%rdi)
+vextracti128 $1, %ymm4, %xmm4
+vpextrw $0, %xmm4, 1400(%rdi)
+vmovdqu 0(%rdi), %ymm11
+vpaddw 1888(%rsp), %ymm11, %ymm11
+vpaddw 2816(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 0(%rdi)
+vmovdqu 352(%rdi), %ymm11
+vpaddw 2528(%rsp), %ymm11, %ymm11
+vpaddw 2848(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 352(%rdi)
+vmovdqu 704(%rdi), %ymm11
+vpaddw 2784(%rsp), %ymm11, %ymm11
+vpaddw 2880(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 704(%rdi)
+vmovdqu 88(%rdi), %ymm11
+vpaddw 2048(%rsp), %ymm11, %ymm11
+vpaddw 1920(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 88(%rdi)
+vmovdqu 440(%rdi), %ymm11
+vpaddw 2304(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 440(%rdi)
+vmovdqu 792(%rdi), %ymm11
+vpaddw 2560(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 792(%rdi)
+vmovdqu 176(%rdi), %ymm11
+vpaddw 2080(%rsp), %ymm11, %ymm11
+vpaddw 1952(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 176(%rdi)
+vmovdqu 528(%rdi), %ymm11
+vpaddw 2336(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 528(%rdi)
+vmovdqu 880(%rdi), %ymm11
+vpaddw 2592(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 880(%rdi)
+vmovdqu 264(%rdi), %ymm11
+vpaddw 2112(%rsp), %ymm11, %ymm11
+vpaddw 1984(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 264(%rdi)
+vmovdqu 616(%rdi), %ymm11
+vpaddw 2368(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 616(%rdi)
+vmovdqu 968(%rdi), %ymm11
+vpaddw 2624(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 968(%rdi)
+vmovdqu 352(%rdi), %ymm11
+vpaddw 2144(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 352(%rdi)
+vmovdqu 704(%rdi), %ymm11
+vpaddw 2400(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 704(%rdi)
+vmovdqu 1056(%rdi), %ymm11
+vpaddw 2656(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 1056(%rdi)
+vmovdqu 440(%rdi), %ymm11
+vpaddw 2176(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 440(%rdi)
+vmovdqu 792(%rdi), %ymm11
+vpaddw 2432(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 792(%rdi)
+vmovdqu 1144(%rdi), %ymm11
+vpaddw 2688(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 1144(%rdi)
+vmovdqu 528(%rdi), %ymm11
+vpaddw 2208(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 528(%rdi)
+vmovdqu 880(%rdi), %ymm11
+vpaddw 2464(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 880(%rdi)
+vmovdqu 1232(%rdi), %ymm11
+vpaddw 2720(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 1232(%rdi)
+vmovdqu 616(%rdi), %ymm11
+vpaddw 2240(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 616(%rdi)
+vmovdqu 968(%rdi), %ymm11
+vpaddw 2496(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 968(%rdi)
+vmovdqu 1320(%rdi), %ymm11
+vpaddw 2752(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 1320(%rdi)
+mov %r8, %rsp
+pop %r12
+pop %rbp
+ret
+.cfi_endproc
+
+#endif
diff --git a/src/crypto/hrss/hrss.c b/src/crypto/hrss/hrss.c
new file mode 100644
index 00000000..dd3f979c
--- /dev/null
+++ b/src/crypto/hrss/hrss.c
@@ -0,0 +1,2237 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/hrss.h>
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <openssl/bn.h>
+#include <openssl/cpu.h>
+#include <openssl/hmac.h>
+#include <openssl/mem.h>
+#include <openssl/sha.h>
+
+#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
+#include <emmintrin.h>
+#endif
+
+#if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \
+ (defined(__ARM_NEON__) || defined(__ARM_NEON))
+#include <arm_neon.h>
+#endif
+
+#if defined(_MSC_VER)
+#define RESTRICT
+#else
+#define RESTRICT restrict
+#endif
+
+#include "../internal.h"
+#include "internal.h"
+
+// This is an implementation of [HRSS], but with a KEM transformation based on
+// [SXY]. The primary references are:
+
+// HRSS: https://eprint.iacr.org/2017/667.pdf
+// HRSSNIST:
+// https://csrc.nist.gov/CSRC/media/Projects/Post-Quantum-Cryptography/documents/round-1/submissions/NTRU_HRSS_KEM.zip
+// SXY: https://eprint.iacr.org/2017/1005.pdf
+// NTRUTN14:
+// https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf
+
+
+// Vector operations.
+//
+// A couple of functions in this file can use vector operations to meaningful
+// effect. If we're building for a target that has a supported vector unit,
+// |HRSS_HAVE_VECTOR_UNIT| will be defined and |vec_t| will be typedefed to a
+// 128-bit vector. The following functions abstract over the differences between
+// NEON and SSE2 for implementing some vector operations.
+
+// TODO: MSVC can likely also be made to work with vector operations.
+#if (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) && \
+ (defined(__clang__) || !defined(_MSC_VER))
+
+#define HRSS_HAVE_VECTOR_UNIT
+typedef __m128i vec_t;
+
+// vec_capable returns one iff the current platform supports SSE2.
+static int vec_capable(void) {
+#if defined(__SSE2__)
+ return 1;
+#else
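+  // OPENSSL_ia32cap_P[0] holds the EDX feature bits from CPUID leaf 1; bit 26
+  // is the SSE2 flag.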
+ int has_sse2 = (OPENSSL_ia32cap_P[0] & (1 << 26)) != 0;
+ return has_sse2;
+#endif
+}
+
+// vec_add performs a pair-wise addition of eight uint16s from |a| and |b|.
+static inline vec_t vec_add(vec_t a, vec_t b) { return _mm_add_epi16(a, b); }
+
+// vec_sub performs a pair-wise subtraction of eight uint16s from |a| and |b|.
+static inline vec_t vec_sub(vec_t a, vec_t b) { return _mm_sub_epi16(a, b); }
+
+// vec_mul multiplies each uint16_t in |a| by |b| and returns the resulting
+// vector.
+static inline vec_t vec_mul(vec_t a, uint16_t b) {
+ return _mm_mullo_epi16(a, _mm_set1_epi16(b));
+}
+
+// vec_fma multiplies each uint16_t in |b| by |c|, adds the result to |a|, and
+// returns the resulting vector.
+static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) {
+ return _mm_add_epi16(a, _mm_mullo_epi16(b, _mm_set1_epi16(c)));
+}
+
+// vec3_rshift_word right-shifts the 24 uint16_t's in |v| by one uint16.
+static inline void vec3_rshift_word(vec_t v[3]) {
+ // Intel's left and right shifting is backwards compared to the order in
+ // memory because they're based on little-endian order of words (and not just
+ // bytes). So the shifts in this function will be backwards from what one
+ // might expect.
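+  // (Concretely, _mm_slli_si128 moves bytes towards higher addresses in the
+  // little-endian in-memory representation, i.e. towards higher uint16_t
+  // indices, which is a right-shift of the array.)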
+ const __m128i carry0 = _mm_srli_si128(v[0], 14);
+ v[0] = _mm_slli_si128(v[0], 2);
+
+ const __m128i carry1 = _mm_srli_si128(v[1], 14);
+ v[1] = _mm_slli_si128(v[1], 2);
+ v[1] |= carry0;
+
+ v[2] = _mm_slli_si128(v[2], 2);
+ v[2] |= carry1;
+}
+
+// vec4_rshift_word right-shifts the 32 uint16_t's in |v| by one uint16.
+static inline void vec4_rshift_word(vec_t v[4]) {
+ // Intel's left and right shifting is backwards compared to the order in
+ // memory because they're based on little-endian order of words (and not just
+ // bytes). So the shifts in this function will be backwards from what one
+ // might expect.
+ const __m128i carry0 = _mm_srli_si128(v[0], 14);
+ v[0] = _mm_slli_si128(v[0], 2);
+
+ const __m128i carry1 = _mm_srli_si128(v[1], 14);
+ v[1] = _mm_slli_si128(v[1], 2);
+ v[1] |= carry0;
+
+ const __m128i carry2 = _mm_srli_si128(v[2], 14);
+ v[2] = _mm_slli_si128(v[2], 2);
+ v[2] |= carry1;
+
+ v[3] = _mm_slli_si128(v[3], 2);
+ v[3] |= carry2;
+}
+
+// vec_merge_3_5 takes the final three uint16_t's from |left|, appends the first
+// five from |right|, and returns the resulting vector.
+static inline vec_t vec_merge_3_5(vec_t left, vec_t right) {
+ return _mm_srli_si128(left, 10) | _mm_slli_si128(right, 6);
+}
+
+// poly3_vec_lshift1 left-shifts the 768 bits in |a_s|, and in |a_a|, by one
+// bit.
+static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) {
+ vec_t carry_s = {0};
+ vec_t carry_a = {0};
+
+ for (int i = 0; i < 6; i++) {
+ vec_t next_carry_s = _mm_srli_epi64(a_s[i], 63);
+ a_s[i] = _mm_slli_epi64(a_s[i], 1);
+ a_s[i] |= _mm_slli_si128(next_carry_s, 8);
+ a_s[i] |= carry_s;
+ carry_s = _mm_srli_si128(next_carry_s, 8);
+
+ vec_t next_carry_a = _mm_srli_epi64(a_a[i], 63);
+ a_a[i] = _mm_slli_epi64(a_a[i], 1);
+ a_a[i] |= _mm_slli_si128(next_carry_a, 8);
+ a_a[i] |= carry_a;
+ carry_a = _mm_srli_si128(next_carry_a, 8);
+ }
+}
+
+// poly3_vec_rshift1 right-shifts the 768 bits in |a_s|, and in |a_a|, by one
+// bit.
+static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) {
+ vec_t carry_s = {0};
+ vec_t carry_a = {0};
+
+ for (int i = 5; i >= 0; i--) {
+ const vec_t next_carry_s = _mm_slli_epi64(a_s[i], 63);
+ a_s[i] = _mm_srli_epi64(a_s[i], 1);
+ a_s[i] |= _mm_srli_si128(next_carry_s, 8);
+ a_s[i] |= carry_s;
+ carry_s = _mm_slli_si128(next_carry_s, 8);
+
+ const vec_t next_carry_a = _mm_slli_epi64(a_a[i], 63);
+ a_a[i] = _mm_srli_epi64(a_a[i], 1);
+ a_a[i] |= _mm_srli_si128(next_carry_a, 8);
+ a_a[i] |= carry_a;
+ carry_a = _mm_slli_si128(next_carry_a, 8);
+ }
+}
+
+// vec_broadcast_bit duplicates the least-significant bit in |a| to all bits in
+// a vector and returns the result.
+static inline vec_t vec_broadcast_bit(vec_t a) {
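+  // The least-significant bit is shifted up to bit 63 of the low 64-bit lane
+  // (bit 31 of 32-bit element one), the arithmetic shift smears it across
+  // that element, and the shuffle broadcasts element one to all four 32-bit
+  // elements.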
+ return _mm_shuffle_epi32(_mm_srai_epi32(_mm_slli_epi64(a, 63), 31),
+ 0b01010101);
+}
+
+// vec_broadcast_bit15 duplicates the most-significant bit of the first word in
+// |a| to all bits in a vector and returns the result.
+static inline vec_t vec_broadcast_bit15(vec_t a) {
+ return _mm_shuffle_epi32(_mm_srai_epi32(_mm_slli_epi64(a, 63 - 15), 31),
+ 0b01010101);
+}
+
+// vec_get_word returns the |i|th uint16_t in |v|. (This is a macro because the
+// compiler requires that |i| be a compile-time constant.)
+#define vec_get_word(v, i) _mm_extract_epi16(v, i)
+
+#elif (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \
+ (defined(__ARM_NEON__) || defined(__ARM_NEON))
+
+#define HRSS_HAVE_VECTOR_UNIT
+typedef uint16x8_t vec_t;
+
+// These functions perform the same actions as the SSE2 function of the same
+// name, above.
+
+static int vec_capable(void) { return CRYPTO_is_NEON_capable(); }
+
+static inline vec_t vec_add(vec_t a, vec_t b) { return a + b; }
+
+static inline vec_t vec_sub(vec_t a, vec_t b) { return a - b; }
+
+static inline vec_t vec_mul(vec_t a, uint16_t b) { return vmulq_n_u16(a, b); }
+
+static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) {
+ return vmlaq_n_u16(a, b, c);
+}
+
+static inline void vec3_rshift_word(vec_t v[3]) {
+ const uint16x8_t kZero = {0};
+ v[2] = vextq_u16(v[1], v[2], 7);
+ v[1] = vextq_u16(v[0], v[1], 7);
+ v[0] = vextq_u16(kZero, v[0], 7);
+}
+
+static inline void vec4_rshift_word(vec_t v[4]) {
+ const uint16x8_t kZero = {0};
+ v[3] = vextq_u16(v[2], v[3], 7);
+ v[2] = vextq_u16(v[1], v[2], 7);
+ v[1] = vextq_u16(v[0], v[1], 7);
+ v[0] = vextq_u16(kZero, v[0], 7);
+}
+
+static inline vec_t vec_merge_3_5(vec_t left, vec_t right) {
+ return vextq_u16(left, right, 5);
+}
+
+static inline uint16_t vec_get_word(vec_t v, unsigned i) {
+ return v[i];
+}
+
+#if !defined(OPENSSL_AARCH64)
+
+static inline vec_t vec_broadcast_bit(vec_t a) {
+ a = (vec_t)vshrq_n_s16(((int16x8_t)a) << 15, 15);
+ return vdupq_lane_u16(vget_low_u16(a), 0);
+}
+
+static inline vec_t vec_broadcast_bit15(vec_t a) {
+ a = (vec_t)vshrq_n_s16((int16x8_t)a, 15);
+ return vdupq_lane_u16(vget_low_u16(a), 0);
+}
+
+static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) {
+ vec_t carry_s = {0};
+ vec_t carry_a = {0};
+ const vec_t kZero = {0};
+
+ for (int i = 0; i < 6; i++) {
+ vec_t next_carry_s = a_s[i] >> 15;
+ a_s[i] <<= 1;
+ a_s[i] |= vextq_u16(kZero, next_carry_s, 7);
+ a_s[i] |= carry_s;
+ carry_s = vextq_u16(next_carry_s, kZero, 7);
+
+ vec_t next_carry_a = a_a[i] >> 15;
+ a_a[i] <<= 1;
+ a_a[i] |= vextq_u16(kZero, next_carry_a, 7);
+ a_a[i] |= carry_a;
+ carry_a = vextq_u16(next_carry_a, kZero, 7);
+ }
+}
+
+static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) {
+ vec_t carry_s = {0};
+ vec_t carry_a = {0};
+ const vec_t kZero = {0};
+
+ for (int i = 5; i >= 0; i--) {
+ vec_t next_carry_s = a_s[i] << 15;
+ a_s[i] >>= 1;
+ a_s[i] |= vextq_u16(next_carry_s, kZero, 1);
+ a_s[i] |= carry_s;
+ carry_s = vextq_u16(kZero, next_carry_s, 1);
+
+ vec_t next_carry_a = a_a[i] << 15;
+ a_a[i] >>= 1;
+ a_a[i] |= vextq_u16(next_carry_a, kZero, 1);
+ a_a[i] |= carry_a;
+ carry_a = vextq_u16(kZero, next_carry_a, 1);
+ }
+}
+
+#endif // !OPENSSL_AARCH64
+
+#endif // (ARM || AARCH64) && NEON
+
+// Polynomials in this scheme have N terms.
+// #define N 701
+
+// Underlying data types and arithmetic operations.
+// ------------------------------------------------
+
+// Binary polynomials.
+
+// poly2 represents a degree-N polynomial over GF(2). The words are in little-
+// endian order, i.e. the coefficient of x^0 is the LSB of the first word. The
+// final word is only partially used since N is not a multiple of the word size.
+
+// Defined in internal.h:
+// struct poly2 {
+// crypto_word_t v[WORDS_PER_POLY];
+// };
+
+OPENSSL_UNUSED static void hexdump(const void *void_in, size_t len) {
+ const uint8_t *in = (const uint8_t *)void_in;
+ for (size_t i = 0; i < len; i++) {
+ printf("%02x", in[i]);
+ }
+ printf("\n");
+}
+
+static void poly2_zero(struct poly2 *p) {
+ OPENSSL_memset(&p->v[0], 0, sizeof(crypto_word_t) * WORDS_PER_POLY);
+}
+
+// poly2_cmov sets |out| to |in| iff |mov| is all ones.
+static void poly2_cmov(struct poly2 *out, const struct poly2 *in,
+ crypto_word_t mov) {
+ for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+ out->v[i] = (out->v[i] & ~mov) | (in->v[i] & mov);
+ }
+}
+
+// poly2_rotr_words performs a right-rotate on |in|, writing the result to
+// |out|. The shift count, |bits|, must be a non-zero multiple of the word size.
+static void poly2_rotr_words(struct poly2 *out, const struct poly2 *in,
+ size_t bits) {
+ assert(bits >= BITS_PER_WORD && bits % BITS_PER_WORD == 0);
+ assert(out != in);
+
+ const size_t start = bits / BITS_PER_WORD;
+ const size_t n = (N - bits) / BITS_PER_WORD;
+
+ // The rotate is by a whole number of words so the first few words are easy:
+ // just move them down.
+ for (size_t i = 0; i < n; i++) {
+ out->v[i] = in->v[start + i];
+ }
+
+ // Since the last word is only partially filled, however, the remainder needs
+ // shifting and merging of words to take care of that.
+ crypto_word_t carry = in->v[WORDS_PER_POLY - 1];
+
+ for (size_t i = 0; i < start; i++) {
+ out->v[n + i] = carry | in->v[i] << BITS_IN_LAST_WORD;
+ carry = in->v[i] >> (BITS_PER_WORD - BITS_IN_LAST_WORD);
+ }
+
+ out->v[WORDS_PER_POLY - 1] = carry;
+}
+
+// poly2_rotr_bits performs a right-rotate on |in|, writing the result to |out|.
+// The shift count, |bits|, must be a power of two that is less than
+// |BITS_PER_WORD|.
+static void poly2_rotr_bits(struct poly2 *out, const struct poly2 *in,
+ size_t bits) {
+ assert(bits <= BITS_PER_WORD / 2);
+ assert(bits != 0);
+ assert((bits & (bits - 1)) == 0);
+ assert(out != in);
+
+ // BITS_PER_WORD/2 is the greatest legal value of |bits|. If
+ // |BITS_IN_LAST_WORD| is smaller than this then the code below doesn't work
+  // because bits from more than just the last word would need to be carried
+  // down into the previous one, and so on.
+ OPENSSL_STATIC_ASSERT(
+ BITS_IN_LAST_WORD >= BITS_PER_WORD / 2,
+ "there are more carry bits than fit in BITS_IN_LAST_WORD");
+
+ crypto_word_t carry = in->v[WORDS_PER_POLY - 1] << (BITS_PER_WORD - bits);
+
+ for (size_t i = WORDS_PER_POLY - 2; i < WORDS_PER_POLY; i--) {
+ out->v[i] = carry | in->v[i] >> bits;
+ carry = in->v[i] << (BITS_PER_WORD - bits);
+ }
+
+ crypto_word_t last_word = carry >> (BITS_PER_WORD - BITS_IN_LAST_WORD) |
+ in->v[WORDS_PER_POLY - 1] >> bits;
+ last_word &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1;
+ out->v[WORDS_PER_POLY - 1] = last_word;
+}
+
+// HRSS_poly2_rotr_consttime right-rotates |p| by |bits| in constant-time.
+void HRSS_poly2_rotr_consttime(struct poly2 *p, size_t bits) {
+ assert(bits <= N);
+ assert(p->v[WORDS_PER_POLY-1] >> BITS_IN_LAST_WORD == 0);
+
+ // Constant-time rotation is implemented by calculating the rotations of
+ // powers-of-two bits and throwing away the unneeded values. 2^9 (i.e. 512) is
+ // the largest power-of-two shift that we need to consider because 2^10 > N.
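+  // For example, a rotation by 137 = 128 + 8 + 1 keeps the results of the
+  // 128-, 8- and 1-bit rotations and discards the rest, selecting each with a
+  // mask derived from the corresponding bit of |bits|.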
+#define HRSS_POLY2_MAX_SHIFT 9
+ size_t shift = HRSS_POLY2_MAX_SHIFT;
+ OPENSSL_STATIC_ASSERT((1 << (HRSS_POLY2_MAX_SHIFT + 1)) > N,
+ "maximum shift is too small");
+ OPENSSL_STATIC_ASSERT((1 << HRSS_POLY2_MAX_SHIFT) <= N,
+ "maximum shift is too large");
+ struct poly2 shifted;
+
+ for (; (UINT64_C(1) << shift) >= BITS_PER_WORD; shift--) {
+ poly2_rotr_words(&shifted, p, UINT64_C(1) << shift);
+ poly2_cmov(p, &shifted, ~((1 & (bits >> shift)) - 1));
+ }
+
+ for (; shift < HRSS_POLY2_MAX_SHIFT; shift--) {
+ poly2_rotr_bits(&shifted, p, UINT64_C(1) << shift);
+ poly2_cmov(p, &shifted, ~((1 & (bits >> shift)) - 1));
+ }
+#undef HRSS_POLY2_MAX_SHIFT
+}
+
+// poly2_cswap exchanges the values of |a| and |b| if |swap| is all ones.
+static void poly2_cswap(struct poly2 *a, struct poly2 *b, crypto_word_t swap) {
+ for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+ const crypto_word_t sum = swap & (a->v[i] ^ b->v[i]);
+ a->v[i] ^= sum;
+ b->v[i] ^= sum;
+ }
+}
+
+// poly2_fmadd sets |out| to |out| + |in| * m, where m is either
+// |CONSTTIME_TRUE_W| or |CONSTTIME_FALSE_W|.
+static void poly2_fmadd(struct poly2 *out, const struct poly2 *in,
+ crypto_word_t m) {
+ for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+ out->v[i] ^= in->v[i] & m;
+ }
+}
+
+// poly2_lshift1 left-shifts |p| by one bit.
+static void poly2_lshift1(struct poly2 *p) {
+ crypto_word_t carry = 0;
+ for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+ const crypto_word_t next_carry = p->v[i] >> (BITS_PER_WORD - 1);
+ p->v[i] <<= 1;
+ p->v[i] |= carry;
+ carry = next_carry;
+ }
+}
+
+// poly2_rshift1 right-shifts |p| by one bit.
+static void poly2_rshift1(struct poly2 *p) {
+ crypto_word_t carry = 0;
+ for (size_t i = WORDS_PER_POLY - 1; i < WORDS_PER_POLY; i--) {
+ const crypto_word_t next_carry = p->v[i] & 1;
+ p->v[i] >>= 1;
+ p->v[i] |= carry << (BITS_PER_WORD - 1);
+ carry = next_carry;
+ }
+}
+
+// poly2_clear_top_bits clears the bits in the final word that are only for
+// alignment.
+static void poly2_clear_top_bits(struct poly2 *p) {
+ p->v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1;
+}
+
+// poly2_top_bits_are_clear returns one iff the extra bits in the final word of
+// |p| are zero.
+static int poly2_top_bits_are_clear(const struct poly2 *p) {
+ return (p->v[WORDS_PER_POLY - 1] &
+ ~((UINT64_C(1) << BITS_IN_LAST_WORD) - 1)) == 0;
+}
+
+// Ternary polynomials.
+
+// poly3 represents a degree-N polynomial over GF(3). Each coefficient is
+// bitsliced across the |s| and |a| arrays, like this:
+//
+// s | a | value
+// -----------------
+// 0 | 0 | 0
+// 0 | 1 | 1
+// 1 | 0 | 2 (aka -1)
+// 1 | 1 | <invalid>
+//
+// ('s' is for sign, and 'a' just a letter.)
+//
+// Once bitsliced as such, the following circuits can be used to implement
+// addition and multiplication mod 3:
+//
+// (s3, a3) = (s1, a1) × (s2, a2)
+// s3 = (a1 ∧ s2) ⊕ (s1 ∧ a2)
+// a3 = (s1 ∧ s2) ⊕ (a1 ∧ a2)
+//
+// (s3, a3) = (s1, a1) + (s2, a2)
+// x = (a1 ⊕ a2)
+// y = (s1 ⊕ s2) ⊕ (a1 ∧ a2)
+// z = (s1 ∧ s2)
+// s3 = y ∧ ¬x
+// a3 = z ∨ (x ∧ ¬y)
+//
+// Negating a value just involves swapping s and a.
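+//
+// As a quick check of the addition circuit: adding 1 = (s=0, a=1) and
+// -1 = (s=1, a=0) gives x = 1, y = (0 ⊕ 1) ⊕ (1 ∧ 0) = 1 and z = 0, so
+// s3 = y ∧ ¬x = 0 and a3 = z ∨ (x ∧ ¬y) = 0, i.e. the result is zero, as
+// expected.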
+// struct poly3 {
+// struct poly2 s, a;
+// };
+
+OPENSSL_UNUSED static void poly3_print(const struct poly3 *in) {
+ struct poly3 p;
+ OPENSSL_memcpy(&p, in, sizeof(p));
+ p.s.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1;
+ p.a.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1;
+
+ printf("{[");
+ for (unsigned i = 0; i < WORDS_PER_POLY; i++) {
+ if (i) {
+ printf(" ");
+ }
+ printf(BN_HEX_FMT2, p.s.v[i]);
+ }
+ printf("] [");
+ for (unsigned i = 0; i < WORDS_PER_POLY; i++) {
+ if (i) {
+ printf(" ");
+ }
+ printf(BN_HEX_FMT2, p.a.v[i]);
+ }
+ printf("]}\n");
+}
+
+static void poly3_zero(struct poly3 *p) {
+ poly2_zero(&p->s);
+ poly2_zero(&p->a);
+}
+
+// lsb_to_all replicates the least-significant bit of |v| to all bits of the
+// word. This is used in bit-slicing operations to make a vector from a fixed
+// value.
+static crypto_word_t lsb_to_all(crypto_word_t v) { return 0u - (v & 1); }
+
+// poly3_mul_const sets |p| to |p|×m, where m = (ms, ma).
+static void poly3_mul_const(struct poly3 *p, crypto_word_t ms,
+ crypto_word_t ma) {
+ ms = lsb_to_all(ms);
+ ma = lsb_to_all(ma);
+
+ for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+ const crypto_word_t s = p->s.v[i];
+ const crypto_word_t a = p->a.v[i];
+ p->s.v[i] = (s & ma) ^ (ms & a);
+ p->a.v[i] = (ms & s) ^ (ma & a);
+ }
+}
+
+// poly3_rotr_consttime right-rotates |p| by |bits| in constant-time.
+static void poly3_rotr_consttime(struct poly3 *p, size_t bits) {
+ assert(bits <= N);
+ HRSS_poly2_rotr_consttime(&p->s, bits);
+ HRSS_poly2_rotr_consttime(&p->a, bits);
+}
+
+// poly3_fmadd sets |out| to |out| + |in|×m, where m is (ms, ma).
+static void poly3_fmadd(struct poly3 *RESTRICT out,
+ const struct poly3 *RESTRICT in, crypto_word_t ms,
+ crypto_word_t ma) {
+ // (See the multiplication and addition circuits given above.)
+ for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+ const crypto_word_t s = in->s.v[i];
+ const crypto_word_t a = in->a.v[i];
+ const crypto_word_t product_s = (s & ma) ^ (ms & a);
+ const crypto_word_t product_a = (ms & s) ^ (ma & a);
+
+ const crypto_word_t x = out->a.v[i] ^ product_a;
+ const crypto_word_t y =
+ (out->s.v[i] ^ product_s) ^ (out->a.v[i] & product_a);
+ const crypto_word_t z = (out->s.v[i] & product_s);
+ out->s.v[i] = y & ~x;
+ out->a.v[i] = z | (x & ~y);
+ }
+}
+
+// final_bit_to_all replicates the bit in the final position of the last word to
+// all the bits in the word.
+static crypto_word_t final_bit_to_all(crypto_word_t v) {
+ return lsb_to_all(v >> (BITS_IN_LAST_WORD - 1));
+}
+
+// poly3_top_bits_are_clear returns one iff the extra bits in the final words of
+// |p| are zero.
+OPENSSL_UNUSED static int poly3_top_bits_are_clear(const struct poly3 *p) {
+ return poly2_top_bits_are_clear(&p->s) && poly2_top_bits_are_clear(&p->a);
+}
+
+// poly3_mod_phiN reduces |p| by Φ(N).
+static void poly3_mod_phiN(struct poly3 *p) {
+  // In order to reduce by Φ(N) we subtract the value of the highest-degree
+  // coefficient from every coefficient, which is the same as adding its
+  // negation. The negation of (s, a) is (a, s), so the arguments are swapped
+  // in the following two lines.
+ const crypto_word_t factor_s = final_bit_to_all(p->a.v[WORDS_PER_POLY - 1]);
+ const crypto_word_t factor_a = final_bit_to_all(p->s.v[WORDS_PER_POLY - 1]);
+
+ for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+ const crypto_word_t s = p->s.v[i];
+ const crypto_word_t a = p->a.v[i];
+ const crypto_word_t x = a ^ factor_a;
+ const crypto_word_t y = (s ^ factor_s) ^ (a & factor_a);
+ const crypto_word_t z = (s & factor_s);
+ p->s.v[i] = y & ~x;
+ p->a.v[i] = z | (x & ~y);
+ }
+
+ poly2_clear_top_bits(&p->s);
+ poly2_clear_top_bits(&p->a);
+}
+
+static void poly3_cswap(struct poly3 *a, struct poly3 *b, crypto_word_t swap) {
+ poly2_cswap(&a->s, &b->s, swap);
+ poly2_cswap(&a->a, &b->a, swap);
+}
+
+static void poly3_lshift1(struct poly3 *p) {
+ poly2_lshift1(&p->s);
+ poly2_lshift1(&p->a);
+}
+
+static void poly3_rshift1(struct poly3 *p) {
+ poly2_rshift1(&p->s);
+ poly2_rshift1(&p->a);
+}
+
+// poly3_span represents a pointer into a poly3.
+struct poly3_span {
+ crypto_word_t *s;
+ crypto_word_t *a;
+};
+
+// poly3_word_add sets (|out_s|, |out_a|) to (|s1|, |a1|) + (|s2|, |a2|).
+static void poly3_word_add(crypto_word_t *out_s, crypto_word_t *out_a,
+ const crypto_word_t s1, const crypto_word_t a1,
+ const crypto_word_t s2, const crypto_word_t a2) {
+ const crypto_word_t x = a1 ^ a2;
+ const crypto_word_t y = (s1 ^ s2) ^ (a1 & a2);
+ const crypto_word_t z = s1 & s2;
+ *out_s = y & ~x;
+ *out_a = z | (x & ~y);
+}
+
+// poly3_span_add adds |n| words of values from |a| and |b| and writes the
+// result to |out|.
+static void poly3_span_add(const struct poly3_span *out,
+ const struct poly3_span *a,
+ const struct poly3_span *b, size_t n) {
+ for (size_t i = 0; i < n; i++) {
+ poly3_word_add(&out->s[i], &out->a[i], a->s[i], a->a[i], b->s[i], b->a[i]);
+ }
+}
+
+// poly3_span_sub subtracts |n| words of |b| from |n| words of |a|.
+static void poly3_span_sub(const struct poly3_span *a,
+ const struct poly3_span *b, size_t n) {
+ for (size_t i = 0; i < n; i++) {
+ // Swapping |b->s| and |b->a| negates the value being added.
+ poly3_word_add(&a->s[i], &a->a[i], a->s[i], a->a[i], b->a[i], b->s[i]);
+ }
+}
+
+// poly3_mul_aux is a recursive function that multiplies |n| words from |a| and
+// |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements of
+// |scratch| and the function recurses, except if |n| == 1, when |scratch| isn't
+// used and the recursion stops. For |n| in {11, 22}, the transitive total
+// amount of |scratch| needed happens to be 2n+2.
+static void poly3_mul_aux(const struct poly3_span *out,
+ const struct poly3_span *scratch,
+ const struct poly3_span *a,
+ const struct poly3_span *b, size_t n) {
+ if (n == 1) {
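+    // This base case multiplies a single word of bit-sliced GF(3)
+    // coefficients from |a| by a single word from |b| using schoolbook
+    // multiplication: each coefficient of |b| is broadcast to a full word,
+    // multiplied into |a| with the circuits above, shifted into position and
+    // accumulated into a double-width (two-word) result.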
+ crypto_word_t r_s_low = 0, r_s_high = 0, r_a_low = 0, r_a_high = 0;
+ crypto_word_t b_s = b->s[0], b_a = b->a[0];
+ const crypto_word_t a_s = a->s[0], a_a = a->a[0];
+
+ for (size_t i = 0; i < BITS_PER_WORD; i++) {
+ // Multiply (s, a) by the next value from (b_s, b_a).
+ const crypto_word_t v_s = lsb_to_all(b_s);
+ const crypto_word_t v_a = lsb_to_all(b_a);
+ b_s >>= 1;
+ b_a >>= 1;
+
+ const crypto_word_t m_s = (v_s & a_a) ^ (a_s & v_a);
+ const crypto_word_t m_a = (a_s & v_s) ^ (a_a & v_a);
+
+ if (i == 0) {
+        // Special case: otherwise the code below would shift by
+        // BITS_PER_WORD, which is undefined behavior.
+ r_s_low = m_s;
+ r_a_low = m_a;
+ continue;
+ }
+
+ // Shift the multiplication result to the correct position.
+ const crypto_word_t m_s_low = m_s << i;
+ const crypto_word_t m_s_high = m_s >> (BITS_PER_WORD - i);
+ const crypto_word_t m_a_low = m_a << i;
+ const crypto_word_t m_a_high = m_a >> (BITS_PER_WORD - i);
+
+ // Add into the result.
+ poly3_word_add(&r_s_low, &r_a_low, r_s_low, r_a_low, m_s_low, m_a_low);
+ poly3_word_add(&r_s_high, &r_a_high, r_s_high, r_a_high, m_s_high,
+ m_a_high);
+ }
+
+ out->s[0] = r_s_low;
+ out->s[1] = r_s_high;
+ out->a[0] = r_a_low;
+ out->a[1] = r_a_high;
+ return;
+ }
+
+ // Karatsuba multiplication.
+ // https://en.wikipedia.org/wiki/Karatsuba_algorithm
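+  //
+  // Splitting each input at |low_len| words, so that a = a1×B + a0 and
+  // b = b1×B + b0 for the appropriate power B, the product is
+  // a1×b1×B^2 + ((a1+a0)×(b1+b0) - a1×b1 - a0×b0)×B + a0×b0, which needs
+  // only three half-sized multiplications.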
+
+ // When |n| is odd, the two "halves" will have different lengths. The first
+ // is always the smaller.
+ const size_t low_len = n / 2;
+ const size_t high_len = n - low_len;
+ const struct poly3_span a_high = {&a->s[low_len], &a->a[low_len]};
+ const struct poly3_span b_high = {&b->s[low_len], &b->a[low_len]};
+
+ // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second
+ // half.
+ const struct poly3_span a_cross_sum = *out;
+ const struct poly3_span b_cross_sum = {&out->s[high_len], &out->a[high_len]};
+ poly3_span_add(&a_cross_sum, a, &a_high, low_len);
+ poly3_span_add(&b_cross_sum, b, &b_high, low_len);
+ if (high_len != low_len) {
+ a_cross_sum.s[low_len] = a_high.s[low_len];
+ a_cross_sum.a[low_len] = a_high.a[low_len];
+ b_cross_sum.s[low_len] = b_high.s[low_len];
+ b_cross_sum.a[low_len] = b_high.a[low_len];
+ }
+
+ const struct poly3_span child_scratch = {&scratch->s[2 * high_len],
+ &scratch->a[2 * high_len]};
+ const struct poly3_span out_mid = {&out->s[low_len], &out->a[low_len]};
+ const struct poly3_span out_high = {&out->s[2 * low_len],
+ &out->a[2 * low_len]};
+
+ // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer.
+ poly3_mul_aux(scratch, &child_scratch, &a_cross_sum, &b_cross_sum, high_len);
+ // Calculate a_1 × b_1.
+ poly3_mul_aux(&out_high, &child_scratch, &a_high, &b_high, high_len);
+ // Calculate a_0 × b_0.
+ poly3_mul_aux(out, &child_scratch, a, b, low_len);
+
+ // Subtract those last two products from the first.
+ poly3_span_sub(scratch, out, low_len * 2);
+ poly3_span_sub(scratch, &out_high, high_len * 2);
+
+ // Add the middle product into the output.
+ poly3_span_add(&out_mid, &out_mid, scratch, high_len * 2);
+}
+
+// HRSS_poly3_mul sets |*out| to |x|×|y| mod Φ(N).
+void HRSS_poly3_mul(struct poly3 *out, const struct poly3 *x,
+ const struct poly3 *y) {
+ crypto_word_t prod_s[WORDS_PER_POLY * 2];
+ crypto_word_t prod_a[WORDS_PER_POLY * 2];
+ crypto_word_t scratch_s[WORDS_PER_POLY * 2 + 2];
+ crypto_word_t scratch_a[WORDS_PER_POLY * 2 + 2];
+ const struct poly3_span prod_span = {prod_s, prod_a};
+ const struct poly3_span scratch_span = {scratch_s, scratch_a};
+ const struct poly3_span x_span = {(crypto_word_t *)x->s.v,
+ (crypto_word_t *)x->a.v};
+ const struct poly3_span y_span = {(crypto_word_t *)y->s.v,
+ (crypto_word_t *)y->a.v};
+
+ poly3_mul_aux(&prod_span, &scratch_span, &x_span, &y_span, WORDS_PER_POLY);
+
+  // |prod| needs to be reduced mod (𝑥^N - 1), which just involves adding the
+  // upper half to the lower half. However, N is 701, which isn't a multiple of
+  // BITS_PER_WORD, so the upper-half words all have to be shifted before
+  // being added to the lower half.
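+  // (Coefficient 𝑥^(N+i) wraps around to 𝑥^i, and since the lower half only
+  // uses BITS_IN_LAST_WORD bits of word WORDS_PER_POLY-1, the upper half
+  // begins at bit BITS_IN_LAST_WORD of that word, hence the shifts below.)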
+ for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+ crypto_word_t v_s = prod_s[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD;
+ v_s |= prod_s[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD);
+ crypto_word_t v_a = prod_a[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD;
+ v_a |= prod_a[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD);
+
+ poly3_word_add(&out->s.v[i], &out->a.v[i], prod_s[i], prod_a[i], v_s, v_a);
+ }
+
+ poly3_mod_phiN(out);
+}
+
+#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64)
+
+// poly3_vec_cswap swaps (|a_s|, |a_a|) and (|b_s|, |b_a|) if |swap| is
+// |0xff..ff|. Otherwise, |swap| must be zero.
+static inline void poly3_vec_cswap(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6],
+ vec_t b_a[6], const vec_t swap) {
+ for (int i = 0; i < 6; i++) {
+ const vec_t sum_s = swap & (a_s[i] ^ b_s[i]);
+ a_s[i] ^= sum_s;
+ b_s[i] ^= sum_s;
+
+ const vec_t sum_a = swap & (a_a[i] ^ b_a[i]);
+ a_a[i] ^= sum_a;
+ b_a[i] ^= sum_a;
+ }
+}
+
+// poly3_vec_fmadd adds (|ms|, |ma|) × (|b_s|, |b_a|) to (|a_s|, |a_a|).
+static inline void poly3_vec_fmadd(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6],
+ vec_t b_a[6], const vec_t ms,
+ const vec_t ma) {
+ for (int i = 0; i < 6; i++) {
+ const vec_t s = b_s[i];
+ const vec_t a = b_a[i];
+ const vec_t product_s = (s & ma) ^ (ms & a);
+ const vec_t product_a = (ms & s) ^ (ma & a);
+
+ const vec_t x = a_a[i] ^ product_a;
+ const vec_t y = (a_s[i] ^ product_s) ^ (a_a[i] & product_a);
+ const vec_t z = (a_s[i] & product_s);
+ a_s[i] = y & ~x;
+ a_a[i] = z | (x & ~y);
+ }
+}
+
+// poly3_invert_vec sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod
+// Φ(N).
+static void poly3_invert_vec(struct poly3 *out, const struct poly3 *in) {
+ // See the comment in |HRSS_poly3_invert| about this algorithm. In addition to
+ // the changes described there, this implementation attempts to use vector
+ // registers to speed up the computation. Even non-poly3 variables are held in
+ // vectors where possible to minimise the amount of data movement between
+ // the vector and general-purpose registers.
+
+ vec_t b_s[6], b_a[6], c_s[6], c_a[6], f_s[6], f_a[6], g_s[6], g_a[6];
+ const vec_t kZero = {0};
+ const vec_t kOne = {1};
+ static const uint8_t kOneBytes[sizeof(vec_t)] = {1};
+ static const uint8_t kBottomSixtyOne[sizeof(vec_t)] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x1f};
+
+ memset(b_s, 0, sizeof(b_s));
+ memcpy(b_a, kOneBytes, sizeof(kOneBytes));
+ memset(&b_a[1], 0, 5 * sizeof(vec_t));
+
+ memset(c_s, 0, sizeof(c_s));
+ memset(c_a, 0, sizeof(c_a));
+
+ f_s[5] = kZero;
+ memcpy(f_s, in->s.v, WORDS_PER_POLY * sizeof(crypto_word_t));
+ f_a[5] = kZero;
+ memcpy(f_a, in->a.v, WORDS_PER_POLY * sizeof(crypto_word_t));
+
+ // Set g to all ones.
+ memset(g_s, 0, sizeof(g_s));
+ memset(g_a, 0xff, 5 * sizeof(vec_t));
+ memcpy(&g_a[5], kBottomSixtyOne, sizeof(kBottomSixtyOne));
+
+ vec_t deg_f = {N - 1}, deg_g = {N - 1}, rotation = kZero;
+ vec_t k = kOne;
+ vec_t f0s = {0}, f0a = {0};
+ vec_t still_going;
+ memset(&still_going, 0xff, sizeof(still_going));
+
+ for (unsigned i = 0; i < 2 * (N - 1) - 1; i++) {
+ const vec_t s_a = vec_broadcast_bit(
+ still_going & ((f_a[0] & g_s[0]) ^ (f_s[0] & g_a[0])));
+ const vec_t s_s = vec_broadcast_bit(
+ still_going & ((f_a[0] & g_a[0]) ^ (f_s[0] & g_s[0])));
+ const vec_t should_swap =
+ (s_s | s_a) & vec_broadcast_bit15(deg_f - deg_g);
+
+ poly3_vec_cswap(f_s, f_a, g_s, g_a, should_swap);
+ poly3_vec_fmadd(f_s, f_a, g_s, g_a, s_s, s_a);
+ poly3_vec_rshift1(f_s, f_a);
+
+ poly3_vec_cswap(b_s, b_a, c_s, c_a, should_swap);
+ poly3_vec_fmadd(b_s, b_a, c_s, c_a, s_s, s_a);
+ poly3_vec_lshift1(c_s, c_a);
+
+ const vec_t deg_sum = should_swap & (deg_f ^ deg_g);
+ deg_f ^= deg_sum;
+ deg_g ^= deg_sum;
+
+ deg_f -= kOne;
+ still_going &= ~vec_broadcast_bit15(deg_f - kOne);
+
+ const vec_t f0_is_nonzero = vec_broadcast_bit(f_s[0] | f_a[0]);
+ // |f0_is_nonzero| implies |still_going|.
+ rotation ^= f0_is_nonzero & (k ^ rotation);
+ k += kOne;
+
+ const vec_t f0s_sum = f0_is_nonzero & (f_s[0] ^ f0s);
+ f0s ^= f0s_sum;
+ const vec_t f0a_sum = f0_is_nonzero & (f_a[0] ^ f0a);
+ f0a ^= f0a_sum;
+ }
+
+ crypto_word_t rotation_word = vec_get_word(rotation, 0);
+ rotation_word -= N & constant_time_lt_w(N, rotation_word);
+ memcpy(out->s.v, b_s, WORDS_PER_POLY * sizeof(crypto_word_t));
+ memcpy(out->a.v, b_a, WORDS_PER_POLY * sizeof(crypto_word_t));
+ assert(poly3_top_bits_are_clear(out));
+ poly3_rotr_consttime(out, rotation_word);
+ poly3_mul_const(out, vec_get_word(f0s, 0), vec_get_word(f0a, 0));
+ poly3_mod_phiN(out);
+}
+
+#endif // HRSS_HAVE_VECTOR_UNIT
+
+// HRSS_poly3_invert sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod
+// Φ(N).
+void HRSS_poly3_invert(struct poly3 *out, const struct poly3 *in) {
+ // The vector version of this function seems slightly slower on AArch64, but
+ // is useful on ARMv7 and x86-64.
+#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64)
+ if (vec_capable()) {
+ poly3_invert_vec(out, in);
+ return;
+ }
+#endif
+
+ // This algorithm mostly follows algorithm 10 in the paper. Some changes:
+ // 1) k should start at zero, not one. In the code below k is omitted and
+ // the loop counter, |i|, is used instead.
+ // 2) The rotation count is conditionally updated to handle trailing zero
+ // coefficients.
+ // The best explanation for why it works is in the "Why it works" section of
+ // [NTRUTN14].
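+  //
+  // To keep the running time independent of secret values, the loop below
+  // always executes 2×(N-1)-1 iterations and all conditional updates are
+  // performed with constant-time masks rather than branches.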
+
+ struct poly3 c, f, g;
+ OPENSSL_memcpy(&f, in, sizeof(f));
+
+ // Set g to all ones.
+ OPENSSL_memset(&g.s, 0, sizeof(struct poly2));
+ OPENSSL_memset(&g.a, 0xff, sizeof(struct poly2));
+ g.a.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD;
+
+ struct poly3 *b = out;
+ poly3_zero(b);
+ poly3_zero(&c);
+ // Set b to one.
+ b->a.v[0] = 1;
+
+ crypto_word_t deg_f = N - 1, deg_g = N - 1, rotation = 0;
+ crypto_word_t f0s = 0, f0a = 0;
+ crypto_word_t still_going = CONSTTIME_TRUE_W;
+
+ for (unsigned i = 0; i < 2 * (N - 1) - 1; i++) {
+ const crypto_word_t s_a = lsb_to_all(
+ still_going & ((f.a.v[0] & g.s.v[0]) ^ (f.s.v[0] & g.a.v[0])));
+ const crypto_word_t s_s = lsb_to_all(
+ still_going & ((f.a.v[0] & g.a.v[0]) ^ (f.s.v[0] & g.s.v[0])));
+ const crypto_word_t should_swap =
+ (s_s | s_a) & constant_time_lt_w(deg_f, deg_g);
+
+ poly3_cswap(&f, &g, should_swap);
+ poly3_cswap(b, &c, should_swap);
+
+ const crypto_word_t deg_sum = should_swap & (deg_f ^ deg_g);
+ deg_f ^= deg_sum;
+ deg_g ^= deg_sum;
+ assert(deg_g >= 1);
+
+ poly3_fmadd(&f, &g, s_s, s_a);
+ poly3_fmadd(b, &c, s_s, s_a);
+ poly3_rshift1(&f);
+ poly3_lshift1(&c);
+
+ deg_f--;
+ const crypto_word_t f0_is_nonzero =
+ lsb_to_all(f.s.v[0]) | lsb_to_all(f.a.v[0]);
+ // |f0_is_nonzero| implies |still_going|.
+ assert(!(f0_is_nonzero && !still_going));
+ still_going &= ~constant_time_is_zero_w(deg_f);
+
+ rotation = constant_time_select_w(f0_is_nonzero, i, rotation);
+ f0s = constant_time_select_w(f0_is_nonzero, f.s.v[0], f0s);
+ f0a = constant_time_select_w(f0_is_nonzero, f.a.v[0], f0a);
+ }
+
+ rotation++;
+ rotation -= N & constant_time_lt_w(N, rotation);
+ assert(poly3_top_bits_are_clear(out));
+ poly3_rotr_consttime(out, rotation);
+ poly3_mul_const(out, f0s, f0a);
+ poly3_mod_phiN(out);
+}
+
+// Polynomials in Q.
+
+// Coefficients are reduced mod Q. (Q is clearly not prime, therefore the
+// coefficients do not form a field.)
+#define Q 8192
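+
+// Since Q is 2^13, reduction mod Q is simply a bitwise AND with Q - 1.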
+
+// VECS_PER_POLY is the number of 128-bit vectors needed to represent a
+// polynomial.
+#define COEFFICIENTS_PER_VEC (sizeof(vec_t) / sizeof(uint16_t))
+#define VECS_PER_POLY ((N + COEFFICIENTS_PER_VEC - 1) / COEFFICIENTS_PER_VEC)
+
+// poly represents a polynomial with coefficients mod Q. Note that, while Q is a
+// power of two, this does not operate in GF(Q). That would be a binary field
+// but this is simply mod Q. Thus the coefficients are not a field.
+//
+// Coefficients are ordered little-endian, thus the coefficient of x^0 is the
+// first element of the array.
+struct poly {
+#if defined(HRSS_HAVE_VECTOR_UNIT)
+ union {
+    // N + 3 = 704, which is a multiple of 64 and thus aligns things,
+    // especially for the vector code.
+ uint16_t v[N + 3];
+ vec_t vectors[VECS_PER_POLY];
+ };
+#else
+ // Even if !HRSS_HAVE_VECTOR_UNIT, external assembly may be called that
+ // requires alignment.
+ alignas(16) uint16_t v[N + 3];
+#endif
+};
+
+OPENSSL_UNUSED static void poly_print(const struct poly *p) {
+ printf("[");
+ for (unsigned i = 0; i < N; i++) {
+ if (i) {
+ printf(" ");
+ }
+ printf("%d", p->v[i]);
+ }
+ printf("]\n");
+}
+
+#if defined(HRSS_HAVE_VECTOR_UNIT)
+
+// poly_mul_vec_aux is a recursive function that multiplies |n| vectors from |a|
+// and |b| and writes 2×|n| vectors to |out|. Each call uses 2*ceil(n/2) elements
+// of |scratch| and the function recurses, except if |n| < 3, when |scratch|
+// isn't used and the recursion stops. If |n| == |VECS_PER_POLY| then |scratch|
+// needs 172 elements.
+static void poly_mul_vec_aux(vec_t *restrict out, vec_t *restrict scratch,
+ const vec_t *restrict a, const vec_t *restrict b,
+ const size_t n) {
+ // In [HRSS], the technique they used for polynomial multiplication is
+ // described: they start with Toom-4 at the top level and then two layers of
+ // Karatsuba. Karatsuba is a specific instance of the general Toom–Cook
+ // decomposition, which splits an input n-ways and produces 2n-1
+ // multiplications of those parts. So, starting with 704 coefficients (rounded
+ // up from 701 to have more factors of two), Toom-4 gives seven
+ // multiplications of degree-174 polynomials. Each round of Karatsuba (which
+ // is Toom-2) increases the number of multiplications by a factor of three
+ // while halving the size of the values being multiplied. So two rounds gives
+ // 63 multiplications of degree-44 polynomials. Then they (I think) form
+ // vectors by gathering all 63 coefficients of each power together, for each
+ // input, and doing more rounds of Karatsuba on the vectors until they bottom-
+ // out somewhere with schoolbook multiplication.
+ //
+ // I tried something like that for NEON. NEON vectors are 128 bits so hold
+ // eight coefficients. I wrote a function that did Karatsuba on eight
+ // multiplications at the same time, using such vectors, and a Go script that
+ // decomposed from degree-704, with Karatsuba in non-transposed form, until it
+ // reached multiplications of degree-44. It batched up those 81
+ // multiplications into lots of eight with a single one left over (which was
+ // handled directly).
+ //
+ // It worked, but it was significantly slower than the dumb algorithm used
+ // below. Potentially that was because I misunderstood how [HRSS] did it, or
+ // because Clang is bad at generating good code from NEON intrinsics on ARMv7.
+ // (Which is true: the code generated by Clang for the below is pretty crap.)
+ //
+ // This algorithm is much simpler. It just does Karatsuba decomposition all
+ // the way down and never transposes. When it gets down to degree-16 or
+ // degree-24 values, they are multiplied using schoolbook multiplication and
+ // vector intrinsics. The vector operations form each of the eight phase-
+ // shifts of one of the inputs, point-wise multiply, and then add into the
+ // result at the correct place. This means that 33% (degree-16) or 25%
+ // (degree-24) of the multiplies and adds are wasted, but it does ok.
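+  //
+  // In the two base cases below, each coefficient of |b| scales a
+  // word-shifted copy of |a| and the scaled copy is accumulated into |result|
+  // at the matching offset.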
+ if (n == 2) {
+ vec_t result[4];
+ vec_t vec_a[3];
+ static const vec_t kZero = {0};
+ vec_a[0] = a[0];
+ vec_a[1] = a[1];
+ vec_a[2] = kZero;
+
+ result[0] = vec_mul(vec_a[0], vec_get_word(b[0], 0));
+ result[1] = vec_mul(vec_a[1], vec_get_word(b[0], 0));
+
+ result[1] = vec_fma(result[1], vec_a[0], vec_get_word(b[1], 0));
+ result[2] = vec_mul(vec_a[1], vec_get_word(b[1], 0));
+ result[3] = kZero;
+
+ vec3_rshift_word(vec_a);
+
+#define BLOCK(x, y) \
+ do { \
+ result[x + 0] = \
+ vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
+ result[x + 1] = \
+ vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
+ result[x + 2] = \
+ vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \
+ } while (0)
+
+ BLOCK(0, 1);
+ BLOCK(1, 9);
+
+ vec3_rshift_word(vec_a);
+
+ BLOCK(0, 2);
+ BLOCK(1, 10);
+
+ vec3_rshift_word(vec_a);
+
+ BLOCK(0, 3);
+ BLOCK(1, 11);
+
+ vec3_rshift_word(vec_a);
+
+ BLOCK(0, 4);
+ BLOCK(1, 12);
+
+ vec3_rshift_word(vec_a);
+
+ BLOCK(0, 5);
+ BLOCK(1, 13);
+
+ vec3_rshift_word(vec_a);
+
+ BLOCK(0, 6);
+ BLOCK(1, 14);
+
+ vec3_rshift_word(vec_a);
+
+ BLOCK(0, 7);
+ BLOCK(1, 15);
+
+#undef BLOCK
+
+ memcpy(out, result, sizeof(result));
+ return;
+ }
+
+ if (n == 3) {
+ vec_t result[6];
+ vec_t vec_a[4];
+ static const vec_t kZero = {0};
+ vec_a[0] = a[0];
+ vec_a[1] = a[1];
+ vec_a[2] = a[2];
+ vec_a[3] = kZero;
+
+ result[0] = vec_mul(a[0], vec_get_word(b[0], 0));
+ result[1] = vec_mul(a[1], vec_get_word(b[0], 0));
+ result[2] = vec_mul(a[2], vec_get_word(b[0], 0));
+
+#define BLOCK_PRE(x, y) \
+ do { \
+ result[x + 0] = \
+ vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
+ result[x + 1] = \
+ vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
+ result[x + 2] = vec_mul(vec_a[2], vec_get_word(b[y / 8], y % 8)); \
+ } while (0)
+
+ BLOCK_PRE(1, 8);
+ BLOCK_PRE(2, 16);
+
+ result[5] = kZero;
+
+ vec4_rshift_word(vec_a);
+
+#define BLOCK(x, y) \
+ do { \
+ result[x + 0] = \
+ vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
+ result[x + 1] = \
+ vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
+ result[x + 2] = \
+ vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \
+ result[x + 3] = \
+ vec_fma(result[x + 3], vec_a[3], vec_get_word(b[y / 8], y % 8)); \
+ } while (0)
+
+ BLOCK(0, 1);
+ BLOCK(1, 9);
+ BLOCK(2, 17);
+
+ vec4_rshift_word(vec_a);
+
+ BLOCK(0, 2);
+ BLOCK(1, 10);
+ BLOCK(2, 18);
+
+ vec4_rshift_word(vec_a);
+
+ BLOCK(0, 3);
+ BLOCK(1, 11);
+ BLOCK(2, 19);
+
+ vec4_rshift_word(vec_a);
+
+ BLOCK(0, 4);
+ BLOCK(1, 12);
+ BLOCK(2, 20);
+
+ vec4_rshift_word(vec_a);
+
+ BLOCK(0, 5);
+ BLOCK(1, 13);
+ BLOCK(2, 21);
+
+ vec4_rshift_word(vec_a);
+
+ BLOCK(0, 6);
+ BLOCK(1, 14);
+ BLOCK(2, 22);
+
+ vec4_rshift_word(vec_a);
+
+ BLOCK(0, 7);
+ BLOCK(1, 15);
+ BLOCK(2, 23);
+
+#undef BLOCK
+#undef BLOCK_PRE
+
+ memcpy(out, result, sizeof(result));
+
+ return;
+ }
+
+ // Karatsuba multiplication.
+ // https://en.wikipedia.org/wiki/Karatsuba_algorithm
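+  // For reference, writing a = a_1·𝑥^k + a_0 and b = b_1·𝑥^k + b_0, the
+  // identity used below is
+  //   a×b = a_1b_1·𝑥^2k + ((a_1+a_0)(b_1+b_0) - a_1b_1 - a_0b_0)·𝑥^k + a_0b_0
+  // so only three half-sized multiplications are needed per level.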
+
+ // When |n| is odd, the two "halves" will have different lengths. The first is
+ // always the smaller.
+ const size_t low_len = n / 2;
+ const size_t high_len = n - low_len;
+ const vec_t *a_high = &a[low_len];
+ const vec_t *b_high = &b[low_len];
+
+ // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second
+ // half.
+ for (size_t i = 0; i < low_len; i++) {
+ out[i] = vec_add(a_high[i], a[i]);
+ out[high_len + i] = vec_add(b_high[i], b[i]);
+ }
+ if (high_len != low_len) {
+ out[low_len] = a_high[low_len];
+ out[high_len + low_len] = b_high[low_len];
+ }
+
+ vec_t *const child_scratch = &scratch[2 * high_len];
+ // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer.
+ poly_mul_vec_aux(scratch, child_scratch, out, &out[high_len], high_len);
+ // Calculate a_1 × b_1.
+ poly_mul_vec_aux(&out[low_len * 2], child_scratch, a_high, b_high, high_len);
+ // Calculate a_0 × b_0.
+ poly_mul_vec_aux(out, child_scratch, a, b, low_len);
+
+ // Subtract those last two products from the first.
+ for (size_t i = 0; i < low_len * 2; i++) {
+ scratch[i] = vec_sub(scratch[i], vec_add(out[i], out[low_len * 2 + i]));
+ }
+ if (low_len != high_len) {
+ scratch[low_len * 2] = vec_sub(scratch[low_len * 2], out[low_len * 4]);
+ scratch[low_len * 2 + 1] =
+ vec_sub(scratch[low_len * 2 + 1], out[low_len * 4 + 1]);
+ }
+
+ // Add the middle product into the output.
+ for (size_t i = 0; i < high_len * 2; i++) {
+ out[low_len + i] = vec_add(out[low_len + i], scratch[i]);
+ }
+}
+
+// poly_mul_vec sets |*out| to |x|×|y| mod (𝑥^n - 1).
+static void poly_mul_vec(struct poly *out, const struct poly *x,
+ const struct poly *y) {
+ OPENSSL_memset((uint16_t *)&x->v[N], 0, 3 * sizeof(uint16_t));
+ OPENSSL_memset((uint16_t *)&y->v[N], 0, 3 * sizeof(uint16_t));
+
+ OPENSSL_STATIC_ASSERT(sizeof(out->v) == sizeof(vec_t) * VECS_PER_POLY,
+ "struct poly is the wrong size");
+ OPENSSL_STATIC_ASSERT(alignof(struct poly) == alignof(vec_t),
+ "struct poly has incorrect alignment");
+
+ vec_t prod[VECS_PER_POLY * 2];
+ vec_t scratch[172];
+ poly_mul_vec_aux(prod, scratch, x->vectors, y->vectors, VECS_PER_POLY);
+
+ // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the
+ // upper-half to the lower-half. However, N is 701, which isn't a multiple of
+ // the vector size, so the upper-half vectors all have to be shifted before
+ // being added to the lower-half.
+ vec_t *out_vecs = (vec_t *)out->v;
+
+ for (size_t i = 0; i < VECS_PER_POLY; i++) {
+ const vec_t prev = prod[VECS_PER_POLY - 1 + i];
+ const vec_t this = prod[VECS_PER_POLY + i];
+ out_vecs[i] = vec_add(prod[i], vec_merge_3_5(prev, this));
+ }
+
+ OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t));
+}
+
+#endif // HRSS_HAVE_VECTOR_UNIT
+
+// poly_mul_novec_aux writes the product of |a| and |b| to |out|, using
+// |scratch| as scratch space. It'll use Karatsuba if the inputs are large
+// enough to warrant it. Each call uses 2*ceil(n/2) elements of |scratch| and
+// the function recurses, except if |n| < 64, when |scratch| isn't used and the
+// recursion stops. If |n| == |N| then |scratch| needs 1318 elements.
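+// (That figure is the sum of the scratch requirements along the deepest chain
+// of the recursion, 701 → 351 → 176 → 88 → 44, namely 702 + 352 + 176 + 88 =
+// 1318, with the final level handled by schoolbook multiplication.)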
+static void poly_mul_novec_aux(uint16_t *out, uint16_t *scratch,
+ const uint16_t *a, const uint16_t *b, size_t n) {
+ static const size_t kSchoolbookLimit = 64;
+ if (n < kSchoolbookLimit) {
+ OPENSSL_memset(out, 0, sizeof(uint16_t) * n * 2);
+ for (size_t i = 0; i < n; i++) {
+ for (size_t j = 0; j < n; j++) {
+ out[i + j] += (unsigned) a[i] * b[j];
+ }
+ }
+
+ return;
+ }
+
+ // Karatsuba multiplication.
+ // https://en.wikipedia.org/wiki/Karatsuba_algorithm
+
+ // When |n| is odd, the two "halves" will have different lengths. The
+ // first is always the smaller.
+ const size_t low_len = n / 2;
+ const size_t high_len = n - low_len;
+ const uint16_t *const a_high = &a[low_len];
+ const uint16_t *const b_high = &b[low_len];
+
+ for (size_t i = 0; i < low_len; i++) {
+ out[i] = a_high[i] + a[i];
+ out[high_len + i] = b_high[i] + b[i];
+ }
+ if (high_len != low_len) {
+ out[low_len] = a_high[low_len];
+ out[high_len + low_len] = b_high[low_len];
+ }
+
+ uint16_t *const child_scratch = &scratch[2 * high_len];
+ poly_mul_novec_aux(scratch, child_scratch, out, &out[high_len], high_len);
+ poly_mul_novec_aux(&out[low_len * 2], child_scratch, a_high, b_high,
+ high_len);
+ poly_mul_novec_aux(out, child_scratch, a, b, low_len);
+
+ for (size_t i = 0; i < low_len * 2; i++) {
+ scratch[i] -= out[i] + out[low_len * 2 + i];
+ }
+ if (low_len != high_len) {
+ scratch[low_len * 2] -= out[low_len * 4];
+ assert(out[low_len * 4 + 1] == 0);
+ }
+
+ for (size_t i = 0; i < high_len * 2; i++) {
+ out[low_len + i] += scratch[i];
+ }
+}
+
+// poly_mul_novec sets |*out| to |x|×|y| mod (𝑥^n - 1).
+static void poly_mul_novec(struct poly *out, const struct poly *x,
+ const struct poly *y) {
+ uint16_t prod[2 * N];
+ uint16_t scratch[1318];
+ poly_mul_novec_aux(prod, scratch, x->v, y->v, N);
+
+ for (size_t i = 0; i < N; i++) {
+ out->v[i] = prod[i] + prod[i + N];
+ }
+ OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t));
+}
+
+// On x86-64, we can use the AVX2 code from [HRSS]. (The authors have given
+// explicit permission for this and signed a CLA.) However it's 57KB of object
+// code, so it's not used if |OPENSSL_SMALL| is defined.
+#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && \
+ defined(OPENSSL_X86_64) && defined(OPENSSL_LINUX)
+// poly_Rq_mul is defined in assembly.
+extern void poly_Rq_mul(struct poly *r, const struct poly *a,
+ const struct poly *b);
+#endif
+
+static void poly_mul(struct poly *r, const struct poly *a,
+ const struct poly *b) {
+#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && \
+ defined(OPENSSL_X86_64) && defined(OPENSSL_LINUX)
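+  // |OPENSSL_ia32cap_P[2]| holds the CPUID extended-feature bits (leaf 7,
+  // EBX); bit 5 is the AVX2 flag, which the assembly implementation requires.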
+ const int has_avx2 = (OPENSSL_ia32cap_P[2] & (1 << 5)) != 0;
+ if (has_avx2) {
+ poly_Rq_mul(r, a, b);
+ return;
+ }
+#endif
+
+#if defined(HRSS_HAVE_VECTOR_UNIT)
+ if (vec_capable()) {
+ poly_mul_vec(r, a, b);
+ return;
+ }
+#endif
+
+ // Fallback, non-vector case.
+ poly_mul_novec(r, a, b);
+}
+
+// poly_mul_x_minus_1 sets |p| to |p|×(𝑥 - 1) mod (𝑥^n - 1).
+static void poly_mul_x_minus_1(struct poly *p) {
+ // Multiplying by (𝑥 - 1) means negating each coefficient and adding in
+ // the value of the previous one.
+ const uint16_t orig_final_coefficient = p->v[N - 1];
+
+ for (size_t i = N - 1; i > 0; i--) {
+ p->v[i] = p->v[i - 1] - p->v[i];
+ }
+ p->v[0] = orig_final_coefficient - p->v[0];
+}
+
+// poly_mod_phiN sets |p| to |p| mod Φ(N).
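+// Since Φ(N) = 1 + 𝑥 + … + 𝑥^(N-1), subtracting v[N-1]×Φ(N) (that is,
+// subtracting v[N-1] from every coefficient) leaves the value unchanged mod
+// Φ(N) while clearing the top coefficient.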
+static void poly_mod_phiN(struct poly *p) {
+ const uint16_t coeff700 = p->v[N - 1];
+
+ for (unsigned i = 0; i < N; i++) {
+ p->v[i] -= coeff700;
+ }
+}
+
+// poly_clamp reduces each coefficient mod Q.
+static void poly_clamp(struct poly *p) {
+ for (unsigned i = 0; i < N; i++) {
+ p->v[i] &= Q - 1;
+ }
+}
+
+
+// Conversion functions
+// --------------------
+
+// poly2_from_poly sets |*out| to |in| mod 2.
+static void poly2_from_poly(struct poly2 *out, const struct poly *in) {
+ crypto_word_t *words = out->v;
+ unsigned shift = 0;
+ crypto_word_t word = 0;
+
+ for (unsigned i = 0; i < N; i++) {
+ word >>= 1;
+ word |= (crypto_word_t)(in->v[i] & 1) << (BITS_PER_WORD - 1);
+ shift++;
+
+ if (shift == BITS_PER_WORD) {
+ *words = word;
+ words++;
+ word = 0;
+ shift = 0;
+ }
+ }
+
+ word >>= BITS_PER_WORD - shift;
+ *words = word;
+}
+
+// mod3 treats |a| as a signed number and returns |a| mod 3.
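+// For example, mod3(-5): q = (-5 × 21845) >> 16 = -2, so ret = -5 + 6 = 1.
+// And mod3(3): q = 0 and ret = 3, which the final expression maps to zero
+// since (ret & (ret >> 1)) is non-zero only when ret == 3.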
+static uint16_t mod3(int16_t a) {
+ const int16_t q = ((int32_t)a * 21845) >> 16;
+ int16_t ret = a - 3 * q;
+ // At this point, |ret| is in {0, 1, 2, 3} and that needs to be mapped to {0,
+ // 1, 2, 0}.
+ return ret & ((ret & (ret >> 1)) - 1);
+}
+
+// poly3_from_poly sets |*out| to |in|.
+static void poly3_from_poly(struct poly3 *out, const struct poly *in) {
+ crypto_word_t *words_s = out->s.v;
+ crypto_word_t *words_a = out->a.v;
+ crypto_word_t s = 0;
+ crypto_word_t a = 0;
+ unsigned shift = 0;
+
+ for (unsigned i = 0; i < N; i++) {
+ // This duplicates the 13th bit upwards to the top of the uint16,
+ // essentially treating it as a sign bit and converting into a signed int16.
+ // The signed value is reduced mod 3, yielding {0, 1, 2}.
+ const uint16_t v = mod3((int16_t)(in->v[i] << 3) >> 3);
+ s >>= 1;
+ s |= (crypto_word_t)(v & 2) << (BITS_PER_WORD - 2);
+ a >>= 1;
+ a |= (crypto_word_t)(v & 1) << (BITS_PER_WORD - 1);
+ shift++;
+
+ if (shift == BITS_PER_WORD) {
+ *words_s = s;
+ words_s++;
+ *words_a = a;
+ words_a++;
+ s = a = 0;
+ shift = 0;
+ }
+ }
+
+ s >>= BITS_PER_WORD - shift;
+ a >>= BITS_PER_WORD - shift;
+ *words_s = s;
+ *words_a = a;
+}
+
+// poly3_from_poly_checked sets |*out| to |in|, which has coefficients in {0, 1,
+// Q-1}. It returns a mask indicating whether all coefficients were found to be
+// in that set.
+static crypto_word_t poly3_from_poly_checked(struct poly3 *out,
+ const struct poly *in) {
+ crypto_word_t *words_s = out->s.v;
+ crypto_word_t *words_a = out->a.v;
+ crypto_word_t s = 0;
+ crypto_word_t a = 0;
+ unsigned shift = 0;
+ crypto_word_t ok = CONSTTIME_TRUE_W;
+
+ for (unsigned i = 0; i < N; i++) {
+ const uint16_t v = in->v[i];
+ // Maps {0, 1, Q-1} to {0, 1, 2}.
+ uint16_t mod3 = v & 3;
+ mod3 ^= mod3 >> 1;
+ const uint16_t expected = (uint16_t)((~((mod3 >> 1) - 1)) | mod3) % Q;
+ ok &= constant_time_eq_w(v, expected);
+
+ s >>= 1;
+ s |= (crypto_word_t)(mod3 & 2) << (BITS_PER_WORD - 2);
+ a >>= 1;
+ a |= (crypto_word_t)(mod3 & 1) << (BITS_PER_WORD - 1);
+ shift++;
+
+ if (shift == BITS_PER_WORD) {
+ *words_s = s;
+ words_s++;
+ *words_a = a;
+ words_a++;
+ s = a = 0;
+ shift = 0;
+ }
+ }
+
+ s >>= BITS_PER_WORD - shift;
+ a >>= BITS_PER_WORD - shift;
+ *words_s = s;
+ *words_a = a;
+
+ return ok;
+}
+
+static void poly_from_poly2(struct poly *out, const struct poly2 *in) {
+ const crypto_word_t *words = in->v;
+ unsigned shift = 0;
+ crypto_word_t word = *words;
+
+ for (unsigned i = 0; i < N; i++) {
+ out->v[i] = word & 1;
+ word >>= 1;
+ shift++;
+
+ if (shift == BITS_PER_WORD) {
+ words++;
+ word = *words;
+ shift = 0;
+ }
+ }
+}
+
+static void poly_from_poly3(struct poly *out, const struct poly3 *in) {
+ const crypto_word_t *words_s = in->s.v;
+ const crypto_word_t *words_a = in->a.v;
+ crypto_word_t word_s = ~(*words_s);
+ crypto_word_t word_a = *words_a;
+ unsigned shift = 0;
+
+ for (unsigned i = 0; i < N; i++) {
+ out->v[i] = (uint16_t)(word_s & 1) - 1;
+ out->v[i] |= word_a & 1;
+ word_s >>= 1;
+ word_a >>= 1;
+ shift++;
+
+ if (shift == BITS_PER_WORD) {
+ words_s++;
+ words_a++;
+ word_s = ~(*words_s);
+ word_a = *words_a;
+ shift = 0;
+ }
+ }
+}
+
+// Polynomial inversion
+// --------------------
+
+// poly_invert_mod2 sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod
+// Φ(N)), all mod 2. This isn't useful in itself, but is part of doing inversion
+// mod Q.
+static void poly_invert_mod2(struct poly *out, const struct poly *in) {
+ // This algorithm follows algorithm 10 in the paper. (Although, in contrast to
+  // the paper, k should start at zero, not one, and the rotation count needs
+ // to handle trailing zero coefficients.) The best explanation for why it
+ // works is in the "Why it works" section of [NTRUTN14].
+
+ struct poly2 b, c, f, g;
+ poly2_from_poly(&f, in);
+ OPENSSL_memset(&b, 0, sizeof(b));
+ b.v[0] = 1;
+ OPENSSL_memset(&c, 0, sizeof(c));
+
+ // Set g to all ones.
+ OPENSSL_memset(&g, 0xff, sizeof(struct poly2));
+ g.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD;
+
+ crypto_word_t deg_f = N - 1, deg_g = N - 1, rotation = 0;
+ crypto_word_t still_going = CONSTTIME_TRUE_W;
+
+ for (unsigned i = 0; i < 2 * (N - 1) - 1; i++) {
+ const crypto_word_t s = still_going & lsb_to_all(f.v[0]);
+ const crypto_word_t should_swap = s & constant_time_lt_w(deg_f, deg_g);
+ poly2_cswap(&f, &g, should_swap);
+ poly2_cswap(&b, &c, should_swap);
+ const crypto_word_t deg_sum = should_swap & (deg_f ^ deg_g);
+ deg_f ^= deg_sum;
+ deg_g ^= deg_sum;
+ assert(deg_g >= 1);
+ poly2_fmadd(&f, &g, s);
+ poly2_fmadd(&b, &c, s);
+
+ poly2_rshift1(&f);
+ poly2_lshift1(&c);
+
+ deg_f--;
+ const crypto_word_t f0_is_nonzero = lsb_to_all(f.v[0]);
+ // |f0_is_nonzero| implies |still_going|.
+ assert(!(f0_is_nonzero && !still_going));
+ rotation = constant_time_select_w(f0_is_nonzero, i, rotation);
+ still_going &= ~constant_time_is_zero_w(deg_f);
+ }
+
+ rotation++;
+ rotation -= N & constant_time_lt_w(N, rotation);
+ assert(poly2_top_bits_are_clear(&b));
+ HRSS_poly2_rotr_consttime(&b, rotation);
+ poly_from_poly2(out, &b);
+}
+
+// poly_invert sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod Φ(N)).
+static void poly_invert(struct poly *out, const struct poly *in) {
+ // Inversion mod Q, which is done based on the result of inverting mod
+ // 2. See [NTRUTN14] paper, bottom of page two.
+ struct poly a, *b, tmp;
+
+ // a = -in.
+ for (unsigned i = 0; i < N; i++) {
+ a.v[i] = -in->v[i];
+ }
+
+ // b = in^-1 mod 2.
+ b = out;
+ poly_invert_mod2(b, in);
+
+ // We are working mod Q=2**13 and we need to iterate ceil(log_2(13))
+ // times, which is four.
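+  // Each iteration is a Hensel/Newton step: if b×in ≡ 1 mod 2^k then
+  // (b×(2 - in×b))×in ≡ 1 mod 2^2k. Since |a| is -in, adding two to the
+  // constant term of a×b below computes exactly 2 - in×b, so the precision
+  // doubles each round: 2 → 4 → 8 → 16 ≥ 13 bits.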
+ for (unsigned i = 0; i < 4; i++) {
+ poly_mul(&tmp, &a, b);
+ tmp.v[0] += 2;
+ poly_mul(b, b, &tmp);
+ }
+}
+
+// Marshal and unmarshal functions for various basic types.
+// --------------------------------------------------------
+
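+// POLY_BYTES is the size of a marshaled |poly|: 700 of the 701 coefficients
+// are packed 13 bits each (the final coefficient is implicit and is
+// reconstructed by |poly_unmarshal|), and ceil(700 × 13 / 8) = 1138.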
+#define POLY_BYTES 1138
+
+static void poly_marshal(uint8_t out[POLY_BYTES], const struct poly *in) {
+ const uint16_t *p = in->v;
+
+ for (size_t i = 0; i < N / 8; i++) {
+ out[0] = p[0];
+ out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5);
+ out[2] = p[1] >> 3;
+ out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2);
+ out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7);
+ out[5] = p[3] >> 1;
+ out[6] = (0xf & (p[3] >> 9)) | ((p[4] & 0x0f) << 4);
+ out[7] = p[4] >> 4;
+ out[8] = (1 & (p[4] >> 12)) | ((p[5] & 0x7f) << 1);
+ out[9] = (0x3f & (p[5] >> 7)) | ((p[6] & 0x03) << 6);
+ out[10] = p[6] >> 2;
+ out[11] = (7 & (p[6] >> 10)) | ((p[7] & 0x1f) << 3);
+ out[12] = p[7] >> 5;
+
+ p += 8;
+ out += 13;
+ }
+
+ // There are four remaining values.
+ out[0] = p[0];
+ out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5);
+ out[2] = p[1] >> 3;
+ out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2);
+ out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7);
+ out[5] = p[3] >> 1;
+ out[6] = 0xf & (p[3] >> 9);
+}
+
+static void poly_unmarshal(struct poly *out, const uint8_t in[POLY_BYTES]) {
+ uint16_t *p = out->v;
+
+ for (size_t i = 0; i < N / 8; i++) {
+ p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8;
+ p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 |
+ (uint16_t)(in[3] & 3) << 11;
+ p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6;
+ p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 |
+ (uint16_t)(in[6] & 0xf) << 9;
+ p[4] = (uint16_t)(in[6] >> 4) | (uint16_t)(in[7]) << 4 |
+ (uint16_t)(in[8] & 1) << 12;
+ p[5] = (uint16_t)(in[8] >> 1) | (uint16_t)(in[9] & 0x3f) << 7;
+ p[6] = (uint16_t)(in[9] >> 6) | (uint16_t)(in[10]) << 2 |
+ (uint16_t)(in[11] & 7) << 10;
+ p[7] = (uint16_t)(in[11] >> 3) | (uint16_t)(in[12]) << 5;
+
+ p += 8;
+ in += 13;
+ }
+
+ // There are four coefficients remaining.
+ p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8;
+ p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 |
+ (uint16_t)(in[3] & 3) << 11;
+ p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6;
+ p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 |
+ (uint16_t)(in[6] & 0xf) << 9;
+
+ for (unsigned i = 0; i < N - 1; i++) {
+ out->v[i] = (int16_t)(out->v[i] << 3) >> 3;
+ }
+
+ // There are four unused bits at the top of the final byte. They are always
+ // marshaled as zero by this code but we allow them to take any value when
+ // parsing in order to support future extension.
+
+  // Set the final coefficient as specified in [HRSSNIST] 1.9.2 step 6.
+ uint32_t sum = 0;
+ for (size_t i = 0; i < N - 1; i++) {
+ sum += out->v[i];
+ }
+
+ out->v[N - 1] = (uint16_t)(0u - sum);
+}
+
+// mod3_from_modQ maps {0, 1, Q-1, 65535} -> {0, 1, 2, 2}. Note that |v| may
+// have an invalid value when processing attacker-controlled inputs.
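+// For example: 0 → 0, 1 → 1, Q-1 = 0x1fff → 3 → 3 ^ 1 = 2, and 0xffff → 2.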
+static uint16_t mod3_from_modQ(uint16_t v) {
+ v &= 3;
+ return v ^ (v >> 1);
+}
+
+// poly_marshal_mod3 marshals |in| to |out| where the coefficients of |in| are
+// all in {0, 1, Q-1, 65535} and |in| is mod Φ(N). (Note that coefficients may
+// have invalid values when processing attacker-controlled inputs.)
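+// Each output byte packs five base-3 coefficients (since 3^5 = 243 ≤ 255), so
+// HRSS_POLY3_BYTES is 700 / 5 = 140. For example, the coefficients
+// (1, 2, 0, 0, 1) encode as 1 + 2×3 + 0×9 + 0×27 + 1×81 = 88.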
+static void poly_marshal_mod3(uint8_t out[HRSS_POLY3_BYTES],
+ const struct poly *in) {
+ const uint16_t *coeffs = in->v;
+
+ // Only 700 coefficients are marshaled because in[700] must be zero.
+ assert(coeffs[N-1] == 0);
+
+ for (size_t i = 0; i < HRSS_POLY3_BYTES; i++) {
+ const uint16_t coeffs0 = mod3_from_modQ(coeffs[0]);
+ const uint16_t coeffs1 = mod3_from_modQ(coeffs[1]);
+ const uint16_t coeffs2 = mod3_from_modQ(coeffs[2]);
+ const uint16_t coeffs3 = mod3_from_modQ(coeffs[3]);
+ const uint16_t coeffs4 = mod3_from_modQ(coeffs[4]);
+ out[i] = coeffs0 + coeffs1 * 3 + coeffs2 * 9 + coeffs3 * 27 + coeffs4 * 81;
+ coeffs += 5;
+ }
+}
+
+// HRSS-specific functions
+// -----------------------
+
+// poly_short_sample implements the sampling algorithm given in [HRSSNIST]
+// section 1.8.1. The output coefficients are in {0, 1, 0xffff} which makes some
+// later computation easier.
+static void poly_short_sample(struct poly *out,
+ const uint8_t in[HRSS_SAMPLE_BYTES]) {
+ // We wish to calculate the difference (mod 3) between two, two-bit numbers.
+ // Here is a table of results for a - b. Negative one is written as 0b11 so
+ // that a couple of shifts can be used to sign-extend it. Any input value of
+ // 0b11 is invalid and a convention is adopted that an invalid input results
+ // in an invalid output (0b10).
+ //
+ // b a result
+ // 00 00 00
+ // 00 01 01
+ // 00 10 11
+ // 00 11 10
+ // 01 00 11
+ // 01 01 00
+ // 01 10 01
+ // 01 11 10
+ // 10 00 01
+ // 10 01 11
+ // 10 10 00
+ // 10 11 10
+ // 11 00 10
+ // 11 01 10
+ // 11 10 10
+ // 11 11 10
+ //
+  // The result column is encoded, two bits per entry with entry zero in the
+  // most-significant position, in a single-word lookup-table:
+  //   0001 1101 1100 0110 0111 0010 1010 1010
+  //      1    d    c    6    7    2    a    a
+  // (The entry for the impossible input b=00, a=11 is 01 rather than the 10
+  // given in the table above; that index never arises here because each
+  // two-bit value below is the sum of two single bits and so is at most two.)
+ static const uint32_t kLookup = 0x1dc672aa;
+
+ // In order to generate pairs of numbers mod 3 (non-uniformly) we treat pairs
+ // of bits in a uint32 as separate values and sum two random vectors of 1-bit
+  // numbers. This works because the pairs are isolated from one another: no
+  // carry can spread between them.
+
+ uint16_t *p = out->v;
+ for (size_t i = 0; i < N / 8; i++) {
+ uint32_t v;
+ OPENSSL_memcpy(&v, in, sizeof(v));
+ in += sizeof(v);
+
+ uint32_t sums = (v & 0x55555555) + ((v >> 1) & 0x55555555);
+ for (unsigned j = 0; j < 8; j++) {
+ p[j] = (int32_t)(kLookup << ((sums & 15) << 1)) >> 30;
+ sums >>= 4;
+ }
+ p += 8;
+ }
+
+ // There are four values remaining.
+ uint16_t v;
+ OPENSSL_memcpy(&v, in, sizeof(v));
+
+ uint16_t sums = (v & 0x5555) + ((v >> 1) & 0x5555);
+ for (unsigned j = 0; j < 4; j++) {
+ p[j] = (int32_t)(kLookup << ((sums & 15) << 1)) >> 30;
+ sums >>= 4;
+ }
+
+ out->v[N - 1] = 0;
+}
+
+// poly_short_sample_plus performs the T+ sample as defined in [HRSSNIST],
+// section 1.8.2.
+static void poly_short_sample_plus(struct poly *out,
+ const uint8_t in[HRSS_SAMPLE_BYTES]) {
+ poly_short_sample(out, in);
+
+ // sum (and the product in the for loop) will overflow. But that's fine
+  // because |sum| is bounded by +/- (N-2), and N < 2^15 so it works out.
+ uint16_t sum = 0;
+ for (unsigned i = 0; i < N - 2; i++) {
+ sum += (unsigned) out->v[i] * out->v[i + 1];
+ }
+
+ // If the sum is negative, flip the sign of even-positioned coefficients. (See
+ // page 8 of [HRSS].)
+ sum = ((int16_t) sum) >> 15;
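+  // After the arithmetic shift, |sum| is either 0 or 0xffff, so |scale| below
+  // is either 1 or 0xffff, i.e. ±1 as a uint16_t.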
+ const uint16_t scale = sum | (~sum & 1);
+ for (unsigned i = 0; i < N; i += 2) {
+ out->v[i] = (unsigned) out->v[i] * scale;
+ }
+}
+
+// poly_lift computes the function discussed in [HRSS], appendix B.
+static void poly_lift(struct poly *out, const struct poly *a) {
+ // We wish to calculate a/(𝑥-1) mod Φ(N) over GF(3), where Φ(N) is the
+ // Nth cyclotomic polynomial, i.e. 1 + 𝑥 + … + 𝑥^700 (since N is prime).
+
+ // 1/(𝑥-1) has a fairly basic structure that we can exploit to speed this up:
+ //
+ // R.<x> = PolynomialRing(GF(3)…)
+ // inv = R.cyclotomic_polynomial(1).inverse_mod(R.cyclotomic_polynomial(n))
+ // list(inv)[:15]
+ // [1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2]
+ //
+ // This three-element pattern of coefficients repeats for the whole
+ // polynomial.
+ //
+ // Next define the overbar operator such that z̅ = z[0] +
+ // reverse(z[1:]). (Index zero of a polynomial here is the coefficient
+ // of the constant term. So index one is the coefficient of 𝑥 and so
+ // on.)
+ //
+ // A less odd way to define this is to see that z̅ negates the indexes,
+ // so z̅[0] = z[-0], z̅[1] = z[-1] and so on.
+ //
+ // The use of z̅ is that, when working mod (𝑥^701 - 1), vz[0] = <v,
+ // z̅>, vz[1] = <v, 𝑥z̅>, …. (Where <a, b> is the inner product: the sum
+ // of the point-wise products.) Although we calculated the inverse mod
+ // Φ(N), we can work mod (𝑥^N - 1) and reduce mod Φ(N) at the end.
+ // (That's because (𝑥^N - 1) is a multiple of Φ(N).)
+ //
+ // When working mod (𝑥^N - 1), multiplication by 𝑥 is a right-rotation
+ // of the list of coefficients.
+ //
+ // Thus we can consider what the pattern of z̅, 𝑥z̅, 𝑥^2z̅, … looks like:
+ //
+ // def reverse(xs):
+ // suffix = list(xs[1:])
+ // suffix.reverse()
+ // return [xs[0]] + suffix
+ //
+ // def rotate(xs):
+ // return [xs[-1]] + xs[:-1]
+ //
+ // zoverbar = reverse(list(inv) + [0])
+ // xzoverbar = rotate(reverse(list(inv) + [0]))
+ // x2zoverbar = rotate(rotate(reverse(list(inv) + [0])))
+ //
+ // zoverbar[:15]
+ // [1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1]
+ // xzoverbar[:15]
+ // [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
+ // x2zoverbar[:15]
+ // [2, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]
+ //
+ // (For a formula for z̅, see lemma two of appendix B.)
+ //
+  // After the first three elements have been taken care of, each sequence has
+ // a repeating three-element cycle. The next value (𝑥^3z̅) involves
+ // three rotations of the first pattern, thus the three-element cycle
+ // lines up. However, the discontinuity in the first three elements
+ // obviously moves to a different position. Consider the difference
+ // between 𝑥^3z̅ and z̅:
+ //
+ // [x-y for (x,y) in zip(zoverbar, x3zoverbar)][:15]
+ // [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ //
+ // This pattern of differences is the same for all elements, although it
+ // obviously moves right with the rotations.
+ //
+ // From this, we reach algorithm eight of appendix B.
+
+ // Handle the first three elements of the inner products.
+ out->v[0] = a->v[0] + a->v[2];
+ out->v[1] = a->v[1];
+ out->v[2] = -a->v[0] + a->v[2];
+
+ // s0, s1, s2 are added into out->v[0], out->v[1], and out->v[2],
+  // respectively. We do not compute s1 because it's just -(s0 + s2).
+ uint16_t s0 = 0, s2 = 0;
+ for (size_t i = 3; i < 699; i += 3) {
+ s0 += -a->v[i] + a->v[i + 2];
+ // s1 += a->v[i] - a->v[i + 1];
+ s2 += a->v[i + 1] - a->v[i + 2];
+ }
+
+ // Handle the fact that the three-element pattern doesn't fill the
+ // polynomial exactly (since 701 isn't a multiple of three).
+ s0 -= a->v[699];
+ // s1 += a->v[699] - a->v[700];
+ s2 += a->v[700];
+
+ // Note that s0 + s1 + s2 = 0.
+ out->v[0] += s0;
+ out->v[1] -= (s0 + s2); // = s1
+ out->v[2] += s2;
+
+ // Calculate the remaining inner products by taking advantage of the
+ // fact that the pattern repeats every three cycles and the pattern of
+ // differences moves with the rotation.
+ for (size_t i = 3; i < N; i++) {
+ out->v[i] = (out->v[i - 3] - (a->v[i - 2] + a->v[i - 1] + a->v[i]));
+ }
+
+ // Reduce mod Φ(N) by subtracting a multiple of out[700] from every
+ // element and convert to mod Q. (See above about adding twice as
+ // subtraction.)
+ const crypto_word_t v = out->v[700];
+ for (unsigned i = 0; i < N; i++) {
+ const uint16_t vi_mod3 = mod3(out->v[i] - v);
+ // Map {0, 1, 2} to {0, 1, 0xffff}.
+ out->v[i] = (~((vi_mod3 >> 1) - 1)) | vi_mod3;
+ }
+
+ poly_mul_x_minus_1(out);
+}
+
+struct public_key {
+ struct poly ph;
+};
+
+struct private_key {
+ struct poly3 f, f_inverse;
+ struct poly ph_inverse;
+ uint8_t hmac_key[32];
+};
+
+// public_key_from_external converts an external public key pointer into an
+// internal one. Externally the alignment is only specified to be eight bytes
+// but we need 16-byte alignment. We could annotate the external struct with
+// that alignment but we can only assume that malloced pointers are 8-byte
+// aligned in any case. (Even if the underlying malloc returns values with
+// 16-byte alignment, |OPENSSL_malloc| will store an 8-byte size prefix and mess
+// that up.)
+static struct public_key *public_key_from_external(
+ struct HRSS_public_key *ext) {
+ OPENSSL_STATIC_ASSERT(
+ sizeof(struct HRSS_public_key) >= sizeof(struct public_key) + 15,
+ "HRSS public key too small");
+
+ uintptr_t p = (uintptr_t)ext;
+ p = (p + 15) & ~15;
+ return (struct public_key *)p;
+}
+
+// private_key_from_external does the same thing as |public_key_from_external|,
+// but for private keys. See the comment on that function about alignment
+// issues.
+static struct private_key *private_key_from_external(
+ struct HRSS_private_key *ext) {
+ OPENSSL_STATIC_ASSERT(
+ sizeof(struct HRSS_private_key) >= sizeof(struct private_key) + 15,
+ "HRSS private key too small");
+
+ uintptr_t p = (uintptr_t)ext;
+ p = (p + 15) & ~15;
+ return (struct private_key *)p;
+}
+
+void HRSS_generate_key(
+ struct HRSS_public_key *out_pub, struct HRSS_private_key *out_priv,
+ const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES + 32]) {
+ struct public_key *pub = public_key_from_external(out_pub);
+ struct private_key *priv = private_key_from_external(out_priv);
+
+ OPENSSL_memcpy(priv->hmac_key, in + 2 * HRSS_SAMPLE_BYTES,
+ sizeof(priv->hmac_key));
+
+ struct poly f;
+ poly_short_sample_plus(&f, in);
+ poly3_from_poly(&priv->f, &f);
+ HRSS_poly3_invert(&priv->f_inverse, &priv->f);
+
+ // pg_phi1 is p (i.e. 3) × g × Φ(1) (i.e. 𝑥-1).
+ struct poly pg_phi1;
+ poly_short_sample_plus(&pg_phi1, in + HRSS_SAMPLE_BYTES);
+ for (unsigned i = 0; i < N; i++) {
+ pg_phi1.v[i] *= 3;
+ }
+ poly_mul_x_minus_1(&pg_phi1);
+
+ struct poly pfg_phi1;
+ poly_mul(&pfg_phi1, &f, &pg_phi1);
+
+ struct poly pfg_phi1_inverse;
+ poly_invert(&pfg_phi1_inverse, &pfg_phi1);
+
+ poly_mul(&pub->ph, &pfg_phi1_inverse, &pg_phi1);
+ poly_mul(&pub->ph, &pub->ph, &pg_phi1);
+ poly_clamp(&pub->ph);
+
+ poly_mul(&priv->ph_inverse, &pfg_phi1_inverse, &f);
+ poly_mul(&priv->ph_inverse, &priv->ph_inverse, &f);
+ poly_clamp(&priv->ph_inverse);
+}
+
+static void owf(uint8_t out[POLY_BYTES], const struct public_key *pub,
+ const struct poly *m_lifted, const struct poly *r) {
+ struct poly prh_plus_m;
+ poly_mul(&prh_plus_m, r, &pub->ph);
+ for (unsigned i = 0; i < N; i++) {
+ prh_plus_m.v[i] += m_lifted->v[i];
+ }
+
+ poly_marshal(out, &prh_plus_m);
+}
+
+static const char kSharedKey[] = "shared key";
+
+void HRSS_encap(uint8_t out_ciphertext[POLY_BYTES],
+ uint8_t out_shared_key[32],
+ const struct HRSS_public_key *in_pub,
+ const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES]) {
+ const struct public_key *pub =
+ public_key_from_external((struct HRSS_public_key *)in_pub);
+ struct poly m, r, m_lifted;
+ poly_short_sample(&m, in);
+ poly_short_sample(&r, in + HRSS_SAMPLE_BYTES);
+ poly_lift(&m_lifted, &m);
+ owf(out_ciphertext, pub, &m_lifted, &r);
+
+ uint8_t m_bytes[HRSS_POLY3_BYTES], r_bytes[HRSS_POLY3_BYTES];
+ poly_marshal_mod3(m_bytes, &m);
+ poly_marshal_mod3(r_bytes, &r);
+
+ SHA256_CTX hash_ctx;
+ SHA256_Init(&hash_ctx);
+ SHA256_Update(&hash_ctx, kSharedKey, sizeof(kSharedKey));
+ SHA256_Update(&hash_ctx, m_bytes, sizeof(m_bytes));
+ SHA256_Update(&hash_ctx, r_bytes, sizeof(r_bytes));
+ SHA256_Update(&hash_ctx, out_ciphertext, POLY_BYTES);
+ SHA256_Final(out_shared_key, &hash_ctx);
+}
+
+void HRSS_decap(uint8_t out_shared_key[HRSS_KEY_BYTES],
+ const struct HRSS_public_key *in_pub,
+ const struct HRSS_private_key *in_priv,
+ const uint8_t *ciphertext, size_t ciphertext_len) {
+ const struct public_key *pub =
+ public_key_from_external((struct HRSS_public_key *)in_pub);
+ const struct private_key *priv =
+ private_key_from_external((struct HRSS_private_key *)in_priv);
+
+ // This is HMAC, expanded inline rather than using the |HMAC| function so that
+ // we can avoid dealing with possible allocation failures and so keep this
+ // function infallible.
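+  // Concretely, the following computes
+  //   HMAC-SHA-256(hmac_key, ciphertext) =
+  //     SHA-256((key ^ 0x5c…) || SHA-256((key ^ 0x36…) || ciphertext))
+  // with the key zero-padded to the SHA-256 block size before XORing.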
+ uint8_t masked_key[SHA256_CBLOCK];
+ OPENSSL_STATIC_ASSERT(sizeof(priv->hmac_key) <= sizeof(masked_key),
+ "HRSS HMAC key larger than SHA-256 block size");
+ for (size_t i = 0; i < sizeof(priv->hmac_key); i++) {
+ masked_key[i] = priv->hmac_key[i] ^ 0x36;
+ }
+ OPENSSL_memset(masked_key + sizeof(priv->hmac_key), 0x36,
+ sizeof(masked_key) - sizeof(priv->hmac_key));
+
+ SHA256_CTX hash_ctx;
+ SHA256_Init(&hash_ctx);
+ SHA256_Update(&hash_ctx, masked_key, sizeof(masked_key));
+ SHA256_Update(&hash_ctx, ciphertext, ciphertext_len);
+ uint8_t inner_digest[SHA256_DIGEST_LENGTH];
+ SHA256_Final(inner_digest, &hash_ctx);
+
+ for (size_t i = 0; i < sizeof(priv->hmac_key); i++) {
+ masked_key[i] ^= (0x5c ^ 0x36);
+ }
+ OPENSSL_memset(masked_key + sizeof(priv->hmac_key), 0x5c,
+ sizeof(masked_key) - sizeof(priv->hmac_key));
+
+ SHA256_Init(&hash_ctx);
+ SHA256_Update(&hash_ctx, masked_key, sizeof(masked_key));
+ SHA256_Update(&hash_ctx, inner_digest, sizeof(inner_digest));
+ OPENSSL_STATIC_ASSERT(HRSS_KEY_BYTES == SHA256_DIGEST_LENGTH,
+ "HRSS shared key length incorrect");
+ SHA256_Final(out_shared_key, &hash_ctx);
+
+ // If the ciphertext is publicly invalid then a random shared key is still
+  // returned to simplify the logic of the caller, but this path is not constant
+ // time.
+ if (ciphertext_len != HRSS_CIPHERTEXT_BYTES) {
+ return;
+ }
+
+ struct poly c;
+ poly_unmarshal(&c, ciphertext);
+
+ struct poly f;
+ poly_from_poly3(&f, &priv->f);
+
+ struct poly cf;
+ poly_mul(&cf, &c, &f);
+
+ struct poly3 cf3;
+ poly3_from_poly(&cf3, &cf);
+ // Note that cf3 is not reduced mod Φ(N). That reduction is deferred.
+
+ struct poly3 m3;
+ HRSS_poly3_mul(&m3, &cf3, &priv->f_inverse);
+
+ struct poly m, m_lifted;
+ poly_from_poly3(&m, &m3);
+ poly_lift(&m_lifted, &m);
+
+ for (unsigned i = 0; i < N; i++) {
+ c.v[i] -= m_lifted.v[i];
+ }
+ poly_mul(&c, &c, &priv->ph_inverse);
+ poly_mod_phiN(&c);
+ poly_clamp(&c);
+
+ struct poly3 r3;
+ crypto_word_t ok = poly3_from_poly_checked(&r3, &c);
+
+ uint8_t expected_ciphertext[HRSS_CIPHERTEXT_BYTES];
+ OPENSSL_STATIC_ASSERT(HRSS_CIPHERTEXT_BYTES == POLY_BYTES,
+ "ciphertext is the wrong size");
+ assert(ciphertext_len == sizeof(expected_ciphertext));
+ owf(expected_ciphertext, pub, &m_lifted, &c);
+
+ uint8_t m_bytes[HRSS_POLY3_BYTES];
+ uint8_t r_bytes[HRSS_POLY3_BYTES];
+ poly_marshal_mod3(m_bytes, &m);
+ poly_marshal_mod3(r_bytes, &c);
+
+ ok &= constant_time_is_zero_w(CRYPTO_memcmp(ciphertext, expected_ciphertext,
+ sizeof(expected_ciphertext)));
+
+ uint8_t shared_key[32];
+ SHA256_Init(&hash_ctx);
+ SHA256_Update(&hash_ctx, kSharedKey, sizeof(kSharedKey));
+ SHA256_Update(&hash_ctx, m_bytes, sizeof(m_bytes));
+ SHA256_Update(&hash_ctx, r_bytes, sizeof(r_bytes));
+ SHA256_Update(&hash_ctx, expected_ciphertext, sizeof(expected_ciphertext));
+ SHA256_Final(shared_key, &hash_ctx);
+
+ for (unsigned i = 0; i < sizeof(shared_key); i++) {
+ out_shared_key[i] =
+ constant_time_select_8(ok, shared_key[i], out_shared_key[i]);
+ }
+}
+
+void HRSS_marshal_public_key(uint8_t out[HRSS_PUBLIC_KEY_BYTES],
+ const struct HRSS_public_key *in_pub) {
+ const struct public_key *pub =
+ public_key_from_external((struct HRSS_public_key *)in_pub);
+ poly_marshal(out, &pub->ph);
+}
+
+int HRSS_parse_public_key(struct HRSS_public_key *out,
+ const uint8_t in[HRSS_PUBLIC_KEY_BYTES]) {
+ struct public_key *pub = public_key_from_external(out);
+ poly_unmarshal(&pub->ph, in);
+ OPENSSL_memset(&pub->ph.v[N], 0, 3 * sizeof(uint16_t));
+ return 1;
+}
diff --git a/src/crypto/hrss/hrss_test.cc b/src/crypto/hrss/hrss_test.cc
new file mode 100644
index 00000000..ead717d6
--- /dev/null
+++ b/src/crypto/hrss/hrss_test.cc
@@ -0,0 +1,472 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <gtest/gtest.h>
+
+#include <openssl/hrss.h>
+#include <openssl/rand.h>
+
+#include "../test/test_util.h"
+#include "internal.h"
+
+// poly2_from_bits takes the least-significant bit from each byte of |in| and
+// sets the bits of |*out| to match.
+static void poly2_from_bits(struct poly2 *out, const uint8_t in[N]) {
+ crypto_word_t *words = out->v;
+ unsigned shift = 0;
+ crypto_word_t word = 0;
+
+ for (unsigned i = 0; i < N; i++) {
+ word >>= 1;
+ word |= (crypto_word_t)(in[i] & 1) << (BITS_PER_WORD - 1);
+ shift++;
+
+ if (shift == BITS_PER_WORD) {
+ *words = word;
+ words++;
+ word = 0;
+ shift = 0;
+ }
+ }
+
+ word >>= BITS_PER_WORD - shift;
+ *words = word;
+}
+
+TEST(HRSS, Poly2RotateRight) {
+ uint8_t bits[N];
+ RAND_bytes(bits, sizeof(bits));
+ for (size_t i = 0; i < N; i++) {
+ bits[i] &= 1;
+ };
+
+ struct poly2 p, orig, shifted;
+ poly2_from_bits(&p, bits);
+ OPENSSL_memcpy(&orig, &p, sizeof(orig));
+
+ // Test |HRSS_poly2_rotr_consttime| by manually rotating |bits| step-by-step
+ // and testing every possible shift to ensure that it produces the correct
+ // answer.
+ for (size_t shift = 0; shift <= N; shift++) {
+ SCOPED_TRACE(shift);
+
+ OPENSSL_memcpy(&p, &orig, sizeof(orig));
+ HRSS_poly2_rotr_consttime(&p, shift);
+ poly2_from_bits(&shifted, bits);
+ ASSERT_EQ(
+ Bytes(reinterpret_cast<const uint8_t *>(&shifted), sizeof(shifted)),
+ Bytes(reinterpret_cast<const uint8_t *>(&p), sizeof(p)));
+
+ const uint8_t least_significant_bit = bits[0];
+ OPENSSL_memmove(bits, &bits[1], N-1);
+ bits[N-1] = least_significant_bit;
+ }
+}
+
+// poly3_rand sets |p| to a random value (albeit with bias).
+static void poly3_rand(poly3 *p) {
+ RAND_bytes(reinterpret_cast<uint8_t *>(p), sizeof(poly3));
+ p->s.v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1;
+ p->a.v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1;
+ // (s, a) = (1, 1) is invalid. Map those to one.
+ for (size_t j = 0; j < WORDS_PER_POLY; j++) {
+ p->s.v[j] ^= p->s.v[j] & p->a.v[j];
+ }
+}
+
+// poly3_word_add sets (|s1|, |a1|) += (|s2|, |a2|).
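+// In this representation 0, 1 and -1 are encoded as (s, a) = (0, 0), (0, 1)
+// and (1, 0) respectively (matching |poly3_from_poly|); (1, 1) is invalid.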
+static void poly3_word_add(crypto_word_t *s1, crypto_word_t *a1,
+ const crypto_word_t s2, const crypto_word_t a2) {
+ const crypto_word_t x = *a1 ^ a2;
+ const crypto_word_t y = (*s1 ^ s2) ^ (*a1 & a2);
+ const crypto_word_t z = *s1 & s2;
+ *s1 = y & ~x;
+ *a1 = z | (x & ~y);
+}
+
+TEST(HRSS, Poly3Invert) {
+ poly3 p, inverse, result;
+ memset(&p, 0, sizeof(p));
+ memset(&inverse, 0, sizeof(inverse));
+ memset(&result, 0, sizeof(result));
+
+ // The inverse of -1 is -1.
+ p.s.v[0] = 1;
+ HRSS_poly3_invert(&inverse, &p);
+ EXPECT_EQ(Bytes(reinterpret_cast<const uint8_t*>(&p), sizeof(p)),
+ Bytes(reinterpret_cast<const uint8_t*>(&inverse), sizeof(inverse)));
+
+ // The inverse of 1 is 1.
+ p.s.v[0] = 0;
+ p.a.v[0] = 1;
+ HRSS_poly3_invert(&inverse, &p);
+ EXPECT_EQ(Bytes(reinterpret_cast<const uint8_t*>(&p), sizeof(p)),
+ Bytes(reinterpret_cast<const uint8_t*>(&inverse), sizeof(inverse)));
+
+ for (size_t i = 0; i < 500; i++) {
+ poly3 r;
+ poly3_rand(&r);
+ HRSS_poly3_invert(&inverse, &r);
+ HRSS_poly3_mul(&result, &inverse, &r);
+ // r×r⁻¹ = 1, and |p| contains 1.
+ EXPECT_EQ(
+ Bytes(reinterpret_cast<const uint8_t *>(&p), sizeof(p)),
+ Bytes(reinterpret_cast<const uint8_t *>(&result), sizeof(result)));
+ }
+}
+
+TEST(HRSS, Poly3UnreducedInput) {
+ // Check that |poly3_mul| works correctly with inputs that aren't reduced mod
+ // Φ(N).
+ poly3 r, inverse, result, one;
+ poly3_rand(&r);
+ HRSS_poly3_invert(&inverse, &r);
+ HRSS_poly3_mul(&result, &inverse, &r);
+
+ memset(&one, 0, sizeof(one));
+ one.a.v[0] = 1;
+ EXPECT_EQ(Bytes(reinterpret_cast<const uint8_t *>(&one), sizeof(one)),
+ Bytes(reinterpret_cast<const uint8_t *>(&result), sizeof(result)));
+
+ // |r| is probably already not reduced mod Φ(N), but add x^701 - 1 and
+ // recompute to ensure that we get the same answer. (Since (x^701 - 1) ≡ 0 mod
+ // Φ(N).)
+ poly3_word_add(&r.s.v[0], &r.a.v[0], 1, 0);
+ poly3_word_add(&r.s.v[WORDS_PER_POLY - 1], &r.a.v[WORDS_PER_POLY - 1], 0,
+ UINT64_C(1) << BITS_IN_LAST_WORD);
+
+ HRSS_poly3_mul(&result, &inverse, &r);
+ EXPECT_EQ(Bytes(reinterpret_cast<const uint8_t *>(&one), sizeof(one)),
+ Bytes(reinterpret_cast<const uint8_t *>(&result), sizeof(result)));
+
+ // Check that x^700 × 1 gives -x^699 - x^698 … -1.
+ poly3 x700;
+ memset(&x700, 0, sizeof(x700));
+ x700.a.v[WORDS_PER_POLY-1] = UINT64_C(1) << (BITS_IN_LAST_WORD - 1);
+ HRSS_poly3_mul(&result, &one, &x700);
+
+ for (size_t i = 0; i < WORDS_PER_POLY-1; i++) {
+ EXPECT_EQ(CONSTTIME_TRUE_W, result.s.v[i]);
+ EXPECT_EQ(0u, result.a.v[i]);
+ }
+ EXPECT_EQ((UINT64_C(1) << (BITS_IN_LAST_WORD - 1)) - 1,
+ result.s.v[WORDS_PER_POLY - 1]);
+ EXPECT_EQ(0u, result.a.v[WORDS_PER_POLY - 1]);
+}
+
+TEST(HRSS, Basic) {
+ uint8_t generate_key_entropy[HRSS_GENERATE_KEY_BYTES];
+ for (unsigned i = 0; i < sizeof(generate_key_entropy); i++) {
+ generate_key_entropy[i] = i;
+ }
+
+ HRSS_public_key pub;
+ HRSS_private_key priv;
+ HRSS_generate_key(&pub, &priv, generate_key_entropy);
+
+ uint8_t encap_entropy[HRSS_ENCAP_BYTES];
+ for (unsigned i = 0; i < sizeof(encap_entropy); i++) {
+ encap_entropy[i] = i;
+ }
+
+ uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES];
+ uint8_t shared_key[HRSS_KEY_BYTES];
+ HRSS_encap(ciphertext, shared_key, &pub, encap_entropy);
+
+ HRSS_public_key pub2;
+ uint8_t pub_bytes[HRSS_PUBLIC_KEY_BYTES];
+ HRSS_marshal_public_key(pub_bytes, &pub);
+ ASSERT_TRUE(HRSS_parse_public_key(&pub2, pub_bytes));
+
+ uint8_t shared_key2[HRSS_KEY_BYTES];
+ HRSS_decap(shared_key2, &pub2, &priv, ciphertext, sizeof(ciphertext));
+
+ EXPECT_EQ(Bytes(shared_key), Bytes(shared_key2));
+}
+
+TEST(HRSS, Random) {
+ for (unsigned i = 0; i < 10; i++) {
+ uint8_t generate_key_entropy[HRSS_GENERATE_KEY_BYTES];
+ RAND_bytes(generate_key_entropy, sizeof(generate_key_entropy));
+ SCOPED_TRACE(Bytes(generate_key_entropy));
+
+ HRSS_public_key pub;
+ HRSS_private_key priv;
+ HRSS_generate_key(&pub, &priv, generate_key_entropy);
+
+ for (unsigned j = 0; j < 10; j++) {
+ uint8_t encap_entropy[HRSS_ENCAP_BYTES];
+ RAND_bytes(encap_entropy, sizeof(encap_entropy));
+      SCOPED_TRACE(Bytes(encap_entropy));
+
+ uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES];
+ uint8_t shared_key[HRSS_KEY_BYTES];
+ HRSS_encap(ciphertext, shared_key, &pub, encap_entropy);
+
+ uint8_t shared_key2[HRSS_KEY_BYTES];
+ HRSS_decap(shared_key2, &pub, &priv, ciphertext, sizeof(ciphertext));
+
+ EXPECT_EQ(Bytes(shared_key), Bytes(shared_key2));
+ }
+ }
+}
+
+TEST(HRSS, Golden) {
+ uint8_t generate_key_entropy[HRSS_GENERATE_KEY_BYTES];
+ for (unsigned i = 0; i < HRSS_SAMPLE_BYTES; i++) {
+ generate_key_entropy[i] = i;
+ }
+ for (unsigned i = HRSS_SAMPLE_BYTES; i < 2 * HRSS_SAMPLE_BYTES; i++) {
+ generate_key_entropy[i] = 2 + i;
+ }
+ for (unsigned i = 2 * HRSS_SAMPLE_BYTES; i < sizeof(generate_key_entropy);
+ i++) {
+ generate_key_entropy[i] = 4 + i;
+ }
+
+ HRSS_public_key pub;
+ HRSS_private_key priv;
+ OPENSSL_memset(&pub, 0, sizeof(pub));
+ OPENSSL_memset(&priv, 0, sizeof(priv));
+ HRSS_generate_key(&pub, &priv, generate_key_entropy);
+
+ static const uint8_t kExpectedPub[HRSS_PUBLIC_KEY_BYTES] = {
+ 0xf8, 0x9f, 0xa0, 0xfc, 0xf1, 0xd4, 0xfa, 0x4d, 0x8f, 0x35, 0x28, 0x73,
+ 0x0e, 0x37, 0x18, 0x1d, 0x09, 0xf3, 0x9e, 0x16, 0x0d, 0x7f, 0x9c, 0x82,
+ 0x17, 0xa1, 0xa1, 0x88, 0x6b, 0x29, 0x5b, 0x3a, 0x30, 0xcd, 0x6f, 0x8e,
+ 0x0c, 0xd3, 0x38, 0x0c, 0x05, 0x68, 0x6e, 0x4c, 0xcc, 0x20, 0xd4, 0x06,
+ 0x77, 0x0c, 0xac, 0x1c, 0x49, 0x14, 0x00, 0xd6, 0x9b, 0x1c, 0xde, 0x43,
+ 0x0a, 0x59, 0x37, 0xd6, 0x46, 0x68, 0x1f, 0x04, 0xcb, 0x73, 0x92, 0x37,
+ 0x2d, 0x7f, 0x57, 0x70, 0x16, 0xe8, 0x06, 0x48, 0x3b, 0x66, 0xb3, 0x63,
+ 0x02, 0x5a, 0x71, 0x46, 0xdd, 0xa4, 0xee, 0xb8, 0x78, 0x44, 0xfd, 0x9e,
+ 0xd0, 0x71, 0x16, 0x00, 0xbd, 0x01, 0x1e, 0x27, 0x2e, 0xa0, 0xc6, 0x8d,
+ 0x55, 0x89, 0x7c, 0x2a, 0x01, 0x2b, 0x1b, 0x75, 0xa2, 0xc2, 0xd1, 0x5a,
+ 0x67, 0xfa, 0xdd, 0x3b, 0x70, 0x9d, 0xdb, 0xcd, 0x73, 0x32, 0x5e, 0x24,
+ 0xb1, 0xcf, 0x23, 0xbe, 0x3c, 0x56, 0xcc, 0xbe, 0x61, 0xdb, 0xe7, 0x3c,
+ 0xc7, 0xf5, 0x09, 0xe6, 0x87, 0xa0, 0x09, 0x52, 0x9d, 0x61, 0x5b, 0xc6,
+ 0xd4, 0xc5, 0x2e, 0xc2, 0x6c, 0x87, 0x30, 0x36, 0x49, 0x6f, 0x04, 0xaa,
+ 0xb3, 0x26, 0xd5, 0x63, 0xcf, 0xd4, 0x74, 0x1e, 0xc7, 0x79, 0xb3, 0xfc,
+ 0x8c, 0x41, 0x36, 0x79, 0xaa, 0xd5, 0xba, 0x64, 0x49, 0x48, 0xdb, 0xeb,
+ 0xe8, 0x33, 0x7d, 0xbe, 0x3b, 0x67, 0xd7, 0xfd, 0x93, 0x1e, 0x80, 0x8d,
+ 0x17, 0xab, 0x6f, 0xfd, 0x1c, 0x4b, 0x2d, 0x5b, 0x90, 0xf0, 0xf0, 0x5d,
+ 0xbe, 0x8f, 0x81, 0x18, 0x29, 0x08, 0x9a, 0x47, 0x1b, 0xc2, 0x2d, 0xa2,
+ 0x22, 0x5a, 0x4f, 0xe9, 0x81, 0x64, 0xdd, 0x53, 0x2e, 0x67, 0xe5, 0x07,
+ 0x1a, 0xf0, 0x0c, 0x54, 0x9b, 0xe2, 0xf8, 0xe6, 0xb3, 0xb6, 0xe0, 0x5a,
+ 0x74, 0xfa, 0x8d, 0x9c, 0xa5, 0x7c, 0x6e, 0x73, 0xba, 0xee, 0x6e, 0x6e,
+ 0x31, 0xcb, 0x59, 0xd7, 0xfd, 0x94, 0x1c, 0x4d, 0x62, 0xc6, 0x87, 0x0b,
+ 0x38, 0x54, 0xc6, 0x35, 0xac, 0xc8, 0x8c, 0xc0, 0xd9, 0x99, 0xee, 0xfc,
+ 0xa9, 0xde, 0xc4, 0x50, 0x88, 0x8e, 0x24, 0xf6, 0xd6, 0x04, 0x54, 0x3e,
+ 0x81, 0xc4, 0x96, 0x9a, 0x40, 0xe5, 0xef, 0x8b, 0xec, 0x41, 0x50, 0x1d,
+ 0x14, 0xae, 0xa4, 0x5a, 0xac, 0xd4, 0x73, 0x31, 0xc3, 0x1d, 0xc1, 0x96,
+ 0x89, 0xd8, 0x62, 0x97, 0x60, 0x3f, 0x58, 0x2a, 0x5f, 0xcf, 0xcb, 0x26,
+ 0x99, 0x69, 0x81, 0x13, 0x9c, 0xaf, 0x17, 0x91, 0xa8, 0xeb, 0x9a, 0xf9,
+ 0xd3, 0x83, 0x47, 0x66, 0xc7, 0xf8, 0xd8, 0xe3, 0xd2, 0x7e, 0x58, 0xa9,
+ 0xf5, 0xb2, 0x03, 0xbe, 0x7e, 0xa5, 0x29, 0x9d, 0xff, 0xd1, 0xd8, 0x55,
+ 0x39, 0xc7, 0x2c, 0xce, 0x03, 0x64, 0xdc, 0x18, 0xe7, 0xb0, 0x60, 0x46,
+ 0x26, 0xeb, 0xb7, 0x61, 0x4b, 0x91, 0x2c, 0xd8, 0xa2, 0xee, 0x63, 0x2e,
+ 0x15, 0x0a, 0x58, 0x88, 0x04, 0xb1, 0xed, 0x6d, 0xf1, 0x5c, 0xc7, 0xee,
+ 0x60, 0x38, 0x26, 0xc9, 0x31, 0x7e, 0x69, 0xe4, 0xac, 0x3c, 0x72, 0x09,
+ 0x3e, 0xe6, 0x24, 0x30, 0x44, 0x6e, 0x66, 0x83, 0xb9, 0x2a, 0x22, 0xaf,
+ 0x26, 0x1e, 0xaa, 0xa3, 0xf4, 0xb1, 0xa1, 0x5c, 0xfa, 0x5f, 0x0d, 0x71,
+ 0xac, 0xe3, 0xe0, 0xc3, 0xdd, 0x4f, 0x96, 0x57, 0x8b, 0x58, 0xac, 0xe3,
+ 0x42, 0x8e, 0x47, 0x72, 0xb1, 0xe4, 0x19, 0x68, 0x3e, 0xbb, 0x19, 0x14,
+ 0xdf, 0x16, 0xb5, 0xde, 0x7f, 0x37, 0xaf, 0xd8, 0xd3, 0x3d, 0x6a, 0x16,
+ 0x1b, 0x26, 0xd3, 0xcc, 0x53, 0x82, 0x57, 0x90, 0x89, 0xc5, 0x7e, 0x6d,
+ 0x7e, 0x99, 0x5b, 0xcd, 0xd3, 0x18, 0xbb, 0x89, 0xef, 0x76, 0xbd, 0xd2,
+ 0x62, 0xf0, 0xe8, 0x25, 0x2a, 0x8d, 0xe2, 0x21, 0xea, 0xde, 0x6e, 0xa5,
+ 0xa4, 0x3d, 0x58, 0xee, 0xdf, 0x90, 0xc1, 0xa1, 0x38, 0x5d, 0x11, 0x50,
+ 0xb5, 0xac, 0x9d, 0xb4, 0xfd, 0xef, 0x53, 0xe8, 0xc0, 0x17, 0x6c, 0x4f,
+ 0x31, 0xe0, 0xcc, 0x8f, 0x80, 0x7a, 0x84, 0x14, 0xde, 0xee, 0xec, 0xdd,
+ 0x6a, 0xad, 0x29, 0x65, 0xa5, 0x72, 0xc3, 0x73, 0x5f, 0xe3, 0x6f, 0x60,
+ 0xb1, 0xfb, 0x0f, 0xaa, 0xc6, 0xda, 0x53, 0x4a, 0xb1, 0x92, 0x2a, 0xb7,
+ 0x02, 0xbe, 0xf9, 0xdf, 0x37, 0x16, 0xe7, 0x5c, 0x38, 0x0b, 0x3c, 0xe2,
+ 0xdd, 0x90, 0xb8, 0x7b, 0x48, 0x69, 0x79, 0x81, 0xc5, 0xae, 0x9a, 0x0d,
+ 0x78, 0x95, 0x52, 0x63, 0x80, 0xda, 0x46, 0x69, 0x20, 0x57, 0x9b, 0x27,
+ 0xe2, 0xe8, 0xbd, 0x2f, 0x45, 0xe6, 0x46, 0x40, 0xae, 0x50, 0xd5, 0xa2,
+ 0x53, 0x93, 0xe1, 0x99, 0xfd, 0x13, 0x7c, 0xf6, 0x22, 0xc4, 0x6c, 0xab,
+ 0xe3, 0xc9, 0x55, 0x0a, 0x16, 0x67, 0x68, 0x26, 0x6b, 0xd6, 0x7d, 0xde,
+ 0xd3, 0xae, 0x71, 0x32, 0x02, 0xf1, 0x27, 0x67, 0x47, 0x74, 0xd9, 0x40,
+ 0x35, 0x1d, 0x25, 0x72, 0x32, 0xdf, 0x75, 0xd5, 0x60, 0x26, 0xab, 0x90,
+ 0xfa, 0xeb, 0x26, 0x11, 0x4b, 0xb4, 0xc5, 0xc2, 0x3e, 0xa9, 0x23, 0x3a,
+ 0x4e, 0x6a, 0xb1, 0xbb, 0xb3, 0xea, 0xf9, 0x1e, 0xe4, 0x10, 0xf5, 0xdc,
+ 0x35, 0xde, 0xb5, 0xee, 0xf0, 0xde, 0xa1, 0x18, 0x80, 0xc7, 0x13, 0x68,
+ 0x46, 0x94, 0x0e, 0x2a, 0x8e, 0xf8, 0xe9, 0x26, 0x84, 0x42, 0x0f, 0x56,
+ 0xed, 0x67, 0x7f, 0xeb, 0x7d, 0x35, 0x07, 0x01, 0x11, 0x81, 0x8b, 0x56,
+ 0x88, 0xc6, 0x58, 0x61, 0x65, 0x3c, 0x5d, 0x9c, 0x58, 0x25, 0xd6, 0xdf,
+ 0x4e, 0x3b, 0x93, 0xbf, 0x82, 0xe1, 0x19, 0xb8, 0xda, 0xde, 0x26, 0x38,
+ 0xf2, 0xd9, 0x95, 0x24, 0x98, 0xde, 0x58, 0xf7, 0x0c, 0xe9, 0x32, 0xbb,
+ 0xcc, 0xf7, 0x92, 0x69, 0xa2, 0xf0, 0xc3, 0xfa, 0xd2, 0x31, 0x8b, 0x43,
+ 0x4e, 0x03, 0xe2, 0x13, 0x79, 0x6e, 0x73, 0x63, 0x3b, 0x45, 0xde, 0x80,
+ 0xf4, 0x26, 0xb1, 0x38, 0xed, 0x62, 0x55, 0xc6, 0x6a, 0x67, 0x00, 0x2d,
+ 0xba, 0xb2, 0xc5, 0xb6, 0x97, 0x62, 0x28, 0x64, 0x30, 0xb9, 0xfb, 0x3f,
+ 0x94, 0x03, 0x48, 0x36, 0x2c, 0x5d, 0xfd, 0x08, 0x96, 0x40, 0xd1, 0x6c,
+ 0xe5, 0xd0, 0xf8, 0x99, 0x40, 0x82, 0x87, 0xd7, 0xdc, 0x2f, 0x8b, 0xaa,
+ 0x31, 0x96, 0x0a, 0x34, 0x33, 0xa6, 0xf1, 0x84, 0x6e, 0x33, 0x73, 0xc5,
+ 0xe3, 0x26, 0xad, 0xd0, 0xcb, 0x62, 0x71, 0x82, 0xab, 0xd1, 0x82, 0x33,
+ 0xe6, 0xca, 0xd0, 0x3e, 0xf5, 0x4d, 0x12, 0x6e, 0xf1, 0x83, 0xbd, 0xdc,
+ 0x4d, 0xdf, 0x49, 0xbc, 0x63, 0xae, 0x7e, 0x59, 0xe8, 0x3c, 0x0d, 0xd6,
+ 0x1d, 0x41, 0x89, 0x72, 0x52, 0xc0, 0xae, 0xd1, 0x2f, 0x0a, 0x8a, 0xce,
+ 0x26, 0xd0, 0x3e, 0x0c, 0x71, 0x32, 0x52, 0xb2, 0xe4, 0xee, 0xa2, 0xe5,
+ 0x28, 0xb6, 0x33, 0x69, 0x97, 0x5a, 0x53, 0xdb, 0x56, 0x63, 0xe9, 0xb3,
+ 0x6d, 0x60, 0xf4, 0x7a, 0xce, 0xec, 0x36, 0x65, 0xd5, 0xca, 0x63, 0x2a,
+ 0x19, 0x90, 0x14, 0x7b, 0x02, 0x33, 0xfa, 0x11, 0x58, 0x5a, 0xd9, 0xc5,
+ 0x54, 0xf3, 0x28, 0xd5, 0x6e, 0xea, 0x85, 0xf5, 0x09, 0xbb, 0x81, 0x44,
+ 0x1c, 0x63, 0x66, 0x81, 0xc5, 0x96, 0x2d, 0x7c, 0x0e, 0x75, 0x7b, 0xb4,
+ 0x7e, 0x4e, 0x0c, 0xfd, 0x3c, 0xc5, 0x5a, 0x22, 0x85, 0x5c, 0xc8, 0xf3,
+ 0x97, 0x98, 0x2c, 0xe9, 0x46, 0xb4, 0x02, 0xcf, 0x7d, 0xa4, 0xf2, 0x44,
+ 0x7a, 0x89, 0x71, 0xa0, 0xfa, 0xb6, 0xa3, 0xaf, 0x13, 0x25, 0x46, 0xe2,
+ 0x64, 0xe3, 0x69, 0xba, 0xf9, 0x68, 0x5c, 0xc0, 0xb7, 0xa8, 0xa6, 0x4b,
+ 0xe1, 0x42, 0xe9, 0xb5, 0xc7, 0x84, 0xbb, 0xa6, 0x4b, 0x10, 0x4e, 0xd4,
+ 0x68, 0x70, 0x0a, 0x75, 0x2a, 0xbb, 0x9d, 0xa0, 0xcb, 0xf0, 0x36, 0x4c,
+ 0x70, 0x6c, 0x60, 0x4d, 0xfe, 0xe8, 0xc8, 0x66, 0x80, 0x1b, 0xf7, 0xcc,
+ 0x1a, 0xdd, 0x6b, 0xa7, 0xa7, 0x25, 0x61, 0x0c, 0x31, 0xf0, 0x34, 0x63,
+ 0x00, 0x0e, 0x48, 0x6a, 0x5a, 0x8d, 0x47, 0x94, 0x3f, 0x14, 0x16, 0xa8,
+ 0x8a, 0x49, 0xbb, 0x0c, 0x43, 0x21, 0xda, 0xf2, 0xc5, 0xd0, 0xff, 0x19,
+ 0x3e, 0x36, 0x64, 0x20, 0xb3, 0x70, 0xae, 0x54, 0xca, 0x73, 0x05, 0x56,
+ 0x7a, 0x49, 0x45, 0xe9, 0x46, 0xbc, 0xc2, 0x61, 0x70, 0x40, 0x7c, 0xb0,
+ 0xf7, 0xea, 0xc0, 0xd1, 0xb0, 0x77, 0x2c, 0xc7, 0xdd, 0x88, 0xcb, 0x9d,
+ 0xea, 0x55, 0x6c, 0x5c, 0x28, 0xb8, 0x84, 0x1c, 0x2c, 0x06,
+ };
+ uint8_t pub_bytes[HRSS_PUBLIC_KEY_BYTES];
+ HRSS_marshal_public_key(pub_bytes, &pub);
+ EXPECT_EQ(Bytes(pub_bytes), Bytes(kExpectedPub));
+
+ uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES];
+ uint8_t shared_key[HRSS_KEY_BYTES];
+  OPENSSL_STATIC_ASSERT(
+      sizeof(kExpectedPub) >= HRSS_ENCAP_BYTES,
+      "Public key too small to use as input to HRSS encapsulation");
+ HRSS_encap(ciphertext, shared_key, &pub, kExpectedPub);
+
+ static const uint8_t kExpectedCiphertext[HRSS_CIPHERTEXT_BYTES] = {
+ 0x8e, 0x6b, 0x46, 0x9d, 0x4a, 0xef, 0xa6, 0x8c, 0x28, 0x7b, 0xec, 0x6f,
+ 0x13, 0x2d, 0x7f, 0x6c, 0xca, 0x7d, 0x9e, 0x6b, 0x54, 0x62, 0xa3, 0x13,
+ 0xe1, 0x1e, 0x8f, 0x5f, 0x71, 0x67, 0xc4, 0x85, 0xdf, 0xd5, 0x6b, 0xbd,
+ 0x86, 0x0f, 0x98, 0xec, 0xa5, 0x04, 0xf7, 0x7b, 0x2a, 0xbe, 0xcb, 0xac,
+ 0x29, 0xbe, 0xe1, 0x0f, 0xbc, 0x62, 0x87, 0x85, 0x7f, 0x05, 0xae, 0xe4,
+ 0x3f, 0x87, 0xfc, 0x1f, 0xf7, 0x45, 0x1e, 0xa3, 0xdb, 0xb1, 0xa0, 0x25,
+ 0xba, 0x82, 0xec, 0xca, 0x8d, 0xab, 0x7a, 0x20, 0x03, 0xeb, 0xe5, 0x5c,
+ 0x9f, 0xd0, 0x46, 0x78, 0xf1, 0x5a, 0xc7, 0x9e, 0xb4, 0x10, 0x6d, 0x37,
+ 0xc0, 0x75, 0x08, 0xfb, 0xeb, 0xcb, 0xd8, 0x35, 0x21, 0x9b, 0x89, 0xa0,
+ 0xaa, 0x87, 0x00, 0x66, 0x38, 0x37, 0x68, 0xa4, 0xa3, 0x93, 0x8e, 0x2b,
+ 0xca, 0xf7, 0x7a, 0x43, 0xb2, 0x15, 0x79, 0x81, 0xce, 0xa9, 0x09, 0xcb,
+ 0x29, 0xd4, 0xcc, 0xef, 0xf1, 0x9b, 0xbd, 0xe6, 0x63, 0xd5, 0x26, 0x0f,
+ 0xe8, 0x8b, 0xdf, 0xf1, 0xc3, 0xb4, 0x18, 0x0e, 0xf2, 0x1d, 0x5d, 0x82,
+ 0x9b, 0x1f, 0xf3, 0xca, 0x36, 0x2a, 0x26, 0x0a, 0x7f, 0xc4, 0x0d, 0xbd,
+ 0x5b, 0x15, 0x1c, 0x18, 0x6c, 0x11, 0x4e, 0xec, 0x36, 0x01, 0xc1, 0x15,
+ 0xab, 0xf7, 0x0b, 0x1a, 0xd3, 0xa1, 0xbd, 0x68, 0xc8, 0x59, 0xe7, 0x49,
+ 0x5c, 0xd5, 0x4b, 0x8c, 0x31, 0xdb, 0xb3, 0xea, 0x88, 0x09, 0x2f, 0xb9,
+ 0x8b, 0xfd, 0x96, 0x35, 0x88, 0x53, 0x72, 0x40, 0xcd, 0x89, 0x75, 0xb4,
+ 0x20, 0xf6, 0xf6, 0xe5, 0x74, 0x19, 0x48, 0xaf, 0x4b, 0xaa, 0x42, 0xa4,
+ 0xc8, 0x90, 0xee, 0xf3, 0x12, 0x04, 0x63, 0x90, 0x92, 0x8a, 0x89, 0xc3,
+ 0xa0, 0x7e, 0xfe, 0x19, 0xb3, 0x54, 0x53, 0x83, 0xe9, 0xc1, 0x6c, 0xe3,
+ 0x97, 0xa6, 0x27, 0xc3, 0x20, 0x9a, 0x79, 0x35, 0xc9, 0xb5, 0xc0, 0x90,
+ 0xe1, 0x56, 0x84, 0x69, 0xc2, 0x54, 0x77, 0x52, 0x48, 0x55, 0x71, 0x3e,
+ 0xcd, 0xa7, 0xd6, 0x25, 0x5d, 0x49, 0x13, 0xd2, 0x59, 0xd7, 0xe1, 0xd1,
+ 0x70, 0x46, 0xa0, 0xd4, 0xee, 0x59, 0x13, 0x1f, 0x1a, 0xd3, 0x39, 0x7d,
+ 0xb0, 0x79, 0xf7, 0xc0, 0x73, 0x5e, 0xbb, 0x08, 0xf7, 0x5c, 0xb0, 0x31,
+ 0x41, 0x3d, 0x7b, 0x1e, 0xf0, 0xe6, 0x47, 0x5c, 0x37, 0xd5, 0x54, 0xf1,
+ 0xbb, 0x64, 0xd7, 0x41, 0x8b, 0x34, 0x55, 0xaa, 0xc3, 0x5a, 0x9c, 0xa0,
+ 0xcc, 0x29, 0x8e, 0x5a, 0x1a, 0x93, 0x5a, 0x49, 0xd3, 0xd0, 0xa0, 0x56,
+ 0xda, 0x32, 0xa2, 0xa9, 0xa7, 0x13, 0x42, 0x93, 0x9b, 0x20, 0x32, 0x37,
+ 0x5c, 0x3e, 0x03, 0xa5, 0x28, 0x10, 0x93, 0xdd, 0xa0, 0x04, 0x7b, 0x2a,
+ 0xbd, 0x31, 0xc3, 0x6a, 0x89, 0x58, 0x6e, 0x55, 0x0e, 0xc9, 0x5c, 0x70,
+ 0x07, 0x10, 0xf1, 0x9a, 0xbd, 0xfb, 0xd2, 0xb7, 0x94, 0x5b, 0x4f, 0x8d,
+ 0x90, 0xfa, 0xee, 0xae, 0x37, 0x48, 0xc5, 0xf8, 0x16, 0xa1, 0x3b, 0x70,
+ 0x03, 0x1f, 0x0e, 0xb8, 0xbd, 0x8d, 0x30, 0x4f, 0x95, 0x31, 0x0b, 0x9f,
+ 0xfc, 0x80, 0xf8, 0xef, 0xa3, 0x3c, 0xbc, 0xe2, 0x23, 0x23, 0x3e, 0x2a,
+ 0x55, 0x11, 0xe8, 0x2c, 0x17, 0xea, 0x1c, 0xbd, 0x1d, 0x2d, 0x1b, 0xd5,
+ 0x16, 0x9e, 0x05, 0xfc, 0x89, 0x64, 0x50, 0x4d, 0x9a, 0x22, 0x50, 0xc6,
+ 0x5a, 0xd9, 0x58, 0x99, 0x8f, 0xbd, 0xf2, 0x4f, 0x2c, 0xdb, 0x51, 0x6a,
+ 0x86, 0xe2, 0xc6, 0x64, 0x8f, 0x54, 0x1a, 0xf2, 0xcb, 0x34, 0x88, 0x08,
+ 0xbd, 0x2a, 0x8f, 0xec, 0x29, 0xf5, 0x22, 0x36, 0x83, 0x99, 0xb9, 0x71,
+ 0x8c, 0x99, 0x5c, 0xec, 0x91, 0x78, 0xc1, 0xe2, 0x2d, 0xe9, 0xd1, 0x4d,
+ 0xf5, 0x15, 0x93, 0x4d, 0x93, 0x92, 0x9f, 0x0f, 0x33, 0x5e, 0xcd, 0x58,
+ 0x5f, 0x3d, 0x52, 0xb9, 0x38, 0x6a, 0x85, 0x63, 0x8b, 0x63, 0x29, 0xcb,
+ 0x67, 0x12, 0x25, 0xc2, 0x44, 0xd7, 0xab, 0x1a, 0x24, 0xca, 0x3d, 0xca,
+ 0x77, 0xce, 0x28, 0x68, 0x1a, 0x91, 0xed, 0x7b, 0xc9, 0x70, 0x84, 0xab,
+ 0xe2, 0xd4, 0xf4, 0xac, 0x58, 0xf6, 0x70, 0x99, 0xfc, 0x99, 0x4d, 0xbd,
+ 0xb4, 0x1b, 0x4f, 0x15, 0x86, 0x95, 0x08, 0xd1, 0x4e, 0x73, 0xa9, 0xbc,
+ 0x6a, 0x8c, 0xbc, 0xb5, 0x4b, 0xe0, 0xee, 0x35, 0x24, 0xf9, 0x12, 0xf5,
+ 0x88, 0x70, 0x50, 0x6c, 0xfe, 0x0d, 0x35, 0xbd, 0xf7, 0xc4, 0x2e, 0x39,
+ 0x16, 0x30, 0x6c, 0xf3, 0xb2, 0x19, 0x44, 0xaa, 0xcb, 0x4a, 0xf6, 0x75,
+ 0xb7, 0x09, 0xb9, 0xe1, 0x47, 0x71, 0x70, 0x5c, 0x05, 0x5f, 0x50, 0x50,
+ 0x9c, 0xd0, 0xe3, 0xc7, 0x91, 0xee, 0x6b, 0xc7, 0x0f, 0x71, 0x1b, 0xc3,
+ 0x48, 0x8b, 0xed, 0x15, 0x26, 0x8c, 0xc3, 0xd5, 0x54, 0x08, 0xcc, 0x33,
+ 0x79, 0xc0, 0x9f, 0x49, 0xc8, 0x75, 0xef, 0xb6, 0xf3, 0x29, 0x89, 0xfd,
+ 0x75, 0xd1, 0xda, 0x92, 0xc3, 0x13, 0xc6, 0x76, 0x51, 0x11, 0x40, 0x7b,
+ 0x82, 0xf7, 0x30, 0x79, 0x49, 0x04, 0xe3, 0xbb, 0x61, 0x34, 0xa6, 0x58,
+ 0x0b, 0x7d, 0xef, 0x3e, 0xf9, 0xb3, 0x8d, 0x2a, 0xba, 0xe9, 0xbc, 0xc0,
+ 0xa7, 0xe6, 0x6c, 0xda, 0xf8, 0x8c, 0xdf, 0x8d, 0x96, 0x83, 0x2d, 0x80,
+ 0x4f, 0x21, 0x81, 0xde, 0x57, 0x9d, 0x0a, 0x3c, 0xcc, 0xec, 0x3b, 0xb2,
+ 0x25, 0x96, 0x3c, 0xea, 0xfd, 0x46, 0x26, 0xbe, 0x1c, 0x79, 0x82, 0x1d,
+ 0xe0, 0x14, 0x22, 0x7c, 0x80, 0x3d, 0xbd, 0x05, 0x90, 0xfa, 0xaf, 0x7d,
+ 0x70, 0x13, 0x43, 0x0f, 0x3d, 0xa0, 0x7f, 0x92, 0x3a, 0x53, 0x69, 0xe4,
+ 0xb0, 0x10, 0x0d, 0xa7, 0x73, 0xa8, 0x8c, 0x74, 0xab, 0xd7, 0x78, 0x15,
+ 0x45, 0xec, 0x6e, 0xc8, 0x8b, 0xa0, 0xba, 0x21, 0x6f, 0xf3, 0x08, 0xb8,
+ 0xc7, 0x4f, 0x14, 0xf5, 0xcc, 0xfd, 0x39, 0xbc, 0x11, 0xf5, 0xb9, 0x11,
+ 0xba, 0xf3, 0x11, 0x24, 0x74, 0x3e, 0x0c, 0x07, 0x4f, 0xac, 0x2a, 0xb2,
+ 0xb1, 0x3c, 0x00, 0xfa, 0xbb, 0x8c, 0xd8, 0x7d, 0x17, 0x5b, 0x8d, 0x39,
+ 0xc6, 0x23, 0x31, 0x32, 0x7d, 0x6e, 0x20, 0x38, 0xd0, 0xc3, 0x58, 0xe2,
+ 0xb1, 0xfe, 0x53, 0x6b, 0xc7, 0x10, 0x13, 0x7e, 0xc6, 0x7c, 0x67, 0x59,
+ 0x43, 0x70, 0x4a, 0x2d, 0x7f, 0x76, 0xde, 0xbd, 0x45, 0x43, 0x56, 0x60,
+ 0xcd, 0xe9, 0x24, 0x7b, 0xb7, 0x41, 0xce, 0x56, 0xed, 0xd3, 0x74, 0x75,
+ 0xcc, 0x9d, 0x48, 0x61, 0xc8, 0x19, 0x66, 0x08, 0xfb, 0x28, 0x60, 0x1f,
+ 0x83, 0x11, 0xc0, 0x9b, 0xbd, 0x71, 0x53, 0x36, 0x01, 0x76, 0xa8, 0xc0,
+ 0xdc, 0x1d, 0x18, 0x85, 0x19, 0x65, 0xce, 0xcf, 0x14, 0x2e, 0x6c, 0x32,
+ 0x15, 0xbc, 0x2c, 0x5e, 0x8f, 0xfc, 0x3c, 0xf0, 0x2d, 0xf5, 0x5c, 0x04,
+ 0xc9, 0x22, 0xf4, 0xc3, 0xb8, 0x57, 0x79, 0x52, 0x41, 0xfd, 0xff, 0xcd,
+ 0x26, 0xa8, 0xc0, 0xd2, 0xe1, 0x71, 0xd6, 0xf1, 0xf4, 0x0c, 0xa8, 0xeb,
+ 0x0c, 0x33, 0x40, 0x25, 0x73, 0xbb, 0x31, 0xda, 0x0c, 0xa6, 0xee, 0x0c,
+ 0x41, 0x51, 0x94, 0x3c, 0x24, 0x27, 0x65, 0xe9, 0xb5, 0xc4, 0xe2, 0x88,
+ 0xc0, 0x82, 0xd0, 0x72, 0xd9, 0x10, 0x4d, 0x7f, 0xc0, 0x88, 0x94, 0x41,
+ 0x2d, 0x05, 0x09, 0xfb, 0x97, 0x31, 0x6e, 0xc1, 0xe9, 0xf4, 0x50, 0x70,
+ 0xdc, 0x3f, 0x0a, 0x90, 0x46, 0x37, 0x60, 0x8c, 0xfb, 0x06, 0x6e, 0xde,
+ 0x6f, 0xa7, 0x6b, 0xa3, 0x88, 0x18, 0x96, 0x93, 0x19, 0x87, 0xe7, 0x0a,
+ 0x98, 0xf0, 0x13, 0x01, 0xab, 0x7c, 0xeb, 0x25, 0xa5, 0xe2, 0x98, 0x44,
+ 0x7d, 0x09, 0xe2, 0x42, 0x33, 0xd4, 0xeb, 0xcc, 0x9b, 0x70, 0xf6, 0x0f,
+ 0xf0, 0xb2, 0x99, 0xcc, 0x4f, 0x64, 0xc4, 0x69, 0x12, 0xea, 0x56, 0xfe,
+ 0x50, 0x0e, 0x02, 0x1f, 0x6d, 0x7a, 0x79, 0x62, 0xaa, 0x2e, 0x52, 0xaf,
+ 0xa3, 0xed, 0xcd, 0xa7, 0x45, 0xe6, 0x86, 0xed, 0xa1, 0x73, 0x5b, 0x1e,
+ 0x49, 0x4f, 0x92, 0x50, 0x83, 0x99, 0x3c, 0xf4, 0xf6, 0xa8, 0x49, 0xd7,
+ 0x08, 0xf7, 0xdc, 0x28, 0x2c, 0xe6, 0x22, 0x6f, 0xf8, 0xfa, 0xba, 0x9e,
+ 0x0a, 0xcf, 0x72, 0x74, 0x76, 0x75, 0x99, 0x4d, 0x3d, 0x9a, 0x4c, 0x54,
+ 0xcd, 0xf8, 0x54, 0xf0, 0xbd, 0x73, 0xe9, 0x4f, 0x29, 0xd0, 0xe1, 0x24,
+ 0x94, 0x52, 0xd6, 0x60, 0x80, 0x71, 0x24, 0x95, 0x92, 0x01, 0x0e, 0xa9,
+ 0x7e, 0x64, 0x2e, 0xed, 0x51, 0xcc, 0xd2, 0xff, 0xfd, 0x0b,
+ };
+ EXPECT_EQ(Bytes(ciphertext), Bytes(kExpectedCiphertext));
+
+ static const uint8_t kExpectedSharedKey[HRSS_KEY_BYTES] = {
+ 0xbc, 0x98, 0x9c, 0x9c, 0x1f, 0x57, 0x6f, 0x38, 0x0b, 0x5d, 0xc2,
+ 0x23, 0x7d, 0x01, 0xae, 0x63, 0x17, 0xe8, 0xe4, 0xb2, 0x02, 0xa7,
+ 0xc4, 0x3a, 0x1b, 0x5a, 0xf3, 0xf8, 0xb5, 0xea, 0x6e, 0x22,
+ };
+ EXPECT_EQ(Bytes(shared_key), Bytes(kExpectedSharedKey));
+
+ HRSS_decap(shared_key, &pub, &priv, ciphertext, sizeof(ciphertext));
+ EXPECT_EQ(Bytes(shared_key, sizeof(shared_key)),
+ Bytes(kExpectedSharedKey, sizeof(kExpectedSharedKey)));
+
+ // Corrupt the ciphertext and ensure that the failure key is constant.
+ ciphertext[50] ^= 4;
+ HRSS_decap(shared_key, &pub, &priv, ciphertext, sizeof(ciphertext));
+
+ static const uint8_t kExpectedFailureKey[HRSS_KEY_BYTES] = {
+ 0x8e, 0x19, 0xfe, 0x2b, 0x12, 0x67, 0xef, 0x9a, 0x63, 0x4d, 0x79,
+ 0x33, 0x8c, 0xce, 0xbf, 0x03, 0xdb, 0x9c, 0xc4, 0xc1, 0x70, 0xe1,
+ 0x32, 0xa6, 0xb3, 0xd3, 0xa1, 0x43, 0x3c, 0xf1, 0x1f, 0x5a,
+ };
+ EXPECT_EQ(Bytes(shared_key), Bytes(kExpectedFailureKey));
+}
diff --git a/src/crypto/hrss/internal.h b/src/crypto/hrss/internal.h
new file mode 100644
index 00000000..70218b88
--- /dev/null
+++ b/src/crypto/hrss/internal.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#ifndef OPENSSL_HEADER_HRSS_INTERNAL_H
+#define OPENSSL_HEADER_HRSS_INTERNAL_H
+
+#include <openssl/base.h>
+#include "../internal.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+#define N 701
+#define BITS_PER_WORD (sizeof(crypto_word_t) * 8)
+#define WORDS_PER_POLY ((N + BITS_PER_WORD - 1) / BITS_PER_WORD)
+#define BITS_IN_LAST_WORD (N % BITS_PER_WORD)
+
+struct poly2 {
+ crypto_word_t v[WORDS_PER_POLY];
+};
+
+struct poly3 {
+ struct poly2 s, a;
+};
+
+OPENSSL_EXPORT void HRSS_poly2_rotr_consttime(struct poly2 *p, size_t bits);
+OPENSSL_EXPORT void HRSS_poly3_mul(struct poly3 *out, const struct poly3 *x,
+ const struct poly3 *y);
+OPENSSL_EXPORT void HRSS_poly3_invert(struct poly3 *out,
+ const struct poly3 *in);
+
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+#endif // !OPENSSL_HEADER_HRSS_INTERNAL_H
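
For reference, on a 64-bit build where crypto_word_t is assumed to be eight bytes, the macros above work out as in this illustrative C++ sketch (not part of the header):

// Sketch only: assumes a 64-bit crypto_word_t, so BITS_PER_WORD == 64.
static_assert((701 + 64 - 1) / 64 == 11, "WORDS_PER_POLY would be 11");
static_assert(701 % 64 == 61, "BITS_IN_LAST_WORD would be 61");
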
diff --git a/src/crypto/obj/obj_dat.h b/src/crypto/obj/obj_dat.h
index 0f5a3fa0..0313a08a 100644
--- a/src/crypto/obj/obj_dat.h
+++ b/src/crypto/obj/obj_dat.h
@@ -57,7 +57,7 @@
/* This file is generated by crypto/obj/objects.go. */
-#define NUM_NID 959
+#define NUM_NID 960
static const uint8_t kObjectData[] = {
/* NID_rsadsi */
@@ -8755,6 +8755,7 @@ static const ASN1_OBJECT kObjects[NUM_NID] = {
{"AuthPSK", "auth-psk", NID_auth_psk, 0, NULL, 0},
{"KxANY", "kx-any", NID_kx_any, 0, NULL, 0},
{"AuthANY", "auth-any", NID_auth_any, 0, NULL, 0},
+ {"CECPQ2", "CECPQ2", NID_CECPQ2, 0, NULL, 0},
};
static const unsigned kNIDsInShortNameOrder[] = {
@@ -8816,6 +8817,7 @@ static const unsigned kNIDsInShortNameOrder[] = {
110 /* CAST5-CFB */,
109 /* CAST5-ECB */,
111 /* CAST5-OFB */,
+ 959 /* CECPQ2 */,
894 /* CMAC */,
13 /* CN */,
141 /* CRLReason */,
@@ -9720,6 +9722,7 @@ static const unsigned kNIDsInLongNameOrder[] = {
285 /* Biometric Info */,
179 /* CA Issuers */,
785 /* CA Repository */,
+ 959 /* CECPQ2 */,
131 /* Code Signing */,
783 /* Diffie-Hellman based MAC */,
382 /* Directory */,
diff --git a/src/crypto/obj/obj_mac.num b/src/crypto/obj/obj_mac.num
index 6dbc0f13..5fa839d2 100644
--- a/src/crypto/obj/obj_mac.num
+++ b/src/crypto/obj/obj_mac.num
@@ -947,3 +947,4 @@ auth_ecdsa 955
auth_psk 956
kx_any 957
auth_any 958
+CECPQ2 959
diff --git a/src/crypto/obj/objects.txt b/src/crypto/obj/objects.txt
index 0c48e3c0..6dbb7ad7 100644
--- a/src/crypto/obj/objects.txt
+++ b/src/crypto/obj/objects.txt
@@ -559,7 +559,7 @@ id-cmc 19 : id-cmc-responseInfo
id-cmc 21 : id-cmc-queryPending
id-cmc 22 : id-cmc-popLinkRandom
id-cmc 23 : id-cmc-popLinkWitness
-id-cmc 24 : id-cmc-confirmCertAcceptance
+id-cmc 24 : id-cmc-confirmCertAcceptance
# other names
id-on 1 : id-on-personalData
@@ -1239,7 +1239,7 @@ cryptocom 1 8 1 : id-GostR3410-2001-ParamSet-cc : GOST R 3410-2001 Parameter Se
# Definitions for Camellia cipher - ECB, CFB, OFB MODE
!Alias ntt-ds 0 3 4401 5
-!Alias camellia ntt-ds 3 1 9
+!Alias camellia ntt-ds 3 1 9
camellia 1 : CAMELLIA-128-ECB : camellia-128-ecb
!Cname camellia-128-ofb128
@@ -1310,7 +1310,7 @@ ISO-US 10046 2 1 : dhpublicnumber : X9.42 DH
1 3 36 3 3 2 8 1 1 11 : brainpoolP384r1
1 3 36 3 3 2 8 1 1 12 : brainpoolP384t1
1 3 36 3 3 2 8 1 1 13 : brainpoolP512r1
-1 3 36 3 3 2 8 1 1 14 : brainpoolP512t1
+1 3 36 3 3 2 8 1 1 14 : brainpoolP512t1
# ECDH schemes from RFC5753
!Alias x9-63-scheme 1 3 133 16 840 63 0
@@ -1334,6 +1334,9 @@ secg-scheme 14 3 : dhSinglePass-cofactorDH-sha512kdf-scheme
# NID for X25519 (no corresponding OID).
: X25519
+# NID for CECPQ2 (no corresponding OID).
+ : CECPQ2
+
# See RFC 8410.
1 3 101 112 : ED25519
diff --git a/src/crypto/thread_win.c b/src/crypto/thread_win.c
index 45011650..c8e19f51 100644
--- a/src/crypto/thread_win.c
+++ b/src/crypto/thread_win.c
@@ -82,7 +82,7 @@ void CRYPTO_STATIC_MUTEX_unlock_write(struct CRYPTO_STATIC_MUTEX *lock) {
ReleaseSRWLockExclusive(&lock->lock);
}
-static CRITICAL_SECTION g_destructors_lock;
+static SRWLOCK g_destructors_lock = SRWLOCK_INIT;
static thread_local_destructor_t g_destructors[NUM_OPENSSL_THREAD_LOCALS];
static CRYPTO_once_t g_thread_local_init_once = CRYPTO_ONCE_INIT;
@@ -90,10 +90,6 @@ static DWORD g_thread_local_key;
static int g_thread_local_failed;
static void thread_local_init(void) {
- if (!InitializeCriticalSectionAndSpinCount(&g_destructors_lock, 0x400)) {
- g_thread_local_failed = 1;
- return;
- }
g_thread_local_key = TlsAlloc();
g_thread_local_failed = (g_thread_local_key == TLS_OUT_OF_INDEXES);
}
@@ -121,12 +117,11 @@ static void NTAPI thread_local_destructor(PVOID module, DWORD reason,
thread_local_destructor_t destructors[NUM_OPENSSL_THREAD_LOCALS];
- EnterCriticalSection(&g_destructors_lock);
+ AcquireSRWLockExclusive(&g_destructors_lock);
OPENSSL_memcpy(destructors, g_destructors, sizeof(destructors));
- LeaveCriticalSection(&g_destructors_lock);
+ ReleaseSRWLockExclusive(&g_destructors_lock);
- unsigned i;
- for (i = 0; i < NUM_OPENSSL_THREAD_LOCALS; i++) {
+ for (unsigned i = 0; i < NUM_OPENSSL_THREAD_LOCALS; i++) {
if (destructors[i] != NULL) {
destructors[i](pointers[i]);
}
@@ -250,9 +245,9 @@ int CRYPTO_set_thread_local(thread_local_data_t index, void *value,
}
}
- EnterCriticalSection(&g_destructors_lock);
+ AcquireSRWLockExclusive(&g_destructors_lock);
g_destructors[index] = destructor;
- LeaveCriticalSection(&g_destructors_lock);
+ ReleaseSRWLockExclusive(&g_destructors_lock);
pointers[index] = value;
return 1;
diff --git a/src/crypto/x509/x509_test.cc b/src/crypto/x509/x509_test.cc
index c42a7c82..a53ed7a6 100644
--- a/src/crypto/x509/x509_test.cc
+++ b/src/crypto/x509/x509_test.cc
@@ -12,6 +12,7 @@
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+#include <algorithm>
#include <functional>
#include <string>
#include <vector>
@@ -1684,3 +1685,62 @@ TEST(X509Test, ReadBIOEmpty) {
EXPECT_EQ(ERR_LIB_ASN1, ERR_GET_LIB(err));
EXPECT_EQ(ASN1_R_HEADER_TOO_LONG, ERR_GET_REASON(err));
}
+
+TEST(X509Test, ReadBIOOneByte) {
+ bssl::UniquePtr<BIO> bio(BIO_new_mem_buf("\x30", 1));
+ ASSERT_TRUE(bio);
+
+ // CPython expects |ASN1_R_HEADER_TOO_LONG| on EOF, to terminate a series of
+ // certificates. This EOF appeared after some data, however, so we do not wish
+ // to signal EOF.
+ bssl::UniquePtr<X509> x509(d2i_X509_bio(bio.get(), nullptr));
+ EXPECT_FALSE(x509);
+ uint32_t err = ERR_get_error();
+ EXPECT_EQ(ERR_LIB_ASN1, ERR_GET_LIB(err));
+ EXPECT_EQ(ASN1_R_NOT_ENOUGH_DATA, ERR_GET_REASON(err));
+}
+
+TEST(X509Test, PartialBIOReturn) {
+ // Create a filter BIO that only reads and writes one byte at a time.
+ bssl::UniquePtr<BIO_METHOD> method(BIO_meth_new(0, nullptr));
+ ASSERT_TRUE(method);
+ ASSERT_TRUE(BIO_meth_set_create(method.get(), [](BIO *b) -> int {
+ BIO_set_init(b, 1);
+ return 1;
+ }));
+ ASSERT_TRUE(
+ BIO_meth_set_read(method.get(), [](BIO *b, char *out, int len) -> int {
+ return BIO_read(BIO_next(b), out, std::min(len, 1));
+ }));
+ ASSERT_TRUE(BIO_meth_set_write(
+ method.get(), [](BIO *b, const char *in, int len) -> int {
+ return BIO_write(BIO_next(b), in, std::min(len, 1));
+ }));
+
+ bssl::UniquePtr<BIO> bio(BIO_new(method.get()));
+ ASSERT_TRUE(bio);
+ BIO *mem_bio = BIO_new(BIO_s_mem());
+ ASSERT_TRUE(mem_bio);
+ BIO_push(bio.get(), mem_bio); // BIO_push takes ownership.
+
+ bssl::UniquePtr<X509> cert(CertFromPEM(kLeafPEM));
+ ASSERT_TRUE(cert);
+ uint8_t *der = nullptr;
+ int der_len = i2d_X509(cert.get(), &der);
+ ASSERT_GT(der_len, 0);
+ bssl::UniquePtr<uint8_t> free_der(der);
+
+ // Write the certificate into the BIO. Though we only write one byte at a
+ // time, the write should succeed.
+ ASSERT_EQ(1, i2d_X509_bio(bio.get(), cert.get()));
+ const uint8_t *der2;
+ size_t der2_len;
+ ASSERT_TRUE(BIO_mem_contents(mem_bio, &der2, &der2_len));
+ EXPECT_EQ(Bytes(der, static_cast<size_t>(der_len)), Bytes(der2, der2_len));
+
+ // Read the certificate back out of the BIO. Though we only read one byte at a
+ // time, the read should succeed.
+ bssl::UniquePtr<X509> cert2(d2i_X509_bio(bio.get(), nullptr));
+ ASSERT_TRUE(cert2);
+ EXPECT_EQ(0, X509_cmp(cert.get(), cert2.get()));
+}
diff --git a/src/include/openssl/bio.h b/src/include/openssl/bio.h
index 8e2db65f..da0dcdfe 100644
--- a/src/include/openssl/bio.h
+++ b/src/include/openssl/bio.h
@@ -904,6 +904,7 @@ BSSL_NAMESPACE_BEGIN
BORINGSSL_MAKE_DELETER(BIO, BIO_free)
BORINGSSL_MAKE_UP_REF(BIO, BIO_up_ref)
+BORINGSSL_MAKE_DELETER(BIO_METHOD, BIO_meth_free)
BSSL_NAMESPACE_END
diff --git a/src/include/openssl/bn.h b/src/include/openssl/bn.h
index c895cc14..c198f4df 100644
--- a/src/include/openssl/bn.h
+++ b/src/include/openssl/bn.h
@@ -160,7 +160,7 @@ extern "C" {
#define BN_DEC_FMT1 "%" PRIu32
#define BN_DEC_FMT2 "%09" PRIu32
#define BN_HEX_FMT1 "%" PRIx32
-#define BN_HEX_FMT2 "%08" PRIx64
+#define BN_HEX_FMT2 "%08" PRIx32
#else
#error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT"
#endif
diff --git a/src/include/openssl/hrss.h b/src/include/openssl/hrss.h
new file mode 100644
index 00000000..cc5edffb
--- /dev/null
+++ b/src/include/openssl/hrss.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#ifndef OPENSSL_HEADER_HRSS_H
+#define OPENSSL_HEADER_HRSS_H
+
+#include <openssl/base.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// HRSS
+//
+// HRSS is a structured-lattice-based post-quantum key encapsulation mechanism.
+// The best exposition is https://eprint.iacr.org/2017/667.pdf although this
+// implementation uses a different KEM construction based on
+// https://eprint.iacr.org/2017/1005.pdf.
+
+struct HRSS_private_key {
+ uint8_t opaque[1808];
+};
+
+struct HRSS_public_key {
+ uint8_t opaque[1424];
+};
+
+// HRSS_SAMPLE_BYTES is the number of bytes of entropy needed to generate a
+// short vector. There are 701 coefficients, but the final one is always set to
+// zero when sampling. Otherwise, one byte of input is enough to generate two
+// coefficients.
+#define HRSS_SAMPLE_BYTES ((701 - 1) / 2)
+// HRSS_GENERATE_KEY_BYTES is the number of bytes of entropy needed to generate
+// an HRSS key pair.
+#define HRSS_GENERATE_KEY_BYTES (HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES + 32)
+// HRSS_ENCAP_BYTES is the number of bytes of entropy needed to encapsulate a
+// session key.
+#define HRSS_ENCAP_BYTES (HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES)
+// HRSS_PUBLIC_KEY_BYTES is the number of bytes in a public key.
+#define HRSS_PUBLIC_KEY_BYTES 1138
+// HRSS_CIPHERTEXT_BYTES is the number of bytes in a ciphertext.
+#define HRSS_CIPHERTEXT_BYTES 1138
+// HRSS_KEY_BYTES is the number of bytes in a shared key.
+#define HRSS_KEY_BYTES 32
+// HRSS_POLY3_BYTES is the number of bytes needed to serialise a mod 3
+// polynomial.
+#define HRSS_POLY3_BYTES 140
+#define HRSS_PRIVATE_KEY_BYTES \
+ (HRSS_POLY3_BYTES * 2 + HRSS_PUBLIC_KEY_BYTES + 2 + 32)
+
+// HRSS_generate_key is a deterministic function that outputs a public and
+// private key based on the given entropy.
+OPENSSL_EXPORT void HRSS_generate_key(
+ struct HRSS_public_key *out_pub, struct HRSS_private_key *out_priv,
+ const uint8_t input[HRSS_GENERATE_KEY_BYTES]);
+
+// HRSS_encap is a deterministic function that generates and encrypts a random
+// session key from the given entropy, writing those values to |out_shared_key|
+// and |out_ciphertext|, respectively.
+OPENSSL_EXPORT void HRSS_encap(uint8_t out_ciphertext[HRSS_CIPHERTEXT_BYTES],
+ uint8_t out_shared_key[HRSS_KEY_BYTES],
+ const struct HRSS_public_key *in_pub,
+ const uint8_t in[HRSS_ENCAP_BYTES]);
+
+// HRSS_decap decrypts a session key from |ciphertext_len| bytes of
+// |ciphertext|. If the ciphertext is valid, the decrypted key is written to
+// |out_shared_key|. Otherwise the HMAC of |ciphertext| under a secret key (kept
+// in |in_priv|) is written. If the ciphertext is the wrong length, then which
+// of the two actions was taken will leak via side-channels. Otherwise,
+// whichever action is taken should be performed in constant time.
+OPENSSL_EXPORT void HRSS_decap(uint8_t out_shared_key[HRSS_KEY_BYTES],
+ const struct HRSS_public_key *in_pub,
+ const struct HRSS_private_key *in_priv,
+ const uint8_t *ciphertext,
+ size_t ciphertext_len);
+
+// HRSS_marshal_public_key serialises |in_pub| to |out|.
+OPENSSL_EXPORT void HRSS_marshal_public_key(
+ uint8_t out[HRSS_PUBLIC_KEY_BYTES], const struct HRSS_public_key *in_pub);
+
+// HRSS_parse_public_key sets |*out| to the public key encoded in |in|. It
+// returns one on success and zero on error.
+OPENSSL_EXPORT int HRSS_parse_public_key(
+ struct HRSS_public_key *out, const uint8_t in[HRSS_PUBLIC_KEY_BYTES]);
+
+
+#if defined(__cplusplus)
+} // extern C
+#endif
+
+#endif // OPENSSL_HEADER_HRSS_H
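
Taken together, the API above supports a simple generate/encap/decap flow; a minimal sketch follows. It is illustrative only: the helper name is invented, RAND_bytes from <openssl/rand.h> is assumed as the entropy source, and plain memcmp is used for the comparison.

#include <string.h>

#include <openssl/hrss.h>
#include <openssl/rand.h>

// Sketch: one generate/encap/decap round trip. Using RAND_bytes here is an
// assumption of this example; the HRSS_* functions themselves are
// deterministic given their entropy inputs.
static bool hrss_round_trip(void) {
  uint8_t key_entropy[HRSS_GENERATE_KEY_BYTES];
  RAND_bytes(key_entropy, sizeof(key_entropy));
  HRSS_public_key pub;
  HRSS_private_key priv;
  HRSS_generate_key(&pub, &priv, key_entropy);

  uint8_t encap_entropy[HRSS_ENCAP_BYTES];
  RAND_bytes(encap_entropy, sizeof(encap_entropy));
  uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES];
  uint8_t shared_key[HRSS_KEY_BYTES];
  HRSS_encap(ciphertext, shared_key, &pub, encap_entropy);

  // A valid ciphertext decapsulates to the same key; a corrupted one would
  // instead yield the deterministic failure key checked in the test above.
  uint8_t shared_key2[HRSS_KEY_BYTES];
  HRSS_decap(shared_key2, &pub, &priv, ciphertext, sizeof(ciphertext));
  return memcmp(shared_key, shared_key2, sizeof(shared_key)) == 0;
}
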
diff --git a/src/include/openssl/nid.h b/src/include/openssl/nid.h
index afeb2dea..270d443a 100644
--- a/src/include/openssl/nid.h
+++ b/src/include/openssl/nid.h
@@ -4234,6 +4234,9 @@ extern "C" {
#define LN_auth_any "auth-any"
#define NID_auth_any 958
+#define SN_CECPQ2 "CECPQ2"
+#define NID_CECPQ2 959
+
#if defined(__cplusplus)
} /* extern C */
diff --git a/src/include/openssl/ssl.h b/src/include/openssl/ssl.h
index 17c55925..6898674a 100644
--- a/src/include/openssl/ssl.h
+++ b/src/include/openssl/ssl.h
@@ -2177,6 +2177,7 @@ OPENSSL_EXPORT int SSL_set1_curves_list(SSL *ssl, const char *curves);
#define SSL_CURVE_SECP384R1 24
#define SSL_CURVE_SECP521R1 25
#define SSL_CURVE_X25519 29
+#define SSL_CURVE_CECPQ2 16696
// SSL_get_curve_id returns the ID of the curve used by |ssl|'s most recently
// completed handshake or 0 if not applicable.
@@ -4715,6 +4716,14 @@ OPENSSL_EXPORT bool SSL_apply_handoff(SSL *ssl, Span<const uint8_t> handoff);
OPENSSL_EXPORT bool SSL_serialize_handback(const SSL *ssl, CBB *out);
OPENSSL_EXPORT bool SSL_apply_handback(SSL *ssl, Span<const uint8_t> handback);
+// SSL_get_traffic_secrets sets |*out_read_traffic_secret| and
+// |*out_write_traffic_secret| to reference the TLS 1.3 traffic secrets for
+// |ssl|. This function is only valid on TLS 1.3 connections that have
+// completed the handshake. It returns true on success and false on error.
+OPENSSL_EXPORT bool SSL_get_traffic_secrets(
+ const SSL *ssl, Span<const uint8_t> *out_read_traffic_secret,
+ Span<const uint8_t> *out_write_traffic_secret);
+
BSSL_NAMESPACE_END
} // extern C++
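
A hedged usage sketch for the accessor declared above; it assumes a TLS 1.3 |SSL| whose handshake has already completed, and the wrapper name is invented for illustration:

#include <openssl/span.h>
#include <openssl/ssl.h>

// Sketch only: |ssl| must be a TLS 1.3 connection with a completed handshake,
// per the documentation above; otherwise the call fails.
static bool secrets_available(const SSL *ssl) {
  bssl::Span<const uint8_t> read_secret, write_secret;
  if (!SSL_get_traffic_secrets(ssl, &read_secret, &write_secret)) {
    return false;
  }
  // The spans reference buffers owned by |ssl|; copy the bytes out if they
  // need to outlive the connection.
  return read_secret.size() == write_secret.size();
}
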
diff --git a/src/ssl/handoff.cc b/src/ssl/handoff.cc
index 4cca9818..f9dbd135 100644
--- a/src/ssl/handoff.cc
+++ b/src/ssl/handoff.cc
@@ -307,7 +307,7 @@ bool SSL_serialize_handback(const SSL *ssl, CBB *out) {
return false;
}
if (type == handback_after_ecdhe &&
- !s3->hs->key_share->Serialize(&key_share)) {
+ !s3->hs->key_shares[0]->Serialize(&key_share)) {
return false;
}
return CBB_flush(out);
@@ -471,7 +471,7 @@ bool SSL_apply_handback(SSL *ssl, Span<const uint8_t> handback) {
return false;
}
if (type == handback_after_ecdhe &&
- (s3->hs->key_share = SSLKeyShare::Create(&key_share)) == nullptr) {
+ (s3->hs->key_shares[0] = SSLKeyShare::Create(&key_share)) == nullptr) {
return false;
}
diff --git a/src/ssl/handshake_client.cc b/src/ssl/handshake_client.cc
index c1d54bd8..0274dc2a 100644
--- a/src/ssl/handshake_client.cc
+++ b/src/ssl/handshake_client.cc
@@ -590,7 +590,8 @@ static enum ssl_hs_wait_t do_read_server_hello(SSL_HANDSHAKE *hs) {
}
// Clear some TLS 1.3 state that no longer needs to be retained.
- hs->key_share.reset();
+ hs->key_shares[0].reset();
+ hs->key_shares[1].reset();
hs->key_share_bytes.Reset();
// A TLS 1.2 server would not know to skip the early data we offered. Report
@@ -1006,8 +1007,8 @@ static enum ssl_hs_wait_t do_read_server_key_exchange(SSL_HANDSHAKE *hs) {
}
// Initialize ECDH and save the peer public key for later.
- hs->key_share = SSLKeyShare::Create(group_id);
- if (!hs->key_share ||
+ hs->key_shares[0] = SSLKeyShare::Create(group_id);
+ if (!hs->key_shares[0] ||
!hs->peer_key.CopyFrom(point)) {
return ssl_hs_error;
}
@@ -1324,7 +1325,7 @@ static enum ssl_hs_wait_t do_send_client_key_exchange(SSL_HANDSHAKE *hs) {
// Compute the premaster.
uint8_t alert = SSL_AD_DECODE_ERROR;
- if (!hs->key_share->Accept(&child, &pms, &alert, hs->peer_key)) {
+ if (!hs->key_shares[0]->Accept(&child, &pms, &alert, hs->peer_key)) {
ssl_send_alert(ssl, SSL3_AL_FATAL, alert);
return ssl_hs_error;
}
@@ -1333,7 +1334,8 @@ static enum ssl_hs_wait_t do_send_client_key_exchange(SSL_HANDSHAKE *hs) {
}
// The key exchange state may now be discarded.
- hs->key_share.reset();
+ hs->key_shares[0].reset();
+ hs->key_shares[1].reset();
hs->peer_key.Reset();
} else if (alg_k & SSL_kPSK) {
// For plain PSK, other_secret is a block of 0s with the same length as
diff --git a/src/ssl/handshake_server.cc b/src/ssl/handshake_server.cc
index c4f3b75e..8b3b9428 100644
--- a/src/ssl/handshake_server.cc
+++ b/src/ssl/handshake_server.cc
@@ -932,12 +932,12 @@ static enum ssl_hs_wait_t do_send_server_certificate(SSL_HANDSHAKE *hs) {
hs->new_session->group_id = group_id;
// Set up ECDH, generate a key, and emit the public half.
- hs->key_share = SSLKeyShare::Create(group_id);
- if (!hs->key_share ||
+ hs->key_shares[0] = SSLKeyShare::Create(group_id);
+ if (!hs->key_shares[0] ||
!CBB_add_u8(cbb.get(), NAMED_CURVE_TYPE) ||
!CBB_add_u16(cbb.get(), group_id) ||
!CBB_add_u8_length_prefixed(cbb.get(), &child) ||
- !hs->key_share->Offer(&child)) {
+ !hs->key_shares[0]->Offer(&child)) {
return ssl_hs_error;
}
} else {
@@ -1275,13 +1275,14 @@ static enum ssl_hs_wait_t do_read_client_key_exchange(SSL_HANDSHAKE *hs) {
// Compute the premaster.
uint8_t alert = SSL_AD_DECODE_ERROR;
- if (!hs->key_share->Finish(&premaster_secret, &alert, peer_key)) {
+ if (!hs->key_shares[0]->Finish(&premaster_secret, &alert, peer_key)) {
ssl_send_alert(ssl, SSL3_AL_FATAL, alert);
return ssl_hs_error;
}
// The key exchange state may now be discarded.
- hs->key_share.reset();
+ hs->key_shares[0].reset();
+ hs->key_shares[1].reset();
} else if (!(alg_k & SSL_kPSK)) {
OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
ssl_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE);
diff --git a/src/ssl/internal.h b/src/ssl/internal.h
index f8a2ea70..bbce7ec4 100644
--- a/src/ssl/internal.h
+++ b/src/ssl/internal.h
@@ -974,10 +974,10 @@ class SSLKeyShare {
// |out_public_key|. It returns true on success and false on error.
virtual bool Offer(CBB *out_public_key) PURE_VIRTUAL;
- // Accept performs a key exchange against the |peer_key| generated by |offer|.
+ // Accept performs a key exchange against the |peer_key| generated by |Offer|.
// On success, it returns true, writes the public value to |out_public_key|,
- // and sets |*out_secret| the shared secret. On failure, it returns false and
- // sets |*out_alert| to an alert to send to the peer.
+ // and sets |*out_secret| to the shared secret. On failure, it returns false
+ // and sets |*out_alert| to an alert to send to the peer.
//
// The default implementation calls |Offer| and then |Finish|, assuming a key
// exchange protocol where the peers are symmetric.
@@ -986,7 +986,7 @@ class SSLKeyShare {
// Finish performs a key exchange against the |peer_key| generated by
// |Accept|. On success, it returns true and sets |*out_secret| to the shared
- // secret. On failure, it returns zero and sets |*out_alert| to an alert to
+ // secret. On failure, it returns false and sets |*out_alert| to an alert to
// send to the peer.
virtual bool Finish(Array<uint8_t> *out_secret, uint8_t *out_alert,
Span<const uint8_t> peer_key) PURE_VIRTUAL;
@@ -1436,8 +1436,10 @@ struct SSL_HANDSHAKE {
// error, if |wait| is |ssl_hs_error|, is the error the handshake failed on.
UniquePtr<ERR_SAVE_STATE> error;
- // key_share is the current key exchange instance.
- UniquePtr<SSLKeyShare> key_share;
+ // key_shares are the current key exchange instances. The second is only used
+ // as a client if we believe that we should offer two key shares in a
+ // ClientHello.
+ UniquePtr<SSLKeyShare> key_shares[2];
// transcript is the current handshake transcript.
SSLTranscript transcript;
diff --git a/src/ssl/ssl_asn1.cc b/src/ssl/ssl_asn1.cc
index 669f776d..3fd7fb6a 100644
--- a/src/ssl/ssl_asn1.cc
+++ b/src/ssl/ssl_asn1.cc
@@ -697,11 +697,6 @@ UniquePtr<SSL_SESSION> SSL_SESSION_parse(CBS *cbs,
}
}
- if (!x509_method->session_cache_objects(ret.get())) {
- OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_SSL_SESSION);
- return nullptr;
- }
-
CBS age_add;
int age_add_present;
if (!CBS_get_optional_asn1_octet_string(&session, &age_add, &age_add_present,
@@ -737,6 +732,11 @@ UniquePtr<SSL_SESSION> SSL_SESSION_parse(CBS *cbs,
return nullptr;
}
+ if (!x509_method->session_cache_objects(ret.get())) {
+ OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_SSL_SESSION);
+ return nullptr;
+ }
+
return ret;
}
diff --git a/src/ssl/ssl_key_share.cc b/src/ssl/ssl_key_share.cc
index 80b7d0a0..108ea6a9 100644
--- a/src/ssl/ssl_key_share.cc
+++ b/src/ssl/ssl_key_share.cc
@@ -24,8 +24,10 @@
#include <openssl/curve25519.h>
#include <openssl/ec.h>
#include <openssl/err.h>
+#include <openssl/hrss.h>
#include <openssl/mem.h>
#include <openssl/nid.h>
+#include <openssl/rand.h>
#include "internal.h"
#include "../crypto/internal.h"
@@ -38,7 +40,6 @@ namespace {
class ECKeyShare : public SSLKeyShare {
public:
ECKeyShare(int nid, uint16_t group_id) : nid_(nid), group_id_(group_id) {}
- ~ECKeyShare() override {}
uint16_t GroupID() const override { return group_id_; }
@@ -159,9 +160,6 @@ class ECKeyShare : public SSLKeyShare {
class X25519KeyShare : public SSLKeyShare {
public:
X25519KeyShare() {}
- ~X25519KeyShare() override {
- OPENSSL_cleanse(private_key_, sizeof(private_key_));
- }
uint16_t GroupID() const override { return SSL_CURVE_X25519; }
@@ -211,12 +209,104 @@ class X25519KeyShare : public SSLKeyShare {
uint8_t private_key_[32];
};
+class CECPQ2KeyShare : public SSLKeyShare {
+ public:
+ CECPQ2KeyShare() {}
+
+ uint16_t GroupID() const override { return SSL_CURVE_CECPQ2; }
+
+ bool Offer(CBB *out) override {
+ uint8_t x25519_public_key[32];
+ X25519_keypair(x25519_public_key, x25519_private_key_);
+
+ uint8_t hrss_entropy[HRSS_GENERATE_KEY_BYTES];
+ RAND_bytes(hrss_entropy, sizeof(hrss_entropy));
+ HRSS_generate_key(&hrss_public_key_, &hrss_private_key_, hrss_entropy);
+
+ uint8_t hrss_public_key_bytes[HRSS_PUBLIC_KEY_BYTES];
+ HRSS_marshal_public_key(hrss_public_key_bytes, &hrss_public_key_);
+
+ if (!CBB_add_bytes(out, x25519_public_key, sizeof(x25519_public_key)) ||
+ !CBB_add_bytes(out, hrss_public_key_bytes,
+ sizeof(hrss_public_key_bytes))) {
+ return false;
+ }
+
+ return true;
+  }
+
+ bool Accept(CBB *out_public_key, Array<uint8_t> *out_secret,
+ uint8_t *out_alert, Span<const uint8_t> peer_key) override {
+ Array<uint8_t> secret;
+ if (!secret.Init(32 + HRSS_KEY_BYTES)) {
+ OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
+ return false;
+ }
+
+ uint8_t x25519_public_key[32];
+ X25519_keypair(x25519_public_key, x25519_private_key_);
+
+ HRSS_public_key peer_public_key;
+ if (peer_key.size() != 32 + HRSS_PUBLIC_KEY_BYTES ||
+ !HRSS_parse_public_key(&peer_public_key, peer_key.data() + 32) ||
+ !X25519(secret.data(), x25519_private_key_, peer_key.data())) {
+ *out_alert = SSL_AD_DECODE_ERROR;
+ OPENSSL_PUT_ERROR(SSL, SSL_R_BAD_ECPOINT);
+ return false;
+ }
+
+ uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES];
+ uint8_t entropy[HRSS_ENCAP_BYTES];
+ RAND_bytes(entropy, sizeof(entropy));
+ HRSS_encap(ciphertext, secret.data() + 32, &peer_public_key, entropy);
+
+ if (!CBB_add_bytes(out_public_key, x25519_public_key,
+ sizeof(x25519_public_key)) ||
+ !CBB_add_bytes(out_public_key, ciphertext, sizeof(ciphertext))) {
+ return false;
+ }
+
+ *out_secret = std::move(secret);
+ return true;
+ }
+
+ bool Finish(Array<uint8_t> *out_secret, uint8_t *out_alert,
+ Span<const uint8_t> peer_key) override {
+ *out_alert = SSL_AD_INTERNAL_ERROR;
+
+ Array<uint8_t> secret;
+ if (!secret.Init(32 + HRSS_KEY_BYTES)) {
+ OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
+ return false;
+ }
+
+ if (peer_key.size() != 32 + HRSS_CIPHERTEXT_BYTES ||
+ !X25519(secret.data(), x25519_private_key_, peer_key.data())) {
+ *out_alert = SSL_AD_DECODE_ERROR;
+ OPENSSL_PUT_ERROR(SSL, SSL_R_BAD_ECPOINT);
+ return false;
+ }
+
+ HRSS_decap(secret.data() + 32, &hrss_public_key_, &hrss_private_key_,
+ peer_key.data() + 32, peer_key.size() - 32);
+
+ *out_secret = std::move(secret);
+ return true;
+  }
+
+ private:
+ uint8_t x25519_private_key_[32];
+ HRSS_public_key hrss_public_key_;
+ HRSS_private_key hrss_private_key_;
+};
+
CONSTEXPR_ARRAY NamedGroup kNamedGroups[] = {
{NID_secp224r1, SSL_CURVE_SECP224R1, "P-224", "secp224r1"},
{NID_X9_62_prime256v1, SSL_CURVE_SECP256R1, "P-256", "prime256v1"},
{NID_secp384r1, SSL_CURVE_SECP384R1, "P-384", "secp384r1"},
{NID_secp521r1, SSL_CURVE_SECP521R1, "P-521", "secp521r1"},
{NID_X25519, SSL_CURVE_X25519, "X25519", "x25519"},
+ {NID_CECPQ2, SSL_CURVE_CECPQ2, "CECPQ2", "CECPQ2"},
};
} // namespace
@@ -241,6 +331,8 @@ UniquePtr<SSLKeyShare> SSLKeyShare::Create(uint16_t group_id) {
New<ECKeyShare>(NID_secp521r1, SSL_CURVE_SECP521R1));
case SSL_CURVE_X25519:
return UniquePtr<SSLKeyShare>(New<X25519KeyShare>());
+ case SSL_CURVE_CECPQ2:
+ return UniquePtr<SSLKeyShare>(New<CECPQ2KeyShare>());
default:
return nullptr;
}
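
For orientation, the CECPQ2 key share built by the class above is simply an X25519 value concatenated with an HRSS value, and the derived secret concatenates the two shared keys. A small sketch of the resulting sizes, using the constants from <openssl/hrss.h> (the kCECPQ2* names are invented for illustration):

#include <stddef.h>

#include <openssl/hrss.h>

// ClientHello key share: 32-byte X25519 public value || HRSS public key.
constexpr size_t kCECPQ2OfferBytes = 32 + HRSS_PUBLIC_KEY_BYTES;   // 1170
// ServerHello key share: 32-byte X25519 public value || HRSS ciphertext.
constexpr size_t kCECPQ2AcceptBytes = 32 + HRSS_CIPHERTEXT_BYTES;  // 1170
// Shared secret fed to the key schedule: X25519 output || HRSS shared key.
constexpr size_t kCECPQ2SecretBytes = 32 + HRSS_KEY_BYTES;         // 64
static_assert(kCECPQ2OfferBytes == kCECPQ2AcceptBytes,
              "HRSS public keys and ciphertexts are the same length here");
static_assert(kCECPQ2SecretBytes == 64, "two 32-byte shared secrets");
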
diff --git a/src/ssl/ssl_lib.cc b/src/ssl/ssl_lib.cc
index b9c823d9..ceeba89c 100644
--- a/src/ssl/ssl_lib.cc
+++ b/src/ssl/ssl_lib.cc
@@ -506,6 +506,27 @@ void SSL_set_handoff_mode(SSL *ssl, bool on) {
ssl->config->handoff = on;
}
+bool SSL_get_traffic_secrets(const SSL *ssl,
+ Span<const uint8_t> *out_read_traffic_secret,
+ Span<const uint8_t> *out_write_traffic_secret) {
+ if (SSL_version(ssl) < TLS1_3_VERSION) {
+ OPENSSL_PUT_ERROR(SSL, SSL_R_WRONG_SSL_VERSION);
+ return false;
+ }
+
+ if (!ssl->s3->initial_handshake_complete) {
+ OPENSSL_PUT_ERROR(SSL, SSL_R_HANDSHAKE_NOT_COMPLETE);
+ return false;
+ }
+
+ *out_read_traffic_secret = Span<const uint8_t>(
+ ssl->s3->read_traffic_secret, ssl->s3->read_traffic_secret_len);
+ *out_write_traffic_secret = Span<const uint8_t>(
+ ssl->s3->write_traffic_secret, ssl->s3->write_traffic_secret_len);
+
+ return true;
+}
+
BSSL_NAMESPACE_END
using namespace bssl;
diff --git a/src/ssl/ssl_test.cc b/src/ssl/ssl_test.cc
index 470379c0..8d01c03a 100644
--- a/src/ssl/ssl_test.cc
+++ b/src/ssl/ssl_test.cc
@@ -395,6 +395,11 @@ static const CurveTest kCurveTests[] = {
{ SSL_CURVE_SECP256R1 },
},
{
+ "P-256:CECPQ2",
+ { SSL_CURVE_SECP256R1, SSL_CURVE_CECPQ2 },
+ },
+
+ {
"P-256:P-384:P-521:X25519",
{
SSL_CURVE_SECP256R1,
@@ -4516,6 +4521,65 @@ TEST(SSLTest, GetCertificateThreads) {
EXPECT_EQ(cert2, cert2_thread);
EXPECT_EQ(0, X509_cmp(cert.get(), cert2));
}
+
+// Functions which access properties on the negotiated session are thread-safe
+// where needed. Prior to TLS 1.3, clients resuming sessions and servers
+// performing stateful resumption will share an underlying SSL_SESSION object,
+// potentially across threads.
+TEST_P(SSLVersionTest, SessionPropertiesThreads) {
+ if (version() == TLS1_3_VERSION) {
+ // Our TLS 1.3 implementation does not support stateful resumption.
+ ASSERT_FALSE(CreateClientSession(client_ctx_.get(), server_ctx_.get()));
+ return;
+ }
+
+ SSL_CTX_set_options(server_ctx_.get(), SSL_OP_NO_TICKET);
+ SSL_CTX_set_session_cache_mode(client_ctx_.get(), SSL_SESS_CACHE_BOTH);
+ SSL_CTX_set_session_cache_mode(server_ctx_.get(), SSL_SESS_CACHE_BOTH);
+
+ ASSERT_TRUE(UseCertAndKey(client_ctx_.get()));
+ ASSERT_TRUE(UseCertAndKey(server_ctx_.get()));
+
+ // Configure mutual authentication, so we have more session state.
+ SSL_CTX_set_custom_verify(
+ client_ctx_.get(), SSL_VERIFY_PEER,
+ [](SSL *ssl, uint8_t *out_alert) { return ssl_verify_ok; });
+ SSL_CTX_set_custom_verify(
+ server_ctx_.get(), SSL_VERIFY_PEER,
+ [](SSL *ssl, uint8_t *out_alert) { return ssl_verify_ok; });
+
+ // Establish a client session to test with.
+ bssl::UniquePtr<SSL_SESSION> session =
+ CreateClientSession(client_ctx_.get(), server_ctx_.get());
+ ASSERT_TRUE(session);
+
+ // Resume with it twice.
+ UniquePtr<SSL> ssls[4];
+ ClientConfig config;
+ config.session = session.get();
+ ASSERT_TRUE(ConnectClientAndServer(&ssls[0], &ssls[1], client_ctx_.get(),
+ server_ctx_.get(), config));
+ ASSERT_TRUE(ConnectClientAndServer(&ssls[2], &ssls[3], client_ctx_.get(),
+ server_ctx_.get(), config));
+
+ // Read properties in parallel.
+ auto read_properties = [](const SSL *ssl) {
+ EXPECT_TRUE(SSL_get_peer_cert_chain(ssl));
+ bssl::UniquePtr<X509> peer(SSL_get_peer_certificate(ssl));
+ EXPECT_TRUE(peer);
+ EXPECT_TRUE(SSL_get_current_cipher(ssl));
+ EXPECT_TRUE(SSL_get_curve_id(ssl));
+ };
+
+ std::vector<std::thread> threads;
+ for (const auto &ssl_ptr : ssls) {
+ const SSL *ssl = ssl_ptr.get();
+ threads.emplace_back([=] { read_properties(ssl); });
+ }
+ for (auto &thread : threads) {
+ thread.join();
+ }
+}
#endif
constexpr size_t kNumQUICLevels = 4;
diff --git a/src/ssl/ssl_x509.cc b/src/ssl/ssl_x509.cc
index ec203b22..eb3a38b7 100644
--- a/src/ssl/ssl_x509.cc
+++ b/src/ssl/ssl_x509.cc
@@ -281,16 +281,25 @@ static void ssl_crypto_x509_cert_dup(CERT *new_cert, const CERT *cert) {
}
static int ssl_crypto_x509_session_cache_objects(SSL_SESSION *sess) {
- bssl::UniquePtr<STACK_OF(X509)> chain;
+ bssl::UniquePtr<STACK_OF(X509)> chain, chain_without_leaf;
if (sk_CRYPTO_BUFFER_num(sess->certs.get()) > 0) {
chain.reset(sk_X509_new_null());
if (!chain) {
OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
return 0;
}
+ if (sess->is_server) {
+ // chain_without_leaf is only needed for server sessions. See
+ // |SSL_get_peer_cert_chain|.
+ chain_without_leaf.reset(sk_X509_new_null());
+ if (!chain_without_leaf) {
+ OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
+ return 0;
+ }
+ }
}
- X509 *leaf = nullptr;
+ bssl::UniquePtr<X509> leaf;
for (CRYPTO_BUFFER *cert : sess->certs.get()) {
UniquePtr<X509> x509(X509_parse_from_buffer(cert));
if (!x509) {
@@ -298,7 +307,11 @@ static int ssl_crypto_x509_session_cache_objects(SSL_SESSION *sess) {
return 0;
}
if (leaf == nullptr) {
- leaf = x509.get();
+ leaf = UpRef(x509);
+ } else if (chain_without_leaf &&
+ !PushToStack(chain_without_leaf.get(), UpRef(x509))) {
+ OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
+ return 0;
}
if (!PushToStack(chain.get(), std::move(x509))) {
OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
@@ -308,26 +321,28 @@ static int ssl_crypto_x509_session_cache_objects(SSL_SESSION *sess) {
sk_X509_pop_free(sess->x509_chain, X509_free);
sess->x509_chain = chain.release();
+
sk_X509_pop_free(sess->x509_chain_without_leaf, X509_free);
- sess->x509_chain_without_leaf = NULL;
+ sess->x509_chain_without_leaf = chain_without_leaf.release();
X509_free(sess->x509_peer);
- if (leaf != NULL) {
- X509_up_ref(leaf);
- }
- sess->x509_peer = leaf;
+ sess->x509_peer = leaf.release();
return 1;
}
static int ssl_crypto_x509_session_dup(SSL_SESSION *new_session,
const SSL_SESSION *session) {
- if (session->x509_peer != NULL) {
- X509_up_ref(session->x509_peer);
- new_session->x509_peer = session->x509_peer;
- }
- if (session->x509_chain != NULL) {
+ new_session->x509_peer = UpRef(session->x509_peer).release();
+ if (session->x509_chain != nullptr) {
new_session->x509_chain = X509_chain_up_ref(session->x509_chain);
- if (new_session->x509_chain == NULL) {
+ if (new_session->x509_chain == nullptr) {
+ return 0;
+ }
+ }
+ if (session->x509_chain_without_leaf != nullptr) {
+ new_session->x509_chain_without_leaf =
+ X509_chain_up_ref(session->x509_chain_without_leaf);
+ if (new_session->x509_chain_without_leaf == nullptr) {
return 0;
}
}
@@ -525,38 +540,17 @@ X509 *SSL_get_peer_certificate(const SSL *ssl) {
STACK_OF(X509) *SSL_get_peer_cert_chain(const SSL *ssl) {
check_ssl_x509_method(ssl);
- if (ssl == NULL) {
- return NULL;
+ if (ssl == nullptr) {
+ return nullptr;
}
SSL_SESSION *session = SSL_get_session(ssl);
- if (session == NULL ||
- session->x509_chain == NULL) {
- return NULL;
- }
-
- if (!ssl->server) {
- return session->x509_chain;
+ if (session == nullptr) {
+ return nullptr;
}
// OpenSSL historically didn't include the leaf certificate in the returned
// certificate chain, but only for servers.
- if (session->x509_chain_without_leaf == NULL) {
- session->x509_chain_without_leaf = sk_X509_new_null();
- if (session->x509_chain_without_leaf == NULL) {
- return NULL;
- }
-
- for (size_t i = 1; i < sk_X509_num(session->x509_chain); i++) {
- X509 *cert = sk_X509_value(session->x509_chain, i);
- if (!PushToStack(session->x509_chain_without_leaf, UpRef(cert))) {
- sk_X509_pop_free(session->x509_chain_without_leaf, X509_free);
- session->x509_chain_without_leaf = NULL;
- return NULL;
- }
- }
- }
-
- return session->x509_chain_without_leaf;
+ return ssl->server ? session->x509_chain_without_leaf : session->x509_chain;
}
STACK_OF(X509) *SSL_get_peer_full_cert_chain(const SSL *ssl) {
diff --git a/src/ssl/t1_lib.cc b/src/ssl/t1_lib.cc
index 678e4a3b..5e65f819 100644
--- a/src/ssl/t1_lib.cc
+++ b/src/ssl/t1_lib.cc
@@ -292,10 +292,23 @@ static const uint16_t kDefaultGroups[] = {
SSL_CURVE_SECP384R1,
};
+// TLS 1.3 servers will pick CECPQ2 if offered by a client, but it's not enabled
+// by default for clients.
+static const uint16_t kDefaultGroupsServer[] = {
+ // CECPQ2 is not yet enabled by default.
+ // SSL_CURVE_CECPQ2,
+ SSL_CURVE_X25519,
+ SSL_CURVE_SECP256R1,
+ SSL_CURVE_SECP384R1,
+};
+
Span<const uint16_t> tls1_get_grouplist(const SSL_HANDSHAKE *hs) {
if (!hs->config->supported_group_list.empty()) {
return hs->config->supported_group_list;
}
+ if (hs->ssl->server) {
+ return Span<const uint16_t>(kDefaultGroupsServer);
+ }
return Span<const uint16_t>(kDefaultGroups);
}
@@ -324,7 +337,11 @@ bool tls1_get_shared_group(SSL_HANDSHAKE *hs, uint16_t *out_group_id) {
for (uint16_t pref_group : pref) {
for (uint16_t supp_group : supp) {
- if (pref_group == supp_group) {
+ if (pref_group == supp_group &&
+ // CECPQ2 doesn't fit in the u8-length-prefixed ECPoint field in TLS
+ // 1.2 and below.
+ (ssl_protocol_version(ssl) >= TLS1_3_VERSION ||
+ pref_group != SSL_CURVE_CECPQ2)) {
*out_group_id = pref_group;
return true;
}
@@ -386,6 +403,12 @@ bool tls1_set_curves_list(Array<uint16_t> *out_group_ids, const char *curves) {
}
bool tls1_check_group_id(const SSL_HANDSHAKE *hs, uint16_t group_id) {
+ if (group_id == SSL_CURVE_CECPQ2 &&
+ ssl_protocol_version(hs->ssl) < TLS1_3_VERSION) {
+ // CECPQ2 requires TLS 1.3.
+ return false;
+ }
+
for (uint16_t supported : tls1_get_grouplist(hs)) {
if (supported == group_id) {
return true;
@@ -1038,7 +1061,6 @@ static bool ext_sigalgs_parse_clienthello(SSL_HANDSHAKE *hs, uint8_t *out_alert,
CBS supported_signature_algorithms;
if (!CBS_get_u16_length_prefixed(contents, &supported_signature_algorithms) ||
CBS_len(contents) != 0 ||
- CBS_len(&supported_signature_algorithms) == 0 ||
!tls1_parse_peer_sigalgs(hs, &supported_signature_algorithms)) {
return false;
}
@@ -2145,6 +2167,7 @@ static bool ext_key_share_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) {
}
uint16_t group_id = hs->retry_group;
+ uint16_t second_group_id = 0;
if (hs->received_hello_retry_request) {
// We received a HelloRetryRequest without a new curve, so there is no new
// share to append. Leave |hs->key_share| as-is.
@@ -2175,19 +2198,38 @@ static bool ext_key_share_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) {
}
group_id = groups[0];
+
+ if (group_id == SSL_CURVE_CECPQ2 && groups.size() >= 2) {
+ // CECPQ2 is not sent as the only initial key share. We'll include the
+ // 2nd preference group too to avoid round-trips.
+ second_group_id = groups[1];
+ assert(second_group_id != group_id);
+ }
}
- hs->key_share = SSLKeyShare::Create(group_id);
CBB key_exchange;
- if (!hs->key_share ||
+ hs->key_shares[0] = SSLKeyShare::Create(group_id);
+ if (!hs->key_shares[0] ||
!CBB_add_u16(&kse_bytes, group_id) ||
!CBB_add_u16_length_prefixed(&kse_bytes, &key_exchange) ||
- !hs->key_share->Offer(&key_exchange) ||
+ !hs->key_shares[0]->Offer(&key_exchange) ||
!CBB_flush(&kse_bytes)) {
return false;
}
- // Save the contents of the extension to repeat it in the second ClientHello.
+ if (second_group_id != 0) {
+ hs->key_shares[1] = SSLKeyShare::Create(second_group_id);
+ if (!hs->key_shares[1] ||
+ !CBB_add_u16(&kse_bytes, second_group_id) ||
+ !CBB_add_u16_length_prefixed(&kse_bytes, &key_exchange) ||
+ !hs->key_shares[1]->Offer(&key_exchange) ||
+ !CBB_flush(&kse_bytes)) {
+ return false;
+ }
+ }
+
+ // Save the contents of the extension to repeat it in the second
+ // ClientHello.
if (!hs->received_hello_retry_request &&
!hs->key_share_bytes.CopyFrom(
MakeConstSpan(CBB_data(&kse_bytes), CBB_len(&kse_bytes)))) {
@@ -2210,19 +2252,24 @@ bool ssl_ext_key_share_parse_serverhello(SSL_HANDSHAKE *hs,
return false;
}
- if (hs->key_share->GroupID() != group_id) {
- *out_alert = SSL_AD_ILLEGAL_PARAMETER;
- OPENSSL_PUT_ERROR(SSL, SSL_R_WRONG_CURVE);
- return false;
+ SSLKeyShare *key_share = hs->key_shares[0].get();
+ if (key_share->GroupID() != group_id) {
+ if (!hs->key_shares[1] || hs->key_shares[1]->GroupID() != group_id) {
+ *out_alert = SSL_AD_ILLEGAL_PARAMETER;
+ OPENSSL_PUT_ERROR(SSL, SSL_R_WRONG_CURVE);
+ return false;
+ }
+ key_share = hs->key_shares[1].get();
}
- if (!hs->key_share->Finish(out_secret, out_alert, peer_key)) {
+ if (!key_share->Finish(out_secret, out_alert, peer_key)) {
*out_alert = SSL_AD_INTERNAL_ERROR;
return false;
}
hs->new_session->group_id = group_id;
- hs->key_share.reset();
+ hs->key_shares[0].reset();
+ hs->key_shares[1].reset();
return true;
}
@@ -2390,6 +2437,10 @@ static bool ext_supported_groups_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) {
}
for (uint16_t group : tls1_get_grouplist(hs)) {
+ if (group == SSL_CURVE_CECPQ2 &&
+ hs->max_version < TLS1_3_VERSION) {
+ continue;
+ }
if (!CBB_add_u16(&groups_bytes, group)) {
return false;
}
@@ -3556,7 +3607,10 @@ bool tls1_parse_peer_sigalgs(SSL_HANDSHAKE *hs, const CBS *in_sigalgs) {
return true;
}
- return parse_u16_array(in_sigalgs, &hs->peer_sigalgs);
+ // In all contexts, the signature algorithms list may not be empty. (It may be
+ // omitted by clients in TLS 1.2, but then the entire extension is omitted.)
+ return CBS_len(in_sigalgs) != 0 &&
+ parse_u16_array(in_sigalgs, &hs->peer_sigalgs);
}
bool tls1_get_legacy_signature_algorithm(uint16_t *out, const EVP_PKEY *pkey) {
diff --git a/src/ssl/test/bssl_shim.cc b/src/ssl/test/bssl_shim.cc
index 675a08a0..77ed7968 100644
--- a/src/ssl/test/bssl_shim.cc
+++ b/src/ssl/test/bssl_shim.cc
@@ -649,7 +649,6 @@ static bool DoConnection(bssl::UniquePtr<SSL_SESSION> *out_session,
SSL_set_connect_state(ssl.get());
}
-
int sock = Connect(config->port);
if (sock == -1) {
return false;
@@ -837,6 +836,23 @@ static bool DoExchange(bssl::UniquePtr<SSL_SESSION> *out_session,
}
}
+ if (config->export_traffic_secrets) {
+ bssl::Span<const uint8_t> read_secret, write_secret;
+ if (!SSL_get_traffic_secrets(ssl, &read_secret, &write_secret)) {
+ fprintf(stderr, "failed to export traffic secrets\n");
+ return false;
+ }
+
+ assert(read_secret.size() <= 0xffff);
+ assert(write_secret.size() == read_secret.size());
+ const uint16_t secret_len = read_secret.size();
+ if (WriteAll(ssl, &secret_len, sizeof(secret_len)) < 0 ||
+ WriteAll(ssl, read_secret.data(), read_secret.size()) < 0 ||
+ WriteAll(ssl, write_secret.data(), write_secret.size()) < 0) {
+ return false;
+ }
+ }
+
if (config->tls_unique) {
uint8_t tls_unique[16];
size_t tls_unique_len;
diff --git a/src/ssl/test/runner/cipher_suites.go b/src/ssl/test/runner/cipher_suites.go
index f4c59006..3246f0b7 100644
--- a/src/ssl/test/runner/cipher_suites.go
+++ b/src/ssl/test/runner/cipher_suites.go
@@ -26,7 +26,7 @@ type keyAgreement interface {
// In the case that the key agreement protocol doesn't use a
// ServerKeyExchange message, generateServerKeyExchange can return nil,
// nil.
- generateServerKeyExchange(*Config, *Certificate, *clientHelloMsg, *serverHelloMsg) (*serverKeyExchangeMsg, error)
+ generateServerKeyExchange(*Config, *Certificate, *clientHelloMsg, *serverHelloMsg, uint16) (*serverKeyExchangeMsg, error)
processClientKeyExchange(*Config, *Certificate, *clientKeyExchangeMsg, uint16) ([]byte, error)
// On the client side, the next two methods are called in order.
diff --git a/src/ssl/test/runner/common.go b/src/ssl/test/runner/common.go
index 73b8889e..d99518c9 100644
--- a/src/ssl/test/runner/common.go
+++ b/src/ssl/test/runner/common.go
@@ -163,6 +163,7 @@ const (
CurveP384 CurveID = 24
CurveP521 CurveID = 25
CurveX25519 CurveID = 29
+ CurveCECPQ2 CurveID = 16696
)
// TLS Elliptic Curve Point Formats
@@ -1645,6 +1646,18 @@ type ProtocolBugs struct {
// ExpectJDK11DowngradeRandom is whether the client should expect the
// server to send the JDK 11 downgrade signal.
ExpectJDK11DowngradeRandom bool
+
+ // FailIfHelloRetryRequested causes a handshake failure if a server requests a
+ // hello retry.
+ FailIfHelloRetryRequested bool
+
+	// FailIfCECPQ2Offered will cause a server to reject a ClientHello if CECPQ2
+	// is offered.
+ FailIfCECPQ2Offered bool
+
+	// ExpectedKeyShares, if not nil, lists (in order) the curves that a
+	// ClientHello should have key shares for.
+ ExpectedKeyShares []CurveID
}
func (c *Config) serverInit() {
@@ -1724,7 +1737,7 @@ func (c *Config) maxVersion(isDTLS bool) uint16 {
return ret
}
-var defaultCurvePreferences = []CurveID{CurveX25519, CurveP256, CurveP384, CurveP521}
+var defaultCurvePreferences = []CurveID{CurveCECPQ2, CurveX25519, CurveP256, CurveP384, CurveP521}
func (c *Config) curvePreferences() []CurveID {
if c == nil || len(c.CurvePreferences) == 0 {
diff --git a/src/ssl/test/runner/handshake_client.go b/src/ssl/test/runner/handshake_client.go
index ab1f4dd2..5234462d 100644
--- a/src/ssl/test/runner/handshake_client.go
+++ b/src/ssl/test/runner/handshake_client.go
@@ -549,6 +549,9 @@ NextCipherSuite:
helloRetryRequest, haveHelloRetryRequest := msg.(*helloRetryRequestMsg)
var secondHelloBytes []byte
if haveHelloRetryRequest {
+ if c.config.Bugs.FailIfHelloRetryRequested {
+ return errors.New("tls: unexpected HelloRetryRequest")
+ }
// Explicitly read the ChangeCipherSpec now; it should
// be attached to the first flight, not the second flight.
if err := c.readTLS13ChangeCipherSpec(); err != nil {
diff --git a/src/ssl/test/runner/handshake_messages.go b/src/ssl/test/runner/handshake_messages.go
index e0867a51..823c6c8f 100644
--- a/src/ssl/test/runner/handshake_messages.go
+++ b/src/ssl/test/runner/handshake_messages.go
@@ -653,6 +653,23 @@ func parseSignatureAlgorithms(reader *byteReader, out *[]signatureAlgorithm, all
return true
}
+func checkDuplicateExtensions(extensions byteReader) bool {
+ seen := make(map[uint16]struct{})
+ for len(extensions) > 0 {
+ var extension uint16
+ var body byteReader
+ if !extensions.readU16(&extension) ||
+ !extensions.readU16LengthPrefixed(&body) {
+ return false
+ }
+ if _, ok := seen[extension]; ok {
+ return false
+ }
+ seen[extension] = struct{}{}
+ }
+ return true
+}
+
func (m *clientHelloMsg) unmarshal(data []byte) bool {
m.raw = data
reader := byteReader(data[4:])
@@ -707,7 +724,7 @@ func (m *clientHelloMsg) unmarshal(data []byte) bool {
}
var extensions byteReader
- if !reader.readU16LengthPrefixed(&extensions) || len(reader) != 0 {
+ if !reader.readU16LengthPrefixed(&extensions) || len(reader) != 0 || !checkDuplicateExtensions(extensions) {
return false
}
for len(extensions) > 0 {
@@ -923,6 +940,13 @@ func (m *clientHelloMsg) unmarshal(data []byte) bool {
seen[algID] = struct{}{}
m.compressedCertAlgs = append(m.compressedCertAlgs, algID)
}
+ case extensionPadding:
+ // Padding bytes must be all zero.
+ for _, b := range body {
+ if b != 0 {
+ return false
+ }
+ }
}
if isGREASEValue(extension) {
@@ -1067,7 +1091,7 @@ func (m *serverHelloMsg) unmarshal(data []byte) bool {
}
var extensions byteReader
- if !reader.readU16LengthPrefixed(&extensions) || len(reader) != 0 {
+ if !reader.readU16LengthPrefixed(&extensions) || len(reader) != 0 || !checkDuplicateExtensions(extensions) {
return false
}
@@ -1330,6 +1354,10 @@ func (m *serverExtensions) unmarshal(data byteReader, version uint16) bool {
// Reset all fields.
*m = serverExtensions{}
+ if !checkDuplicateExtensions(data) {
+ return false
+ }
+
for len(data) > 0 {
var extension uint16
var body byteReader
@@ -1651,7 +1679,7 @@ func (m *certificateMsg) unmarshal(data []byte) bool {
}
if m.hasRequestContext {
var extensions byteReader
- if !certs.readU16LengthPrefixed(&extensions) {
+ if !certs.readU16LengthPrefixed(&extensions) || !checkDuplicateExtensions(extensions) {
return false
}
for len(extensions) > 0 {
@@ -2010,7 +2038,8 @@ func (m *certificateRequestMsg) unmarshal(data []byte) bool {
var extensions byteReader
if !reader.readU8LengthPrefixedBytes(&m.requestContext) ||
!reader.readU16LengthPrefixed(&extensions) ||
- len(reader) != 0 {
+ len(reader) != 0 ||
+ !checkDuplicateExtensions(extensions) {
return false
}
for len(extensions) > 0 {
diff --git a/src/ssl/test/runner/handshake_server.go b/src/ssl/test/runner/handshake_server.go
index 6a752421..5486342a 100644
--- a/src/ssl/test/runner/handshake_server.go
+++ b/src/ssl/test/runner/handshake_server.go
@@ -208,6 +208,26 @@ func (hs *serverHandshakeState) readClientHello() error {
}
}
+ if config.Bugs.FailIfCECPQ2Offered {
+ for _, offeredCurve := range hs.clientHello.supportedCurves {
+ if offeredCurve == CurveCECPQ2 {
+ return errors.New("tls: CECPQ2 was offered")
+ }
+ }
+ }
+
+ if expected := config.Bugs.ExpectedKeyShares; expected != nil {
+ if len(expected) != len(hs.clientHello.keyShares) {
+ return fmt.Errorf("tls: expected %d key shares, but found %d", len(expected), len(hs.clientHello.keyShares))
+ }
+
+ for i, group := range expected {
+ if found := hs.clientHello.keyShares[i].group; found != group {
+ return fmt.Errorf("tls: key share #%d is for group %d, not %d", i, found, group)
+ }
+ }
+ }
+
c.clientVersion = hs.clientHello.vers
// Use the versions extension if supplied, otherwise use the legacy ClientHello version.
@@ -1212,6 +1232,11 @@ func (hs *serverHandshakeState) processClientHello() (isResume bool, err error)
preferredCurves := config.curvePreferences()
Curves:
for _, curve := range hs.clientHello.supportedCurves {
+ if curve == CurveCECPQ2 && c.vers < VersionTLS13 {
+ // CECPQ2 is TLS 1.3-only.
+ continue
+ }
+
for _, supported := range preferredCurves {
if supported == curve {
supportedCurve = true
@@ -1621,7 +1646,7 @@ func (hs *serverHandshakeState) doFullHandshake() error {
}
keyAgreement := hs.suite.ka(c.vers)
- skx, err := keyAgreement.generateServerKeyExchange(config, hs.cert, hs.clientHello, hs.hello)
+ skx, err := keyAgreement.generateServerKeyExchange(config, hs.cert, hs.clientHello, hs.hello, c.vers)
if err != nil {
c.sendAlert(alertHandshakeFailure)
return err
diff --git a/src/ssl/test/runner/hrss/hrss.go b/src/ssl/test/runner/hrss/hrss.go
new file mode 100644
index 00000000..9f4fdd77
--- /dev/null
+++ b/src/ssl/test/runner/hrss/hrss.go
@@ -0,0 +1,1212 @@
+package hrss
+
+import (
+ "crypto/hmac"
+ "crypto/sha256"
+ "crypto/subtle"
+ "encoding/binary"
+ "io"
+ "math/bits"
+)
+
+const (
+ PublicKeySize = modQBytes
+ CiphertextSize = modQBytes
+)
+
+const (
+ N = 701
+ Q = 8192
+ mod3Bytes = 140
+ modQBytes = 1138
+)
+
+const (
+ bitsPerWord = bits.UintSize
+ wordsPerPoly = (N + bitsPerWord - 1) / bitsPerWord
+ fullWordsPerPoly = N / bitsPerWord
+ bitsInLastWord = N % bitsPerWord
+)
+
+// poly3 represents a degree-N polynomial over GF(3). Each coefficient is
+// bitsliced across the |s| and |a| arrays, like this:
+//
+// s | a | value
+// -----------------
+// 0 | 0 | 0
+// 0 | 1 | 1
+// 1 | 0 | 2 (aka -1)
+// 1 | 1 | <invalid>
+//
+// ('s' is for sign, and 'a' is just a letter.)
+//
+// Once bitsliced as such, the following circuits can be used to implement
+// addition and multiplication mod 3:
+//
+// (s3, a3) = (s1, a1) × (s2, a2)
+// s3 = (s2 ∧ a1) ⊕ (s1 ∧ a2)
+// a3 = (s1 ∧ s2) ⊕ (a1 ∧ a2)
+//
+// (s3, a3) = (s1, a1) + (s2, a2)
+// t1 = ~(s1 ∨ a1)
+// t2 = ~(s2 ∨ a2)
+// s3 = (a1 ∧ a2) ⊕ (t1 ∧ s2) ⊕ (t2 ∧ s1)
+// a3 = (s1 ∧ s2) ⊕ (t1 ∧ a2) ⊕ (t2 ∧ a1)
+//
+// Negating a value just involves swapping s and a.
+type poly3 struct {
+ s [wordsPerPoly]uint
+ a [wordsPerPoly]uint
+}
+
+func (p *poly3) trim() {
+ p.s[wordsPerPoly-1] &= (1 << bitsInLastWord) - 1
+ p.a[wordsPerPoly-1] &= (1 << bitsInLastWord) - 1
+}
+
+func (p *poly3) zero() {
+ for i := range p.a {
+ p.s[i] = 0
+ p.a[i] = 0
+ }
+}
+
+func (p *poly3) fromDiscrete(in *poly) {
+ var shift uint
+ s := p.s[:]
+ a := p.a[:]
+ s[0] = 0
+ a[0] = 0
+
+ for _, v := range in {
+ s[0] >>= 1
+ s[0] |= uint((v>>1)&1) << (bitsPerWord - 1)
+ a[0] >>= 1
+ a[0] |= uint(v&1) << (bitsPerWord - 1)
+ shift++
+ if shift == bitsPerWord {
+ s = s[1:]
+ a = a[1:]
+ s[0] = 0
+ a[0] = 0
+ shift = 0
+ }
+ }
+
+ a[0] >>= bitsPerWord - shift
+ s[0] >>= bitsPerWord - shift
+}
+
+func (p *poly3) fromModQ(in *poly) int {
+ var shift uint
+ s := p.s[:]
+ a := p.a[:]
+ s[0] = 0
+ a[0] = 0
+ ok := 1
+
+ for _, v := range in {
+ vMod3, vOk := modQToMod3(v)
+ ok &= vOk
+
+ s[0] >>= 1
+ s[0] |= uint((vMod3>>1)&1) << (bitsPerWord - 1)
+ a[0] >>= 1
+ a[0] |= uint(vMod3&1) << (bitsPerWord - 1)
+ shift++
+ if shift == bitsPerWord {
+ s = s[1:]
+ a = a[1:]
+ s[0] = 0
+ a[0] = 0
+ shift = 0
+ }
+ }
+
+ a[0] >>= bitsPerWord - shift
+ s[0] >>= bitsPerWord - shift
+
+ return ok
+}
+
+func (p *poly3) fromDiscreteMod3(in *poly) {
+ var shift uint
+ s := p.s[:]
+ a := p.a[:]
+ s[0] = 0
+ a[0] = 0
+
+ for _, v := range in {
+ // This duplicates the 13th bit upwards to the top of the
+ // uint16, essentially treating it as a sign bit and converting
+ // into a signed int16. The signed value is reduced mod 3,
+		// yielding {-2, -1, 0, 1, 2}.
+ v = uint16((int16(v<<3)>>3)%3) & 7
+
+ // We want to map v thus:
+ // {-2, -1, 0, 1, 2} -> {1, 2, 0, 1, 2}. We take the bottom
+ // three bits and then the constants below, when shifted by
+ // those three bits, perform the required mapping.
+ s[0] >>= 1
+ s[0] |= (0xbc >> v) << (bitsPerWord - 1)
+ a[0] >>= 1
+ a[0] |= (0x7a >> v) << (bitsPerWord - 1)
+ shift++
+ if shift == bitsPerWord {
+ s = s[1:]
+ a = a[1:]
+ s[0] = 0
+ a[0] = 0
+ shift = 0
+ }
+ }
+
+ a[0] >>= bitsPerWord - shift
+ s[0] >>= bitsPerWord - shift
+}
+
+func (p *poly3) marshal(out []byte) {
+ s := p.s[:]
+ a := p.a[:]
+ sw := s[0]
+ aw := a[0]
+ var shift int
+
+ for i := 0; i < 700; i += 5 {
+ acc, scale := 0, 1
+ for j := 0; j < 5; j++ {
+ v := int(aw&1) | int(sw&1)<<1
+ acc += scale * v
+ scale *= 3
+
+ shift++
+ if shift == bitsPerWord {
+ s = s[1:]
+ a = a[1:]
+ sw = s[0]
+ aw = a[0]
+ shift = 0
+ } else {
+ sw >>= 1
+ aw >>= 1
+ }
+ }
+
+ out[0] = byte(acc)
+ out = out[1:]
+ }
+}
+
+func (p *poly) fromMod2(in *poly2) {
+ var shift uint
+ words := in[:]
+ word := words[0]
+
+ for i := range p {
+ p[i] = uint16(word & 1)
+ word >>= 1
+ shift++
+ if shift == bitsPerWord {
+ words = words[1:]
+ word = words[0]
+ shift = 0
+ }
+ }
+}
+
+func (p *poly) fromMod3(in *poly3) {
+ var shift uint
+ s := in.s[:]
+ a := in.a[:]
+ sw := s[0]
+ aw := a[0]
+
+ for i := range p {
+ p[i] = uint16(aw&1 | (sw&1)<<1)
+ aw >>= 1
+ sw >>= 1
+ shift++
+ if shift == bitsPerWord {
+ a = a[1:]
+ s = s[1:]
+ aw = a[0]
+ sw = s[0]
+ shift = 0
+ }
+ }
+}
+
+func (p *poly) fromMod3ToModQ(in *poly3) {
+ var shift uint
+ s := in.s[:]
+ a := in.a[:]
+ sw := s[0]
+ aw := a[0]
+
+ for i := range p {
+ p[i] = mod3ToModQ(uint16(aw&1 | (sw&1)<<1))
+ aw >>= 1
+ sw >>= 1
+ shift++
+ if shift == bitsPerWord {
+ a = a[1:]
+ s = s[1:]
+ aw = a[0]
+ sw = s[0]
+ shift = 0
+ }
+ }
+}
+
+func lsbToAll(v uint) uint {
+ return uint(int(v<<(bitsPerWord-1)) >> (bitsPerWord - 1))
+}
+
+func (p *poly3) mulConst(ms, ma uint) {
+ ms = lsbToAll(ms)
+ ma = lsbToAll(ma)
+
+ for i := range p.a {
+ p.s[i], p.a[i] = (ma&p.s[i])^(ms&p.a[i]), (ma&p.a[i])^(ms&p.s[i])
+ }
+}
+
+func cmovWords(out, in *[wordsPerPoly]uint, mov uint) {
+ for i := range out {
+ out[i] = (out[i] & ^mov) | (in[i] & mov)
+ }
+}
+
+func rotWords(out, in *[wordsPerPoly]uint, bits uint) {
+ start := bits / bitsPerWord
+ n := (N - bits) / bitsPerWord
+
+ for i := uint(0); i < n; i++ {
+ out[i] = in[start+i]
+ }
+
+ carry := in[wordsPerPoly-1]
+
+ for i := uint(0); i < start; i++ {
+ out[n+i] = carry | in[i]<<bitsInLastWord
+ carry = in[i] >> (bitsPerWord - bitsInLastWord)
+ }
+
+ out[wordsPerPoly-1] = carry
+}
+
+// rotBits right-rotates the bits in |in|. bits must be a non-zero power of two
+// and at most bitsPerWord/2.
+func rotBits(out, in *[wordsPerPoly]uint, bits uint) {
+	if bits == 0 || (bits & (bits - 1)) != 0 || bits > bitsPerWord/2 || bitsInLastWord < bitsPerWord/2 {
+		panic("internal error")
+ }
+
+ carry := in[wordsPerPoly-1] << (bitsPerWord - bits)
+
+ for i := wordsPerPoly - 2; i >= 0; i-- {
+ out[i] = carry | in[i]>>bits
+ carry = in[i] << (bitsPerWord - bits)
+ }
+
+ out[wordsPerPoly-1] = carry>>(bitsPerWord-bitsInLastWord) | in[wordsPerPoly-1]>>bits
+}
+
+func (p *poly3) rotWords(bits uint, in *poly3) {
+ rotWords(&p.s, &in.s, bits)
+ rotWords(&p.a, &in.a, bits)
+}
+
+func (p *poly3) rotBits(bits uint, in *poly3) {
+ rotBits(&p.s, &in.s, bits)
+ rotBits(&p.a, &in.a, bits)
+}
+
+func (p *poly3) cmov(in *poly3, mov uint) {
+ cmovWords(&p.s, &in.s, mov)
+ cmovWords(&p.a, &in.a, mov)
+}
+
+func (p *poly3) rot(bits uint) {
+ if bits > N {
+ panic("invalid")
+ }
+ var shifted poly3
+
+ shift := uint(9)
+ for ; (1 << shift) >= bitsPerWord; shift-- {
+ shifted.rotWords(1<<shift, p)
+ p.cmov(&shifted, lsbToAll(bits>>shift))
+ }
+ for ; shift < 9; shift-- {
+ shifted.rotBits(1<<shift, p)
+ p.cmov(&shifted, lsbToAll(bits>>shift))
+ }
+}
+
+func (p *poly3) fmadd(ms, ma uint, in *poly3) {
+ ms = lsbToAll(ms)
+ ma = lsbToAll(ma)
+
+ for i := range p.a {
+ products := (ma & in.s[i]) ^ (ms & in.a[i])
+ producta := (ma & in.a[i]) ^ (ms & in.s[i])
+
+ ns1Ana1 := ^p.s[i] & ^p.a[i]
+ ns2Ana2 := ^products & ^producta
+
+ p.s[i], p.a[i] = (p.a[i]&producta)^(ns1Ana1&products)^(p.s[i]&ns2Ana2), (p.s[i]&products)^(ns1Ana1&producta)^(p.a[i]&ns2Ana2)
+ }
+}
+
+func (p *poly3) modPhiN() {
+ factora := uint(int(p.s[wordsPerPoly-1]<<(bitsPerWord-bitsInLastWord)) >> (bitsPerWord - 1))
+ factors := uint(int(p.a[wordsPerPoly-1]<<(bitsPerWord-bitsInLastWord)) >> (bitsPerWord - 1))
+ ns2Ana2 := ^factors & ^factora
+
+ for i := range p.s {
+ ns1Ana1 := ^p.s[i] & ^p.a[i]
+ p.s[i], p.a[i] = (p.a[i]&factora)^(ns1Ana1&factors)^(p.s[i]&ns2Ana2), (p.s[i]&factors)^(ns1Ana1&factora)^(p.a[i]&ns2Ana2)
+ }
+}
+
+func (p *poly3) cswap(other *poly3, swap uint) {
+ for i := range p.s {
+ sums := swap & (p.s[i] ^ other.s[i])
+ p.s[i] ^= sums
+ other.s[i] ^= sums
+
+ suma := swap & (p.a[i] ^ other.a[i])
+ p.a[i] ^= suma
+ other.a[i] ^= suma
+ }
+}
+
+func (p *poly3) mulx() {
+ carrys := (p.s[wordsPerPoly-1] >> (bitsInLastWord - 1)) & 1
+ carrya := (p.a[wordsPerPoly-1] >> (bitsInLastWord - 1)) & 1
+
+ for i := range p.s {
+ outCarrys := p.s[i] >> (bitsPerWord - 1)
+ outCarrya := p.a[i] >> (bitsPerWord - 1)
+ p.s[i] <<= 1
+ p.a[i] <<= 1
+ p.s[i] |= carrys
+ p.a[i] |= carrya
+ carrys = outCarrys
+ carrya = outCarrya
+ }
+}
+
+func (p *poly3) divx() {
+ var carrys, carrya uint
+
+ for i := len(p.s) - 1; i >= 0; i-- {
+ outCarrys := p.s[i] & 1
+ outCarrya := p.a[i] & 1
+ p.s[i] >>= 1
+ p.a[i] >>= 1
+ p.s[i] |= carrys << (bitsPerWord - 1)
+ p.a[i] |= carrya << (bitsPerWord - 1)
+ carrys = outCarrys
+ carrya = outCarrya
+ }
+}
+
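+// poly2 is a polynomial over GF(2): one bit per coefficient, packed
+// least-significant-bit first into words.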
+type poly2 [wordsPerPoly]uint
+
+func (p *poly2) fromDiscrete(in *poly) {
+ var shift uint
+ words := p[:]
+ words[0] = 0
+
+ for _, v := range in {
+ words[0] >>= 1
+ words[0] |= uint(v&1) << (bitsPerWord - 1)
+ shift++
+ if shift == bitsPerWord {
+ words = words[1:]
+ words[0] = 0
+ shift = 0
+ }
+ }
+
+ words[0] >>= bitsPerWord - shift
+}
+
+func (p *poly2) setPhiN() {
+ for i := range p {
+ p[i] = ^uint(0)
+ }
+ p[wordsPerPoly-1] &= (1 << bitsInLastWord) - 1
+}
+
+func (p *poly2) cswap(other *poly2, swap uint) {
+ for i := range p {
+ sum := swap & (p[i] ^ other[i])
+ p[i] ^= sum
+ other[i] ^= sum
+ }
+}
+
+func (p *poly2) fmadd(m uint, in *poly2) {
+ m = ^(m - 1)
+
+ for i := range p {
+ p[i] ^= in[i] & m
+ }
+}
+
+func (p *poly2) lshift1() {
+ var carry uint
+ for i := range p {
+ nextCarry := p[i] >> (bitsPerWord - 1)
+ p[i] <<= 1
+ p[i] |= carry
+ carry = nextCarry
+ }
+}
+
+func (p *poly2) rshift1() {
+ var carry uint
+ for i := len(p) - 1; i >= 0; i-- {
+ nextCarry := p[i] & 1
+ p[i] >>= 1
+ p[i] |= carry << (bitsPerWord - 1)
+ carry = nextCarry
+ }
+}
+
+func (p *poly2) rot(bits uint) {
+ if bits > N {
+ panic("invalid")
+ }
+ var shifted [wordsPerPoly]uint
+ out := (*[wordsPerPoly]uint)(p)
+
+ shift := uint(9)
+ for ; (1 << shift) >= bitsPerWord; shift-- {
+ rotWords(&shifted, out, 1<<shift)
+ cmovWords(out, &shifted, lsbToAll(bits>>shift))
+ }
+ for ; shift < 9; shift-- {
+ rotBits(&shifted, out, 1<<shift)
+ cmovWords(out, &shifted, lsbToAll(bits>>shift))
+ }
+}
+
+type poly [N]uint16
+
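+// marshal packs the first N-1 = 700 coefficients of in, 13 bits each, into
+// out: eight coefficients per 13 bytes, 1138 bytes in total. The final
+// coefficient is omitted; unmarshal reconstructs it by choosing the value
+// that makes the coefficients sum to zero mod Q.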
+func (in *poly) marshal(out []byte) {
+ p := in[:]
+
+ for len(p) >= 8 {
+ out[0] = byte(p[0])
+ out[1] = byte(p[0]>>8) | byte((p[1]&0x07)<<5)
+ out[2] = byte(p[1] >> 3)
+ out[3] = byte(p[1]>>11) | byte((p[2]&0x3f)<<2)
+ out[4] = byte(p[2]>>6) | byte((p[3]&0x01)<<7)
+ out[5] = byte(p[3] >> 1)
+ out[6] = byte(p[3]>>9) | byte((p[4]&0x0f)<<4)
+ out[7] = byte(p[4] >> 4)
+ out[8] = byte(p[4]>>12) | byte((p[5]&0x7f)<<1)
+ out[9] = byte(p[5]>>7) | byte((p[6]&0x03)<<6)
+ out[10] = byte(p[6] >> 2)
+ out[11] = byte(p[6]>>10) | byte((p[7]&0x1f)<<3)
+ out[12] = byte(p[7] >> 5)
+
+ p = p[8:]
+ out = out[13:]
+ }
+
+ // There are four remaining values.
+ out[0] = byte(p[0])
+ out[1] = byte(p[0]>>8) | byte((p[1]&0x07)<<5)
+ out[2] = byte(p[1] >> 3)
+ out[3] = byte(p[1]>>11) | byte((p[2]&0x3f)<<2)
+ out[4] = byte(p[2]>>6) | byte((p[3]&0x01)<<7)
+ out[5] = byte(p[3] >> 1)
+ out[6] = byte(p[3] >> 9)
+}
+
+func (out *poly) unmarshal(in []byte) bool {
+ p := out[:]
+ for i := 0; i < 87; i++ {
+ p[0] = uint16(in[0]) | uint16(in[1]&0x1f)<<8
+ p[1] = uint16(in[1]>>5) | uint16(in[2])<<3 | uint16(in[3]&3)<<11
+ p[2] = uint16(in[3]>>2) | uint16(in[4]&0x7f)<<6
+ p[3] = uint16(in[4]>>7) | uint16(in[5])<<1 | uint16(in[6]&0xf)<<9
+ p[4] = uint16(in[6]>>4) | uint16(in[7])<<4 | uint16(in[8]&1)<<12
+ p[5] = uint16(in[8]>>1) | uint16(in[9]&0x3f)<<7
+ p[6] = uint16(in[9]>>6) | uint16(in[10])<<2 | uint16(in[11]&7)<<10
+ p[7] = uint16(in[11]>>3) | uint16(in[12])<<5
+
+ p = p[8:]
+ in = in[13:]
+ }
+
+ // There are four coefficients left over
+ p[0] = uint16(in[0]) | uint16(in[1]&0x1f)<<8
+ p[1] = uint16(in[1]>>5) | uint16(in[2])<<3 | uint16(in[3]&3)<<11
+ p[2] = uint16(in[3]>>2) | uint16(in[4]&0x7f)<<6
+ p[3] = uint16(in[4]>>7) | uint16(in[5])<<1 | uint16(in[6]&0xf)<<9
+
+ if in[6]&0xf0 != 0 {
+ return false
+ }
+
+ out[N-1] = 0
+ var top int
+ for _, v := range out {
+ top += int(v)
+ }
+
+ out[N-1] = uint16(-top) % Q
+ return true
+}
+
+func (in *poly) marshalS3(out []byte) {
+ p := in[:]
+ for len(p) >= 5 {
+ out[0] = byte(p[0] + p[1]*3 + p[2]*9 + p[3]*27 + p[4]*81)
+ out = out[1:]
+ p = p[5:]
+ }
+}
+
+func (out *poly) unmarshalS3(in []byte) bool {
+ p := out[:]
+ for i := 0; i < 140; i++ {
+ c := in[0]
+ if c >= 243 {
+ return false
+ }
+ p[0] = uint16(c % 3)
+ p[1] = uint16((c / 3) % 3)
+ p[2] = uint16((c / 9) % 3)
+ p[3] = uint16((c / 27) % 3)
+ p[4] = uint16((c / 81) % 3)
+
+ p = p[5:]
+ in = in[1:]
+ }
+
+ out[N-1] = 0
+ return true
+}
+
+func (p *poly) modPhiN() {
+ for i := range p {
+ p[i] = (p[i] + Q - p[N-1]) % Q
+ }
+}
+
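+// shortSample generates a trinary polynomial from 352 bytes of input. Each
+// output coefficient is derived from four input bits: if h1 and h0 are the
+// Hamming weights of the high and low bit pairs, the coefficient is
+// (2*h1 + h0) mod 3 (see the lookup table below). The final coefficient is
+// set to zero.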
+func (out *poly) shortSample(in []byte) {
+ // b a result
+ // 00 00 00
+ // 00 01 01
+ // 00 10 10
+ // 00 11 11
+ // 01 00 10
+ // 01 01 00
+ // 01 10 01
+ // 01 11 11
+ // 10 00 01
+ // 10 01 10
+ // 10 10 00
+ // 10 11 11
+ // 11 00 11
+ // 11 01 11
+ // 11 10 11
+ // 11 11 11
+
+ // 1111 1111 1100 1001 1101 0010 1110 0100
+ // f f c 9 d 2 e 4
+ const lookup = uint32(0xffc9d2e4)
+
+ p := out[:]
+ for i := 0; i < 87; i++ {
+ v := binary.LittleEndian.Uint32(in)
+ v2 := (v & 0x55555555) + ((v >> 1) & 0x55555555)
+ for j := 0; j < 8; j++ {
+ p[j] = uint16(lookup >> ((v2 & 15) << 1) & 3)
+ v2 >>= 4
+ }
+ p = p[8:]
+ in = in[4:]
+ }
+
+ // There are four values remaining.
+ v := binary.LittleEndian.Uint32(in)
+ v2 := (v & 0x55555555) + ((v >> 1) & 0x55555555)
+ for j := 0; j < 4; j++ {
+ p[j] = uint16(lookup >> ((v2 & 15) << 1) & 3)
+ v2 >>= 4
+ }
+
+ out[N-1] = 0
+}
+
+func (out *poly) shortSamplePlus(in []byte) {
+ out.shortSample(in)
+
+ var sum uint16
+ for i := 0; i < N-1; i++ {
+ sum += mod3ResultToModQ(out[i] * out[i+1])
+ }
+
+ scale := 1 + (1 & (sum >> 12))
+ for i := 0; i < len(out); i += 2 {
+ out[i] = (out[i] * scale) % 3
+ }
+}
+
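+// mul sets out, which must hold 2*len(a) elements, to a*b using Karatsuba
+// multiplication, falling back to schoolbook multiplication below 32
+// elements. Writing a = aLow + aHigh*x^lowLen (and likewise for b), the
+// product is assembled as aLow*bLow + ((aLow+aHigh)*(bLow+bHigh) - aLow*bLow -
+// aHigh*bHigh)*x^lowLen + aHigh*bHigh*x^(2*lowLen). Coefficients wrap mod
+// 2^16; the caller reduces mod Q afterwards. a and b must have equal length,
+// and scratch provides temporary space for the recursion.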
+func mul(out, scratch, a, b []uint16) {
+ const schoolbookLimit = 32
+ if len(a) < schoolbookLimit {
+ for i := 0; i < len(a)*2; i++ {
+ out[i] = 0
+ }
+ for i := range a {
+ for j := range b {
+ out[i+j] += a[i] * b[j]
+ }
+ }
+ return
+ }
+
+ lowLen := len(a) / 2
+ highLen := len(a) - lowLen
+ aLow, aHigh := a[:lowLen], a[lowLen:]
+ bLow, bHigh := b[:lowLen], b[lowLen:]
+
+ for i := 0; i < lowLen; i++ {
+ out[i] = aHigh[i] + aLow[i]
+ }
+ if highLen != lowLen {
+ out[lowLen] = aHigh[lowLen]
+ }
+
+ for i := 0; i < lowLen; i++ {
+ out[highLen+i] = bHigh[i] + bLow[i]
+ }
+ if highLen != lowLen {
+ out[highLen+lowLen] = bHigh[lowLen]
+ }
+
+ mul(scratch, scratch[2*highLen:], out[:highLen], out[highLen:highLen*2])
+ mul(out[lowLen*2:], scratch[2*highLen:], aHigh, bHigh)
+ mul(out, scratch[2*highLen:], aLow, bLow)
+
+ for i := 0; i < lowLen*2; i++ {
+ scratch[i] -= out[i] + out[lowLen*2+i]
+ }
+ if lowLen != highLen {
+ scratch[lowLen*2] -= out[lowLen*4]
+ }
+
+ for i := 0; i < 2*highLen; i++ {
+ out[lowLen+i] += scratch[i]
+ }
+}
+
+func (out *poly) mul(a, b *poly) {
+ var prod, scratch [2 * N]uint16
+ mul(prod[:], scratch[:], a[:], b[:])
+ for i := range out {
+ out[i] = (prod[i] + prod[i+N]) % Q
+ }
+}
+
+func (p3 *poly3) mulMod3(x, y *poly3) {
+	// (𝑥^n - 1) is a multiple of Φ(N) so we can work mod (𝑥^n - 1) here and
+	// reduce mod Φ(N) afterwards.
+ x3 := *x
+ y3 := *y
+ s := x3.s[:]
+ a := x3.a[:]
+ sw := s[0]
+ aw := a[0]
+ p3.zero()
+ var shift uint
+ for i := 0; i < N; i++ {
+ p3.fmadd(sw, aw, &y3)
+ sw >>= 1
+ aw >>= 1
+ shift++
+ if shift == bitsPerWord {
+ s = s[1:]
+ a = a[1:]
+ sw = s[0]
+ aw = a[0]
+ shift = 0
+ }
+ y3.mulx()
+ }
+ p3.modPhiN()
+}
+
+// mod3ToModQ maps {0, 1, 2, 3} to {0, 1, Q-1, 0xffff}.
+// The case of n == 3 should never happen but is included so that modQToMod3
+// can easily catch invalid inputs.
+func mod3ToModQ(n uint16) uint16 {
+ return uint16(uint64(0xffff1fff00010000) >> (16 * n))
+}
+
+// modQToMod3 maps {0, 1, Q-1} to {(0, 0), (0, 1), (1, 0)} and also returns an int
+// which is one if the input is in range and zero otherwise.
+func modQToMod3(n uint16) (uint16, int) {
+ result := (n&3 - (n>>1)&1)
+ return result, subtle.ConstantTimeEq(int32(mod3ToModQ(result)), int32(n))
+}
+
+// mod3ResultToModQ maps {0, 1, 2, 4} to {0, 1, Q-1, 1}
+func mod3ResultToModQ(n uint16) uint16 {
+ return ((((uint16(0x13) >> n) & 1) - 1) & 0x1fff) | ((uint16(0x12) >> n) & 1)
+ //shift := (uint(0x324) >> (2 * n)) & 3
+ //return uint16(uint64(0x00011fff00010000) >> (16 * shift))
+}
+
+// mulXMinus1 sets out to a×(𝑥 - 1) mod (𝑥^n - 1)
+func (out *poly) mulXMinus1() {
+ // Multiplying by (𝑥 - 1) means negating each coefficient and adding in
+ // the value of the previous one.
+ origOut700 := out[700]
+
+ for i := N - 1; i > 0; i-- {
+ out[i] = (Q - out[i] + out[i-1]) % Q
+ }
+ out[0] = (Q - out[0] + origOut700) % Q
+}
+
+func (out *poly) lift(a *poly) {
+ // We wish to calculate a/(𝑥-1) mod Φ(N) over GF(3), where Φ(N) is the
+ // Nth cyclotomic polynomial, i.e. 1 + 𝑥 + … + 𝑥^700 (since N is prime).
+
+ // 1/(𝑥-1) has a fairly basic structure that we can exploit to speed this up:
+ //
+ // R.<x> = PolynomialRing(GF(3)…)
+ // inv = R.cyclotomic_polynomial(1).inverse_mod(R.cyclotomic_polynomial(n))
+ // list(inv)[:15]
+ // [1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2]
+ //
+ // This three-element pattern of coefficients repeats for the whole
+ // polynomial.
+ //
+ // Next define the overbar operator such that z̅ = z[0] +
+ // reverse(z[1:]). (Index zero of a polynomial here is the coefficient
+ // of the constant term. So index one is the coefficient of 𝑥 and so
+ // on.)
+ //
+ // A less odd way to define this is to see that z̅ negates the indexes,
+ // so z̅[0] = z[-0], z̅[1] = z[-1] and so on.
+ //
+ // The use of z̅ is that, when working mod (𝑥^701 - 1), vz[0] = <v,
+ // z̅>, vz[1] = <v, 𝑥z̅>, …. (Where <a, b> is the inner product: the sum
+ // of the point-wise products.) Although we calculated the inverse mod
+ // Φ(N), we can work mod (𝑥^N - 1) and reduce mod Φ(N) at the end.
+ // (That's because (𝑥^N - 1) is a multiple of Φ(N).)
+ //
+ // When working mod (𝑥^N - 1), multiplication by 𝑥 is a right-rotation
+ // of the list of coefficients.
+ //
+ // Thus we can consider what the pattern of z̅, 𝑥z̅, 𝑥^2z̅, … looks like:
+ //
+ // def reverse(xs):
+ // suffix = list(xs[1:])
+ // suffix.reverse()
+ // return [xs[0]] + suffix
+ //
+ // def rotate(xs):
+ // return [xs[-1]] + xs[:-1]
+ //
+ // zoverbar = reverse(list(inv) + [0])
+ // xzoverbar = rotate(reverse(list(inv) + [0]))
+ // x2zoverbar = rotate(rotate(reverse(list(inv) + [0])))
+ //
+ // zoverbar[:15]
+ // [1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1]
+ // xzoverbar[:15]
+ // [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
+ // x2zoverbar[:15]
+ // [2, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]
+ //
+ // (For a formula for z̅, see lemma two of appendix B.)
+ //
+ // After the first three elements have been taken care of, all then have
+ // a repeating three-element cycle. The next value (𝑥^3z̅) involves
+ // three rotations of the first pattern, thus the three-element cycle
+ // lines up. However, the discontinuity in the first three elements
+ // obviously moves to a different position. Consider the difference
+ // between 𝑥^3z̅ and z̅:
+ //
+ // [x-y for (x,y) in zip(zoverbar, x3zoverbar)][:15]
+ // [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ //
+ // This pattern of differences is the same for all elements, although it
+ // obviously moves right with the rotations.
+ //
+ // From this, we reach algorithm eight of appendix B.
+
+ // Handle the first three elements of the inner products.
+ out[0] = a[0] + a[2]
+ out[1] = a[1]
+ out[2] = 2*a[0] + a[2]
+
+ // Use the repeating pattern to complete the first three inner products.
+ for i := 3; i < 699; i += 3 {
+ out[0] += 2*a[i] + a[i+2]
+ out[1] += a[i] + 2*a[i+1]
+ out[2] += a[i+1] + 2*a[i+2]
+ }
+
+ // Handle the fact that the three-element pattern doesn't fill the
+ // polynomial exactly (since 701 isn't a multiple of three).
+ out[2] += a[700]
+ out[0] += 2 * a[699]
+ out[1] += a[699] + 2*a[700]
+
+ out[0] = out[0] % 3
+ out[1] = out[1] % 3
+ out[2] = out[2] % 3
+
+ // Calculate the remaining inner products by taking advantage of the
+ // fact that the pattern repeats every three cycles and the pattern of
+	// differences moves with the rotation.
+ for i := 3; i < N; i++ {
+		// Adding twice something is the same as subtracting when working
+ // mod 3. Doing it this way avoids underflow. Underflow is bad
+ // because "% 3" doesn't work correctly for negative numbers
+ // here since underflow will wrap to 2^16-1 and 2^16 isn't a
+ // multiple of three.
+ out[i] = (out[i-3] + 2*(a[i-2]+a[i-1]+a[i])) % 3
+ }
+
+ // Reduce mod Φ(N) by subtracting a multiple of out[700] from every
+ // element and convert to mod Q. (See above about adding twice as
+ // subtraction.)
+ v := out[700] * 2
+ for i := range out {
+ out[i] = mod3ToModQ((out[i] + v) % 3)
+ }
+
+ out.mulXMinus1()
+}
+
+func (a *poly) cswap(b *poly, swap uint16) {
+ for i := range a {
+ sum := swap & (a[i] ^ b[i])
+ a[i] ^= sum
+ b[i] ^= sum
+ }
+}
+
+func lt(a, b uint) uint {
+ if a < b {
+ return ^uint(0)
+ }
+ return 0
+}
+
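+// bsMul multiplies bitsliced trits. A trit is encoded as the bit pair (s, a)
+// with 0 = (0, 0), 1 = (0, 1) and -1 = (1, 0); each bit position of the word
+// arguments holds an independent trit, so a whole word of trits is multiplied
+// at once.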
+func bsMul(s1, a1, s2, a2 uint) (s3, a3 uint) {
+ s3 = (a1 & s2) ^ (s1 & a2)
+ a3 = (a1 & a2) ^ (s1 & s2)
+ return
+}
+
+func (out *poly3) invertMod3(in *poly3) {
+ // This algorithm follows algorithm 10 in the paper. (Although note that
+ // the paper appears to have a bug: k should start at zero, not one.)
+ // The best explanation for why it works is in the "Why it works"
+ // section of
+ // https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf.
+ var k uint
+ degF, degG := uint(N-1), uint(N-1)
+
+ var b, c, g poly3
+ f := *in
+
+ for i := range g.a {
+ g.a[i] = ^uint(0)
+ }
+
+ b.a[0] = 1
+
+ var f0s, f0a uint
+ stillGoing := ^uint(0)
+ for i := 0; i < 2*(N-1)-1; i++ {
+ ss, sa := bsMul(f.s[0], f.a[0], g.s[0], g.a[0])
+ ss, sa = sa&stillGoing&1, ss&stillGoing&1
+ shouldSwap := ^uint(int((ss|sa)-1)>>(bitsPerWord-1)) & lt(degF, degG)
+ f.cswap(&g, shouldSwap)
+ b.cswap(&c, shouldSwap)
+ degF, degG = (degG&shouldSwap)|(degF & ^shouldSwap), (degF&shouldSwap)|(degG&^shouldSwap)
+ f.fmadd(ss, sa, &g)
+ b.fmadd(ss, sa, &c)
+
+ f.divx()
+ f.s[wordsPerPoly-1] &= ((1 << bitsInLastWord) - 1) >> 1
+ f.a[wordsPerPoly-1] &= ((1 << bitsInLastWord) - 1) >> 1
+ c.mulx()
+ c.s[0] &= ^uint(1)
+ c.a[0] &= ^uint(1)
+
+ degF--
+ k += 1 & stillGoing
+ f0s = (stillGoing & f.s[0]) | (^stillGoing & f0s)
+ f0a = (stillGoing & f.a[0]) | (^stillGoing & f0a)
+ stillGoing = ^uint(int(degF-1) >> (bitsPerWord - 1))
+ }
+
+ k -= N & lt(N, k)
+ *out = b
+ out.rot(k)
+ out.mulConst(f0s, f0a)
+ out.modPhiN()
+}
+
+func (out *poly) invertMod2(a *poly) {
+	// This algorithm follows a mix of algorithm 10 in the paper and the first
+ // page of the PDF linked below. (Although note that the paper appears
+ // to have a bug: k should start at zero, not one.) The best explanation
+ // for why it works is in the "Why it works" section of
+ // https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf.
+ var k uint
+ degF, degG := uint(N-1), uint(N-1)
+
+ var f poly2
+ f.fromDiscrete(a)
+ var b, c, g poly2
+ g.setPhiN()
+ b[0] = 1
+
+ stillGoing := ^uint(0)
+ for i := 0; i < 2*(N-1)-1; i++ {
+ s := uint(f[0]&1) & stillGoing
+ shouldSwap := ^(s - 1) & lt(degF, degG)
+ f.cswap(&g, shouldSwap)
+ b.cswap(&c, shouldSwap)
+ degF, degG = (degG&shouldSwap)|(degF & ^shouldSwap), (degF&shouldSwap)|(degG&^shouldSwap)
+ f.fmadd(s, &g)
+ b.fmadd(s, &c)
+
+ f.rshift1()
+ c.lshift1()
+
+ degF--
+ k += 1 & stillGoing
+ stillGoing = ^uint(int(degF-1) >> (bitsPerWord - 1))
+ }
+
+ k -= N & lt(N, k)
+ b.rot(k)
+ out.fromMod2(&b)
+}
+
+func (out *poly) invert(origA *poly) {
+ // Inversion mod Q, which is done based on the result of inverting mod
+ // 2. See the NTRU paper, page three.
+ var a, tmp, tmp2, b poly
+ b.invertMod2(origA)
+
+ // Negate a.
+ for i := range a {
+ a[i] = Q - origA[i]
+ }
+
+ // We are working mod Q=2**13 and we need to iterate ceil(log_2(13))
+ // times, which is four.
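+	// Each iteration computes b = b*(2 - origA*b) (a is -origA, so tmp ends
+	// up as 2 - origA*b), a Newton step that doubles the number of correct
+	// low-order bits of the inverse: 1, 2, 4, 8, 16 >= 13.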
+ for i := 0; i < 4; i++ {
+ tmp.mul(&a, &b)
+ tmp[0] += 2
+ tmp2.mul(&b, &tmp)
+ b = tmp2
+ }
+
+ *out = b
+}
+
+type PublicKey struct {
+ h poly
+}
+
+func ParsePublicKey(in []byte) (*PublicKey, bool) {
+ ret := new(PublicKey)
+ if !ret.h.unmarshal(in) {
+ return nil, false
+ }
+ return ret, true
+}
+
+func (pub *PublicKey) Marshal() []byte {
+ ret := make([]byte, modQBytes)
+ pub.h.marshal(ret)
+ return ret
+}
+
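+// Encap generates a fresh shared key and an encapsulation of it under pub. It
+// samples short polynomials m and r from rand, computes the ciphertext
+// e = r*h + lift(m) (see owf) and derives the shared key as
+// SHA-256("shared key\x00" || m || r || ciphertext), with m and r in their
+// serialised forms.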
+func (pub *PublicKey) Encap(rand io.Reader) (ciphertext []byte, sharedKey []byte) {
+ var randBytes [352 + 352]byte
+ if _, err := io.ReadFull(rand, randBytes[:]); err != nil {
+ panic("rand failed")
+ }
+
+ var m, r poly
+ m.shortSample(randBytes[:352])
+ r.shortSample(randBytes[352:])
+
+ var mBytes, rBytes [mod3Bytes]byte
+ m.marshalS3(mBytes[:])
+ r.marshalS3(rBytes[:])
+
+ ciphertext = pub.owf(&m, &r)
+
+ h := sha256.New()
+ h.Write([]byte("shared key\x00"))
+ h.Write(mBytes[:])
+ h.Write(rBytes[:])
+ h.Write(ciphertext)
+ sharedKey = h.Sum(nil)
+
+ return ciphertext, sharedKey
+}
+
+func (pub *PublicKey) owf(m, r *poly) []byte {
+ for i := range r {
+ r[i] = mod3ToModQ(r[i])
+ }
+
+ var mq poly
+ mq.lift(m)
+
+ var e poly
+ e.mul(r, &pub.h)
+ for i := range e {
+ e[i] = (e[i] + mq[i]) % Q
+ }
+
+ ret := make([]byte, modQBytes)
+ e.marshal(ret[:])
+ return ret
+}
+
+type PrivateKey struct {
+ PublicKey
+ f, fp poly3
+ hInv poly
+ hmacKey [32]byte
+}
+
+func (priv *PrivateKey) Marshal() []byte {
+ var ret [2*mod3Bytes + modQBytes]byte
+ priv.f.marshal(ret[:])
+ priv.fp.marshal(ret[mod3Bytes:])
+ priv.h.marshal(ret[2*mod3Bytes:])
+ return ret[:]
+}
+
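+// Decap recovers the shared key encapsulated in ciphertext. It decrypts with
+// f and fp to recover m, recomputes r = (e - lift(m))*hInv, re-encrypts and
+// compares the result with ciphertext in constant time. On mismatch the
+// returned key is silently replaced with HMAC-SHA256(hmacKey, ciphertext)
+// (implicit rejection); the boolean result is false only for malformed input.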
+func (priv *PrivateKey) Decap(ciphertext []byte) (sharedKey []byte, ok bool) {
+ if len(ciphertext) != modQBytes {
+ return nil, false
+ }
+
+ var e poly
+ if !e.unmarshal(ciphertext) {
+ return nil, false
+ }
+
+ var f poly
+ f.fromMod3ToModQ(&priv.f)
+
+ var v1, m poly
+ v1.mul(&e, &f)
+
+ var v13 poly3
+ v13.fromDiscreteMod3(&v1)
+ // Note: v13 is not reduced mod phi(n).
+
+ var m3 poly3
+ m3.mulMod3(&v13, &priv.fp)
+ m3.modPhiN()
+ m.fromMod3(&m3)
+
+ var mLift, delta poly
+ mLift.lift(&m)
+ for i := range delta {
+ delta[i] = (e[i] - mLift[i] + Q) % Q
+ }
+ delta.mul(&delta, &priv.hInv)
+ delta.modPhiN()
+
+ var r poly3
+ allOk := r.fromModQ(&delta)
+
+ var mBytes, rBytes [mod3Bytes]byte
+ m.marshalS3(mBytes[:])
+ r.marshal(rBytes[:])
+
+ var rPoly poly
+ rPoly.fromMod3(&r)
+ expectedCiphertext := priv.PublicKey.owf(&m, &rPoly)
+
+ allOk &= subtle.ConstantTimeCompare(ciphertext, expectedCiphertext)
+
+ hmacHash := hmac.New(sha256.New, priv.hmacKey[:])
+ hmacHash.Write(ciphertext)
+ hmacDigest := hmacHash.Sum(nil)
+
+ h := sha256.New()
+ h.Write([]byte("shared key\x00"))
+ h.Write(mBytes[:])
+ h.Write(rBytes[:])
+ h.Write(ciphertext)
+ sharedKey = h.Sum(nil)
+
+ mask := uint8(allOk - 1)
+ for i := range sharedKey {
+ sharedKey[i] = (sharedKey[i] & ^mask) | (hmacDigest[i] & mask)
+ }
+
+ return sharedKey, true
+}
+
+func GenerateKey(rand io.Reader) PrivateKey {
+ var randBytes [352 + 352]byte
+ if _, err := io.ReadFull(rand, randBytes[:]); err != nil {
+ panic("rand failed")
+ }
+
+ var f poly
+ f.shortSamplePlus(randBytes[:352])
+ var priv PrivateKey
+ priv.f.fromDiscrete(&f)
+ priv.fp.invertMod3(&priv.f)
+
+ var g poly
+ g.shortSamplePlus(randBytes[352:])
+
+ var pgPhi1 poly
+ for i := range g {
+ pgPhi1[i] = mod3ToModQ(g[i])
+ }
+ for i := range pgPhi1 {
+ pgPhi1[i] = (pgPhi1[i] * 3) % Q
+ }
+ pgPhi1.mulXMinus1()
+
+ var fModQ poly
+ fModQ.fromMod3ToModQ(&priv.f)
+
+ var pfgPhi1 poly
+ pfgPhi1.mul(&fModQ, &pgPhi1)
+
+ var i poly
+ i.invert(&pfgPhi1)
+
+ priv.h.mul(&i, &pgPhi1)
+ priv.h.mul(&priv.h, &pgPhi1)
+
+ priv.hInv.mul(&i, &fModQ)
+ priv.hInv.mul(&priv.hInv, &fModQ)
+
+ return priv
+}
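+
+// A minimal round-trip sketch of the KEM above (illustrative only; it assumes
+// crypto/rand is imported as rand):
+//
+//	priv := GenerateKey(rand.Reader)
+//	pub, ok := ParsePublicKey(priv.PublicKey.Marshal())
+//	ciphertext, sharedKeyA := pub.Encap(rand.Reader)
+//	sharedKeyB, _ := priv.Decap(ciphertext)
+//	// ok is true and sharedKeyA equals sharedKeyB.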
diff --git a/src/ssl/test/runner/key_agreement.go b/src/ssl/test/runner/key_agreement.go
index 791325cd..f40552d9 100644
--- a/src/ssl/test/runner/key_agreement.go
+++ b/src/ssl/test/runner/key_agreement.go
@@ -17,6 +17,7 @@ import (
"boringssl.googlesource.com/boringssl/ssl/test/runner/curve25519"
"boringssl.googlesource.com/boringssl/ssl/test/runner/ed25519"
+ "boringssl.googlesource.com/boringssl/ssl/test/runner/hrss"
)
type keyType int
@@ -37,7 +38,7 @@ type rsaKeyAgreement struct {
exportKey *rsa.PrivateKey
}
-func (ka *rsaKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg) (*serverKeyExchangeMsg, error) {
+func (ka *rsaKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg, version uint16) (*serverKeyExchangeMsg, error) {
// Save the client version for comparison later.
ka.clientVersion = clientHello.vers
@@ -347,6 +348,90 @@ func (e *x25519ECDHCurve) finish(peerKey []byte) (preMasterSecret []byte, err er
return out[:], nil
}
+// cecpq2Curve implements CECPQ2, which is HRSS+SXY combined with X25519.
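+// The client's offer is its X25519 public value followed by the HRSS public
+// key; the server's reply is its X25519 public value followed by the HRSS
+// ciphertext; the premaster secret is the X25519 shared secret concatenated
+// with the HRSS shared key.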
+type cecpq2Curve struct {
+ x25519PrivateKey [32]byte
+ hrssPrivateKey hrss.PrivateKey
+}
+
+func (e *cecpq2Curve) offer(rand io.Reader) (publicKey []byte, err error) {
+ if _, err := io.ReadFull(rand, e.x25519PrivateKey[:]); err != nil {
+ return nil, err
+ }
+
+ var x25519Public [32]byte
+ curve25519.ScalarBaseMult(&x25519Public, &e.x25519PrivateKey)
+
+ e.hrssPrivateKey = hrss.GenerateKey(rand)
+ hrssPublic := e.hrssPrivateKey.PublicKey.Marshal()
+
+ var ret []byte
+ ret = append(ret, x25519Public[:]...)
+ ret = append(ret, hrssPublic...)
+ return ret, nil
+}
+
+func (e *cecpq2Curve) accept(rand io.Reader, peerKey []byte) (publicKey []byte, preMasterSecret []byte, err error) {
+ if len(peerKey) != 32+hrss.PublicKeySize {
+ return nil, nil, errors.New("tls: bad length CECPQ2 offer")
+ }
+
+ if _, err := io.ReadFull(rand, e.x25519PrivateKey[:]); err != nil {
+ return nil, nil, err
+ }
+
+ var x25519Shared, x25519PeerKey, x25519Public [32]byte
+ copy(x25519PeerKey[:], peerKey)
+ curve25519.ScalarBaseMult(&x25519Public, &e.x25519PrivateKey)
+ curve25519.ScalarMult(&x25519Shared, &e.x25519PrivateKey, &x25519PeerKey)
+
+ // Per RFC 7748, reject the all-zero value in constant time.
+ var zeros [32]byte
+ if subtle.ConstantTimeCompare(zeros[:], x25519Shared[:]) == 1 {
+ return nil, nil, errors.New("tls: X25519 value with wrong order")
+ }
+
+ hrssPublicKey, ok := hrss.ParsePublicKey(peerKey[32:])
+ if !ok {
+ return nil, nil, errors.New("tls: bad CECPQ2 offer")
+ }
+
+ hrssCiphertext, hrssShared := hrssPublicKey.Encap(rand)
+
+ publicKey = append(publicKey, x25519Public[:]...)
+ publicKey = append(publicKey, hrssCiphertext...)
+ preMasterSecret = append(preMasterSecret, x25519Shared[:]...)
+ preMasterSecret = append(preMasterSecret, hrssShared...)
+
+ return publicKey, preMasterSecret, nil
+}
+
+func (e *cecpq2Curve) finish(peerKey []byte) (preMasterSecret []byte, err error) {
+ if len(peerKey) != 32+hrss.CiphertextSize {
+ return nil, errors.New("tls: bad length CECPQ2 reply")
+ }
+
+ var x25519Shared, x25519PeerKey [32]byte
+ copy(x25519PeerKey[:], peerKey)
+ curve25519.ScalarMult(&x25519Shared, &e.x25519PrivateKey, &x25519PeerKey)
+
+ // Per RFC 7748, reject the all-zero value in constant time.
+ var zeros [32]byte
+ if subtle.ConstantTimeCompare(zeros[:], x25519Shared[:]) == 1 {
+ return nil, errors.New("tls: X25519 value with wrong order")
+ }
+
+ hrssShared, ok := e.hrssPrivateKey.Decap(peerKey[32:])
+ if !ok {
+ return nil, errors.New("tls: invalid HRSS ciphertext")
+ }
+
+ preMasterSecret = append(preMasterSecret, x25519Shared[:]...)
+ preMasterSecret = append(preMasterSecret, hrssShared...)
+
+ return preMasterSecret, nil
+}
+
func curveForCurveID(id CurveID, config *Config) (ecdhCurve, bool) {
switch id {
case CurveP224:
@@ -359,6 +444,8 @@ func curveForCurveID(id CurveID, config *Config) (ecdhCurve, bool) {
return &ellipticECDHCurve{curve: elliptic.P521(), sendCompressed: config.Bugs.SendCompressedCoordinates}, true
case CurveX25519:
return &x25519ECDHCurve{setHighBit: config.Bugs.SetX25519HighBit}, true
+ case CurveCECPQ2:
+ return &cecpq2Curve{}, true
default:
return nil, false
}
@@ -501,12 +588,17 @@ type ecdheKeyAgreement struct {
peerKey []byte
}
-func (ka *ecdheKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg) (*serverKeyExchangeMsg, error) {
+func (ka *ecdheKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg, version uint16) (*serverKeyExchangeMsg, error) {
var curveid CurveID
preferredCurves := config.curvePreferences()
NextCandidate:
for _, candidate := range preferredCurves {
+ if candidate == CurveCECPQ2 && version < VersionTLS13 {
+ // CECPQ2 is TLS 1.3-only.
+ continue
+ }
+
for _, c := range clientHello.supportedCurves {
if candidate == c {
curveid = c
@@ -614,7 +706,7 @@ func (ka *ecdheKeyAgreement) peerSignatureAlgorithm() signatureAlgorithm {
// exchange.
type nilKeyAgreement struct{}
-func (ka *nilKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg) (*serverKeyExchangeMsg, error) {
+func (ka *nilKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg, version uint16) (*serverKeyExchangeMsg, error) {
return nil, nil
}
@@ -666,7 +758,7 @@ type pskKeyAgreement struct {
identityHint string
}
-func (ka *pskKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg) (*serverKeyExchangeMsg, error) {
+func (ka *pskKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg, version uint16) (*serverKeyExchangeMsg, error) {
// Assemble the identity hint.
bytes := make([]byte, 2+len(config.PreSharedKeyIdentity))
bytes[0] = byte(len(config.PreSharedKeyIdentity) >> 8)
@@ -675,7 +767,7 @@ func (ka *pskKeyAgreement) generateServerKeyExchange(config *Config, cert *Certi
// If there is one, append the base key agreement's
// ServerKeyExchange.
- baseSkx, err := ka.base.generateServerKeyExchange(config, cert, clientHello, hello)
+ baseSkx, err := ka.base.generateServerKeyExchange(config, cert, clientHello, hello, version)
if err != nil {
return nil, err
}
diff --git a/src/ssl/test/runner/runner.go b/src/ssl/test/runner/runner.go
index fadc890f..b5cc0a79 100644
--- a/src/ssl/test/runner/runner.go
+++ b/src/ssl/test/runner/runner.go
@@ -22,6 +22,7 @@ import (
"crypto/x509"
"crypto/x509/pkix"
"encoding/base64"
+ "encoding/binary"
"encoding/hex"
"encoding/json"
"encoding/pem"
@@ -490,6 +491,9 @@ type testCase struct {
// expectedQUICTransportParams contains the QUIC transport
// parameters that are expected to be sent by the peer.
expectedQUICTransportParams []byte
+ // exportTrafficSecrets, if true, configures the test to export the TLS 1.3
+ // traffic secrets and confirms that they match.
+ exportTrafficSecrets bool
}
var testCases []testCase
@@ -768,6 +772,32 @@ func doExchange(test *testCase, config *Config, conn net.Conn, isResume bool, tr
}
}
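+	// The shim, when passed -export-traffic-secrets, writes a little-endian
+	// uint16 length followed by its read and write traffic secrets. They
+	// should mirror our own write and read secrets, respectively.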
+ if test.exportTrafficSecrets {
+ secretLenBytes := make([]byte, 2)
+ if _, err := io.ReadFull(tlsConn, secretLenBytes); err != nil {
+ return err
+ }
+ secretLen := binary.LittleEndian.Uint16(secretLenBytes)
+
+ theirReadSecret := make([]byte, secretLen)
+ theirWriteSecret := make([]byte, secretLen)
+ if _, err := io.ReadFull(tlsConn, theirReadSecret); err != nil {
+ return err
+ }
+ if _, err := io.ReadFull(tlsConn, theirWriteSecret); err != nil {
+ return err
+ }
+
+ myReadSecret := tlsConn.in.trafficSecret
+ myWriteSecret := tlsConn.out.trafficSecret
+ if !bytes.Equal(myWriteSecret, theirReadSecret) {
+ return fmt.Errorf("read traffic-secret mismatch; got %x, wanted %x", theirReadSecret, myWriteSecret)
+ }
+ if !bytes.Equal(myReadSecret, theirWriteSecret) {
+ return fmt.Errorf("write traffic-secret mismatch; got %x, wanted %x", theirWriteSecret, myReadSecret)
+ }
+ }
+
if test.testTLSUnique {
var peersValue [12]byte
if _, err := io.ReadFull(tlsConn, peersValue[:]); err != nil {
@@ -1123,6 +1153,10 @@ func runTest(test *testCase, shimPath string, mallocNumToFail int64) error {
flags = append(flags, "-export-context", test.exportContext)
}
+ if test.exportTrafficSecrets {
+ flags = append(flags, "-export-traffic-secrets")
+ }
+
if test.expectResumeRejected {
flags = append(flags, "-expect-session-miss")
}
@@ -8862,7 +8896,7 @@ func addSignatureAlgorithmTests() {
// Not all ciphers involve a signature. Advertise a list which gives all
// versions a signing cipher.
signingCiphers := []uint16{
- TLS_AES_128_GCM_SHA256,
+ TLS_AES_256_GCM_SHA384,
TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,
@@ -9306,13 +9340,13 @@ func addSignatureAlgorithmTests() {
expectedError: ":WRONG_SIGNATURE_TYPE:",
})
- // Test that, if the list is missing, the peer falls back to SHA-1 in
- // TLS 1.2, but not TLS 1.3.
+ // Test that, if the ClientHello list is missing, the server falls back
+ // to SHA-1 in TLS 1.2, but not TLS 1.3.
testCases = append(testCases, testCase{
- name: "ClientAuth-SHA1-Fallback-RSA",
+ testType: serverTest,
+ name: "ServerAuth-SHA1-Fallback-RSA",
config: Config{
MaxVersion: VersionTLS12,
- ClientAuth: RequireAnyClientCert,
VerifySignatureAlgorithms: []signatureAlgorithm{
signatureRSAPKCS1WithSHA1,
},
@@ -9328,86 +9362,87 @@ func addSignatureAlgorithmTests() {
testCases = append(testCases, testCase{
testType: serverTest,
- name: "ServerAuth-SHA1-Fallback-RSA",
+ name: "ServerAuth-SHA1-Fallback-ECDSA",
config: Config{
MaxVersion: VersionTLS12,
VerifySignatureAlgorithms: []signatureAlgorithm{
- signatureRSAPKCS1WithSHA1,
+ signatureECDSAWithSHA1,
},
Bugs: ProtocolBugs{
NoSignatureAlgorithms: true,
},
},
flags: []string{
- "-cert-file", path.Join(*resourceDir, rsaCertificateFile),
- "-key-file", path.Join(*resourceDir, rsaKeyFile),
+ "-cert-file", path.Join(*resourceDir, ecdsaP256CertificateFile),
+ "-key-file", path.Join(*resourceDir, ecdsaP256KeyFile),
},
})
testCases = append(testCases, testCase{
- name: "ClientAuth-SHA1-Fallback-ECDSA",
+ testType: serverTest,
+ name: "ServerAuth-NoFallback-TLS13",
config: Config{
- MaxVersion: VersionTLS12,
- ClientAuth: RequireAnyClientCert,
+ MaxVersion: VersionTLS13,
VerifySignatureAlgorithms: []signatureAlgorithm{
- signatureECDSAWithSHA1,
+ signatureRSAPKCS1WithSHA1,
},
Bugs: ProtocolBugs{
NoSignatureAlgorithms: true,
},
},
- flags: []string{
- "-cert-file", path.Join(*resourceDir, ecdsaP256CertificateFile),
- "-key-file", path.Join(*resourceDir, ecdsaP256KeyFile),
- },
+ shouldFail: true,
+ expectedError: ":NO_COMMON_SIGNATURE_ALGORITHMS:",
})
+ // The CertificateRequest list, however, may never be omitted. It is a
+ // syntax error for it to be empty.
testCases = append(testCases, testCase{
- testType: serverTest,
- name: "ServerAuth-SHA1-Fallback-ECDSA",
+ name: "ClientAuth-NoFallback-RSA",
config: Config{
MaxVersion: VersionTLS12,
+ ClientAuth: RequireAnyClientCert,
VerifySignatureAlgorithms: []signatureAlgorithm{
- signatureECDSAWithSHA1,
+ signatureRSAPKCS1WithSHA1,
},
Bugs: ProtocolBugs{
NoSignatureAlgorithms: true,
},
},
flags: []string{
- "-cert-file", path.Join(*resourceDir, ecdsaP256CertificateFile),
- "-key-file", path.Join(*resourceDir, ecdsaP256KeyFile),
+ "-cert-file", path.Join(*resourceDir, rsaCertificateFile),
+ "-key-file", path.Join(*resourceDir, rsaKeyFile),
},
+ shouldFail: true,
+ expectedError: ":DECODE_ERROR:",
+ expectedLocalError: "remote error: error decoding message",
})
testCases = append(testCases, testCase{
- name: "ClientAuth-NoFallback-TLS13",
+ name: "ClientAuth-NoFallback-ECDSA",
config: Config{
- MaxVersion: VersionTLS13,
+ MaxVersion: VersionTLS12,
ClientAuth: RequireAnyClientCert,
VerifySignatureAlgorithms: []signatureAlgorithm{
- signatureRSAPKCS1WithSHA1,
+ signatureECDSAWithSHA1,
},
Bugs: ProtocolBugs{
NoSignatureAlgorithms: true,
},
},
flags: []string{
- "-cert-file", path.Join(*resourceDir, rsaCertificateFile),
- "-key-file", path.Join(*resourceDir, rsaKeyFile),
+ "-cert-file", path.Join(*resourceDir, ecdsaP256CertificateFile),
+ "-key-file", path.Join(*resourceDir, ecdsaP256KeyFile),
},
- shouldFail: true,
- // An empty CertificateRequest signature algorithm list is a
- // syntax error in TLS 1.3.
+ shouldFail: true,
expectedError: ":DECODE_ERROR:",
expectedLocalError: "remote error: error decoding message",
})
testCases = append(testCases, testCase{
- testType: serverTest,
- name: "ServerAuth-NoFallback-TLS13",
+ name: "ClientAuth-NoFallback-TLS13",
config: Config{
MaxVersion: VersionTLS13,
+ ClientAuth: RequireAnyClientCert,
VerifySignatureAlgorithms: []signatureAlgorithm{
signatureRSAPKCS1WithSHA1,
},
@@ -9415,8 +9450,13 @@ func addSignatureAlgorithmTests() {
NoSignatureAlgorithms: true,
},
},
- shouldFail: true,
- expectedError: ":NO_COMMON_SIGNATURE_ALGORITHMS:",
+ flags: []string{
+ "-cert-file", path.Join(*resourceDir, rsaCertificateFile),
+ "-key-file", path.Join(*resourceDir, rsaKeyFile),
+ },
+ shouldFail: true,
+ expectedError: ":DECODE_ERROR:",
+ expectedLocalError: "remote error: error decoding message",
})
// Test that signature preferences are enforced. BoringSSL does not
@@ -9613,7 +9653,7 @@ func addSignatureAlgorithmTests() {
CipherSuites: []uint16{TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256},
Certificates: []Certificate{ecdsaP256Certificate},
},
- flags: []string{"-p384-only"},
+ flags: []string{"-curves", strconv.Itoa(int(CurveP384))},
shouldFail: true,
expectedError: ":BAD_ECC_CERT:",
})
@@ -9625,7 +9665,7 @@ func addSignatureAlgorithmTests() {
MaxVersion: VersionTLS13,
Certificates: []Certificate{ecdsaP256Certificate},
},
- flags: []string{"-p384-only"},
+ flags: []string{"-curves", strconv.Itoa(int(CurveP384))},
})
// In TLS 1.2, the ECDSA curve is not in the signature algorithm.
@@ -10515,6 +10555,24 @@ func addExportKeyingMaterialTests() {
})
}
+func addExportTrafficSecretsTests() {
+ for _, cipherSuite := range []testCipherSuite{
+ // Test a SHA-256 and SHA-384 based cipher suite.
+ {"AEAD-AES128-GCM-SHA256", TLS_AES_128_GCM_SHA256},
+ {"AEAD-AES256-GCM-SHA384", TLS_AES_256_GCM_SHA384},
+ } {
+
+ testCases = append(testCases, testCase{
+ name: "ExportTrafficSecrets-" + cipherSuite.name,
+ config: Config{
+ MinVersion: VersionTLS13,
+ CipherSuites: []uint16{cipherSuite.id},
+ },
+ exportTrafficSecrets: true,
+ })
+ }
+}
+
func addTLSUniqueTests() {
for _, isClient := range []bool{false, true} {
for _, isResumption := range []bool{false, true} {
@@ -10705,6 +10763,7 @@ var testCurves = []struct {
{"P-384", CurveP384},
{"P-521", CurveP521},
{"X25519", CurveX25519},
+ {"CECPQ2", CurveCECPQ2},
}
const bogusCurve = 0x1234
@@ -10712,6 +10771,10 @@ const bogusCurve = 0x1234
func addCurveTests() {
for _, curve := range testCurves {
for _, ver := range tlsVersions {
+ if curve.id == CurveCECPQ2 && ver.version < VersionTLS13 {
+ continue
+ }
+
suffix := curve.name + "-" + ver.name
testCases = append(testCases, testCase{
@@ -10721,7 +10784,7 @@ func addCurveTests() {
CipherSuites: []uint16{
TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,
- TLS_AES_128_GCM_SHA256,
+ TLS_AES_256_GCM_SHA384,
},
CurvePreferences: []CurveID{curve.id},
},
@@ -10740,7 +10803,7 @@ func addCurveTests() {
CipherSuites: []uint16{
TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,
- TLS_AES_128_GCM_SHA256,
+ TLS_AES_256_GCM_SHA384,
},
CurvePreferences: []CurveID{curve.id},
},
@@ -10752,7 +10815,7 @@ func addCurveTests() {
expectedCurveID: curve.id,
})
- if curve.id != CurveX25519 {
+ if curve.id != CurveX25519 && curve.id != CurveCECPQ2 {
testCases = append(testCases, testCase{
name: "CurveTest-Client-Compressed-" + suffix,
config: Config{
@@ -10760,7 +10823,7 @@ func addCurveTests() {
CipherSuites: []uint16{
TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,
- TLS_AES_128_GCM_SHA256,
+ TLS_AES_256_GCM_SHA384,
},
CurvePreferences: []CurveID{curve.id},
Bugs: ProtocolBugs{
@@ -10780,7 +10843,7 @@ func addCurveTests() {
CipherSuites: []uint16{
TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,
- TLS_AES_128_GCM_SHA256,
+ TLS_AES_256_GCM_SHA384,
},
CurvePreferences: []CurveID{curve.id},
Bugs: ProtocolBugs{
@@ -10896,7 +10959,7 @@ func addCurveTests() {
IgnorePeerCurvePreferences: true,
},
},
- flags: []string{"-p384-only"},
+ flags: []string{"-curves", strconv.Itoa(int(CurveP384))},
shouldFail: true,
expectedError: ":WRONG_CURVE:",
})
@@ -10912,7 +10975,7 @@ func addCurveTests() {
SendCurve: CurveP256,
},
},
- flags: []string{"-p384-only"},
+ flags: []string{"-curves", strconv.Itoa(int(CurveP384))},
shouldFail: true,
expectedError: ":WRONG_CURVE:",
})
@@ -11163,6 +11226,112 @@ func addCurveTests() {
},
},
})
+
+ // CECPQ2 should not be offered by a TLS < 1.3 client.
+ testCases = append(testCases, testCase{
+ name: "CECPQ2NotInTLS12",
+ config: Config{
+ Bugs: ProtocolBugs{
+ FailIfCECPQ2Offered: true,
+ },
+ },
+ flags: []string{
+ "-max-version", strconv.Itoa(VersionTLS12),
+ "-curves", strconv.Itoa(int(CurveCECPQ2)),
+ "-curves", strconv.Itoa(int(CurveX25519)),
+ },
+ })
+
+ // CECPQ2 should not crash a TLS < 1.3 client if the server mistakenly
+ // selects it.
+ testCases = append(testCases, testCase{
+ name: "CECPQ2NotAcceptedByTLS12Client",
+ config: Config{
+ Bugs: ProtocolBugs{
+ SendCurve: CurveCECPQ2,
+ },
+ },
+ flags: []string{
+ "-max-version", strconv.Itoa(VersionTLS12),
+ "-curves", strconv.Itoa(int(CurveCECPQ2)),
+ "-curves", strconv.Itoa(int(CurveX25519)),
+ },
+ shouldFail: true,
+ expectedError: ":WRONG_CURVE:",
+ })
+
+ // CECPQ2 should not be offered by default as a client.
+ testCases = append(testCases, testCase{
+ name: "CECPQ2NotEnabledByDefaultInClients",
+ config: Config{
+ MinVersion: VersionTLS13,
+ Bugs: ProtocolBugs{
+ FailIfCECPQ2Offered: true,
+ },
+ },
+ })
+
+ // If CECPQ2 is offered, both X25519 and CECPQ2 should have a key-share.
+ testCases = append(testCases, testCase{
+ name: "NotJustCECPQ2KeyShare",
+ config: Config{
+ MinVersion: VersionTLS13,
+ Bugs: ProtocolBugs{
+ ExpectedKeyShares: []CurveID{CurveCECPQ2, CurveX25519},
+ },
+ },
+ flags: []string{
+ "-curves", strconv.Itoa(int(CurveCECPQ2)),
+ "-curves", strconv.Itoa(int(CurveX25519)),
+ "-expect-curve-id", strconv.Itoa(int(CurveCECPQ2)),
+ },
+ })
+
+ // ... but only if CECPQ2 is listed first.
+ testCases = append(testCases, testCase{
+ name: "CECPQ2KeyShareNotIncludedSecond",
+ config: Config{
+ MinVersion: VersionTLS13,
+ Bugs: ProtocolBugs{
+ ExpectedKeyShares: []CurveID{CurveX25519},
+ },
+ },
+ flags: []string{
+ "-curves", strconv.Itoa(int(CurveX25519)),
+ "-curves", strconv.Itoa(int(CurveCECPQ2)),
+ "-expect-curve-id", strconv.Itoa(int(CurveX25519)),
+ },
+ })
+
+ // If CECPQ2 is the only configured curve, the key share is sent.
+ testCases = append(testCases, testCase{
+ name: "JustConfiguringCECPQ2Works",
+ config: Config{
+ MinVersion: VersionTLS13,
+ Bugs: ProtocolBugs{
+ ExpectedKeyShares: []CurveID{CurveCECPQ2},
+ },
+ },
+ flags: []string{
+ "-curves", strconv.Itoa(int(CurveCECPQ2)),
+ "-expect-curve-id", strconv.Itoa(int(CurveCECPQ2)),
+ },
+ })
+
+ // As a server, CECPQ2 is not yet supported by default.
+ testCases = append(testCases, testCase{
+ testType: serverTest,
+ name: "CECPQ2NotEnabledByDefaultForAServer",
+ config: Config{
+ MinVersion: VersionTLS13,
+ CurvePreferences: []CurveID{CurveCECPQ2, CurveX25519},
+ DefaultCurves: []CurveID{CurveCECPQ2},
+ },
+ flags: []string{
+ "-server-preference",
+ "-expect-curve-id", strconv.Itoa(int(CurveX25519)),
+ },
+ })
}
func addTLS13RecordTests() {
@@ -12700,7 +12869,7 @@ func addTLS13HandshakeTests() {
},
},
tls13Variant: variant,
- flags: []string{"-p384-only"},
+ flags: []string{"-curves", strconv.Itoa(int(CurveP384))},
shouldFail: true,
expectedError: ":WRONG_CURVE:",
})
@@ -13859,6 +14028,60 @@ func addTLS13CipherPreferenceTests() {
"-expect-cipher-no-aes", strconv.Itoa(int(TLS_CHACHA20_POLY1305_SHA256)),
},
})
+
+ // Test that CECPQ2 cannot be used with TLS_AES_128_GCM_SHA256.
+ testCases = append(testCases, testCase{
+ testType: serverTest,
+ name: "TLS13-CipherPreference-CECPQ2-AES128Only",
+ config: Config{
+ MaxVersion: VersionTLS13,
+ CipherSuites: []uint16{
+ TLS_AES_128_GCM_SHA256,
+ },
+ },
+ flags: []string{
+ "-curves", strconv.Itoa(int(CurveCECPQ2)),
+ },
+ shouldFail: true,
+ expectedError: ":NO_SHARED_CIPHER:",
+ expectedLocalError: "remote error: handshake failure",
+ })
+
+ // Test that CECPQ2 continues to honor AES vs ChaCha20 logic.
+ testCases = append(testCases, testCase{
+ testType: serverTest,
+ name: "TLS13-CipherPreference-CECPQ2-AES128-ChaCha20-AES256",
+ config: Config{
+ MaxVersion: VersionTLS13,
+ CipherSuites: []uint16{
+ TLS_AES_128_GCM_SHA256,
+ TLS_CHACHA20_POLY1305_SHA256,
+ TLS_AES_256_GCM_SHA384,
+ },
+ },
+ flags: []string{
+ "-curves", strconv.Itoa(int(CurveCECPQ2)),
+ "-expect-cipher-aes", strconv.Itoa(int(TLS_CHACHA20_POLY1305_SHA256)),
+ "-expect-cipher-no-aes", strconv.Itoa(int(TLS_CHACHA20_POLY1305_SHA256)),
+ },
+ })
+ testCases = append(testCases, testCase{
+ testType: serverTest,
+ name: "TLS13-CipherPreference-CECPQ2-AES128-AES256-ChaCha20",
+ config: Config{
+ MaxVersion: VersionTLS13,
+ CipherSuites: []uint16{
+ TLS_AES_128_GCM_SHA256,
+ TLS_AES_256_GCM_SHA384,
+ TLS_CHACHA20_POLY1305_SHA256,
+ },
+ },
+ flags: []string{
+ "-curves", strconv.Itoa(int(CurveCECPQ2)),
+ "-expect-cipher-aes", strconv.Itoa(int(TLS_AES_256_GCM_SHA384)),
+ "-expect-cipher-no-aes", strconv.Itoa(int(TLS_CHACHA20_POLY1305_SHA256)),
+ },
+ })
}
func addPeekTests() {
@@ -14680,7 +14903,7 @@ func addJDK11WorkaroundTests() {
},
{
// The above with a padding extension added at the end.
- decodeHexOrPanic("010001b4030336a379aa355a22a064b4402760efae1c73977b0b4c975efc7654c35677723dde201fe3f8a2bca60418a68f72463ea19f3c241e7cbfceb347e451a62bd2417d8981005a13011302c02cc02bc030009dc02ec032009f00a3c02f009cc02dc031009e00a2c024c028003dc026c02a006b006ac00ac0140035c005c00f00390038c023c027003cc025c02900670040c009c013002fc004c00e0033003200ff01000111000000080006000003736e69000500050100000000000a0020001e0017001800190009000a000b000c000d000e001601000101010201030104000b00020100000d002800260403050306030804080508060809080a080b04010501060104020303030103020203020102020032002800260403050306030804080508060809080a080b04010501060104020303030103020203020102020011000900070200040000000000170000002b0009080304030303020301002d000201010033004700450017004104721f007464cb08a0f36e093ad178eb78d6968df20077b2dd882694a85dc4c9884caf5092db41f16cc3f8d41f59426992fa5e32cfb9ad08deee752cdd95b1a6b50015000770616464696e67"),
+ decodeHexOrPanic("010001b4030336a379aa355a22a064b4402760efae1c73977b0b4c975efc7654c35677723dde201fe3f8a2bca60418a68f72463ea19f3c241e7cbfceb347e451a62bd2417d8981005a13011302c02cc02bc030009dc02ec032009f00a3c02f009cc02dc031009e00a2c024c028003dc026c02a006b006ac00ac0140035c005c00f00390038c023c027003cc025c02900670040c009c013002fc004c00e0033003200ff01000111000000080006000003736e69000500050100000000000a0020001e0017001800190009000a000b000c000d000e001601000101010201030104000b00020100000d002800260403050306030804080508060809080a080b04010501060104020303030103020203020102020032002800260403050306030804080508060809080a080b04010501060104020303030103020203020102020011000900070200040000000000170000002b0009080304030303020301002d000201010033004700450017004104721f007464cb08a0f36e093ad178eb78d6968df20077b2dd882694a85dc4c9884caf5092db41f16cc3f8d41f59426992fa5e32cfb9ad08deee752cdd95b1a6b50015000700000000000000"),
false,
},
{
@@ -14905,6 +15128,7 @@ func main() {
addSignatureAlgorithmTests()
addDTLSRetransmitTests()
addExportKeyingMaterialTests()
+ addExportTrafficSecretsTests()
addTLSUniqueTests()
addCustomExtensionTests()
addRSAClientKeyExchangeTests()
diff --git a/src/ssl/test/test_config.cc b/src/ssl/test/test_config.cc
index 7447d5ad..bed05010 100644
--- a/src/ssl/test/test_config.cc
+++ b/src/ssl/test/test_config.cc
@@ -104,7 +104,6 @@ const Flag<bool> kBoolFlags[] = {
{ "-renegotiate-ignore", &TestConfig::renegotiate_ignore },
{ "-forbid-renegotiation-after-handshake",
&TestConfig::forbid_renegotiation_after_handshake },
- { "-p384-only", &TestConfig::p384_only },
{ "-enable-all-curves", &TestConfig::enable_all_curves },
{ "-use-old-client-cert-callback",
&TestConfig::use_old_client_cert_callback },
@@ -147,6 +146,8 @@ const Flag<bool> kBoolFlags[] = {
{ "-handshaker-resume", &TestConfig::handshaker_resume },
{ "-reverify-on-resume", &TestConfig::reverify_on_resume },
{ "-jdk11-workaround", &TestConfig::jdk11_workaround },
+ { "-server-preference", &TestConfig::server_preference },
+ { "-export-traffic-secrets", &TestConfig::export_traffic_secrets },
};
const Flag<std::string> kStringFlags[] = {
@@ -220,10 +221,10 @@ const Flag<int> kIntFlags[] = {
};
const Flag<std::vector<int>> kIntVectorFlags[] = {
- { "-signing-prefs", &TestConfig::signing_prefs },
- { "-verify-prefs", &TestConfig::verify_prefs },
- { "-expect-peer-verify-pref",
- &TestConfig::expected_peer_verify_prefs },
+ {"-signing-prefs", &TestConfig::signing_prefs},
+ {"-verify-prefs", &TestConfig::verify_prefs},
+ {"-expect-peer-verify-pref", &TestConfig::expected_peer_verify_prefs},
+ {"-curves", &TestConfig::curves},
};
bool ParseFlag(char *flag, int argc, char **argv, int *i,
@@ -1294,7 +1295,6 @@ bssl::UniquePtr<SSL_CTX> TestConfig::SetupCtx(SSL_CTX *old_ctx) const {
return nullptr;
}
-
if (install_cert_compression_algs &&
(!SSL_CTX_add_cert_compression_alg(
ssl_ctx.get(), 0xff02,
@@ -1341,6 +1341,10 @@ bssl::UniquePtr<SSL_CTX> TestConfig::SetupCtx(SSL_CTX *old_ctx) const {
abort();
}
+ if (server_preference) {
+ SSL_CTX_set_options(ssl_ctx.get(), SSL_OP_CIPHER_SERVER_PREFERENCE);
+ }
+
return ssl_ctx;
}
@@ -1589,16 +1593,43 @@ bssl::UniquePtr<SSL> TestConfig::NewSSL(
if (!check_close_notify) {
SSL_set_quiet_shutdown(ssl.get(), 1);
}
- if (p384_only) {
- int nid = NID_secp384r1;
- if (!SSL_set1_curves(ssl.get(), &nid, 1)) {
- return nullptr;
+ if (!curves.empty()) {
+ std::vector<int> nids;
+ for (auto curve : curves) {
+ switch (curve) {
+ case SSL_CURVE_SECP224R1:
+ nids.push_back(NID_secp224r1);
+ break;
+
+ case SSL_CURVE_SECP256R1:
+ nids.push_back(NID_X9_62_prime256v1);
+ break;
+
+ case SSL_CURVE_SECP384R1:
+ nids.push_back(NID_secp384r1);
+ break;
+
+ case SSL_CURVE_SECP521R1:
+ nids.push_back(NID_secp521r1);
+ break;
+
+ case SSL_CURVE_X25519:
+ nids.push_back(NID_X25519);
+ break;
+
+ case SSL_CURVE_CECPQ2:
+ nids.push_back(NID_CECPQ2);
+ break;
+ }
+ if (!SSL_set1_curves(ssl.get(), &nids[0], nids.size())) {
+ return nullptr;
+ }
}
}
if (enable_all_curves) {
static const int kAllCurves[] = {
NID_secp224r1, NID_X9_62_prime256v1, NID_secp384r1,
- NID_secp521r1, NID_X25519,
+ NID_secp521r1, NID_X25519, NID_CECPQ2,
};
if (!SSL_set1_curves(ssl.get(), kAllCurves,
OPENSSL_ARRAY_SIZE(kAllCurves))) {
diff --git a/src/ssl/test/test_config.h b/src/ssl/test/test_config.h
index bffe9118..0d0753e8 100644
--- a/src/ssl/test/test_config.h
+++ b/src/ssl/test/test_config.h
@@ -33,6 +33,7 @@ struct TestConfig {
std::vector<int> signing_prefs;
std::vector<int> verify_prefs;
std::vector<int> expected_peer_verify_prefs;
+ std::vector<int> curves;
std::string key_file;
std::string cert_file;
std::string expected_server_name;
@@ -122,7 +123,6 @@ struct TestConfig {
bool renegotiate_ignore = false;
bool forbid_renegotiation_after_handshake = false;
int expect_peer_signature_algorithm = 0;
- bool p384_only = false;
bool enable_all_curves = false;
int expect_curve_id = 0;
bool use_old_client_cert_callback = false;
@@ -170,6 +170,8 @@ struct TestConfig {
bool handshaker_resume = false;
std::string handshaker_path;
bool jdk11_workaround = false;
+ bool server_preference = false;
+ bool export_traffic_secrets = false;
int argc;
char **argv;
diff --git a/src/ssl/tls13_client.cc b/src/ssl/tls13_client.cc
index 0d3e8771..40913dcf 100644
--- a/src/ssl/tls13_client.cc
+++ b/src/ssl/tls13_client.cc
@@ -165,15 +165,17 @@ static enum ssl_hs_wait_t do_read_hello_retry_request(SSL_HANDSHAKE *hs) {
return ssl_hs_error;
}
- // Check that the HelloRetryRequest does not request the key share that
- // was provided in the initial ClientHello.
- if (hs->key_share->GroupID() == group_id) {
+ // Check that the HelloRetryRequest does not request a key share that was
+ // provided in the initial ClientHello.
+ if (hs->key_shares[0]->GroupID() == group_id ||
+ (hs->key_shares[1] && hs->key_shares[1]->GroupID() == group_id)) {
ssl_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_ILLEGAL_PARAMETER);
OPENSSL_PUT_ERROR(SSL, SSL_R_WRONG_CURVE);
return ssl_hs_error;
}
- hs->key_share.reset();
+ hs->key_shares[0].reset();
+ hs->key_shares[1].reset();
hs->retry_group = group_id;
}
@@ -506,7 +508,6 @@ static enum ssl_hs_wait_t do_read_certificate_request(SSL_HANDSHAKE *hs) {
!have_sigalgs ||
!CBS_get_u16_length_prefixed(&sigalgs,
&supported_signature_algorithms) ||
- CBS_len(&supported_signature_algorithms) == 0 ||
!tls1_parse_peer_sigalgs(hs, &supported_signature_algorithms)) {
ssl_send_alert(ssl, SSL3_AL_FATAL, alert);
OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR);
diff --git a/src/ssl/tls13_server.cc b/src/ssl/tls13_server.cc
index b4c4ca5a..7073b575 100644
--- a/src/ssl/tls13_server.cc
+++ b/src/ssl/tls13_server.cc
@@ -96,33 +96,39 @@ static int ssl_ext_supported_versions_add_serverhello(SSL_HANDSHAKE *hs,
}
static const SSL_CIPHER *choose_tls13_cipher(
- const SSL *ssl, const SSL_CLIENT_HELLO *client_hello) {
+ const SSL *ssl, const SSL_CLIENT_HELLO *client_hello, uint16_t group_id) {
if (client_hello->cipher_suites_len % 2 != 0) {
- return NULL;
+ return nullptr;
}
CBS cipher_suites;
CBS_init(&cipher_suites, client_hello->cipher_suites,
client_hello->cipher_suites_len);
- const int aes_is_fine = EVP_has_aes_hardware();
+ const bool aes_is_fine = EVP_has_aes_hardware();
+ const bool require_256_bit = group_id == SSL_CURVE_CECPQ2;
const uint16_t version = ssl_protocol_version(ssl);
- const SSL_CIPHER *best = NULL;
+ const SSL_CIPHER *best = nullptr;
while (CBS_len(&cipher_suites) > 0) {
uint16_t cipher_suite;
if (!CBS_get_u16(&cipher_suites, &cipher_suite)) {
- return NULL;
+ return nullptr;
}
// Limit to TLS 1.3 ciphers we know about.
const SSL_CIPHER *candidate = SSL_get_cipher_by_value(cipher_suite);
- if (candidate == NULL ||
+ if (candidate == nullptr ||
SSL_CIPHER_get_min_version(candidate) > version ||
SSL_CIPHER_get_max_version(candidate) < version) {
continue;
}
+ // Post-quantum key exchanges should be paired with 256-bit ciphers.
+ if (require_256_bit && candidate->algorithm_enc == SSL_AES128GCM) {
+ continue;
+ }
+
// TLS 1.3 removes legacy ciphers, so honor the client order, but prefer
// ChaCha20 if we do not have AES hardware.
if (aes_is_fine) {
@@ -133,7 +139,7 @@ static const SSL_CIPHER *choose_tls13_cipher(
return candidate;
}
- if (best == NULL) {
+ if (best == nullptr) {
best = candidate;
}
}
@@ -240,8 +246,15 @@ static enum ssl_hs_wait_t do_select_parameters(SSL_HANDSHAKE *hs) {
client_hello.session_id_len);
hs->session_id_len = client_hello.session_id_len;
+ uint16_t group_id;
+ if (!tls1_get_shared_group(hs, &group_id)) {
+ OPENSSL_PUT_ERROR(SSL, SSL_R_NO_SHARED_GROUP);
+ ssl_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE);
+ return ssl_hs_error;
+ }
+
// Negotiate the cipher suite.
- hs->new_cipher = choose_tls13_cipher(ssl, &client_hello);
+ hs->new_cipher = choose_tls13_cipher(ssl, &client_hello, group_id);
if (hs->new_cipher == NULL) {
OPENSSL_PUT_ERROR(SSL, SSL_R_NO_SHARED_CIPHER);
ssl_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE);
diff --git a/src/tool/speed.cc b/src/tool/speed.cc
index 2175baa2..975fb531 100644
--- a/src/tool/speed.cc
+++ b/src/tool/speed.cc
@@ -32,6 +32,7 @@
#include <openssl/ecdsa.h>
#include <openssl/ec_key.h>
#include <openssl/evp.h>
+#include <openssl/hrss.h>
#include <openssl/nid.h>
#include <openssl/rand.h>
#include <openssl/rsa.h>
@@ -744,6 +745,61 @@ static bool SpeedScrypt(const std::string &selected) {
return true;
}
+static bool SpeedHRSS(const std::string &selected) {
+ if (!selected.empty() && selected != "HRSS") {
+ return true;
+ }
+
+ TimeResults results;
+
+ if (!TimeFunction(&results, []() -> bool {
+ struct HRSS_public_key pub;
+ struct HRSS_private_key priv;
+ uint8_t entropy[HRSS_GENERATE_KEY_BYTES];
+ RAND_bytes(entropy, sizeof(entropy));
+ HRSS_generate_key(&pub, &priv, entropy);
+ return true;
+ })) {
+ fprintf(stderr, "Failed to time HRSS_generate_key.\n");
+ return false;
+ }
+
+ results.Print("HRSS generate");
+
+ struct HRSS_public_key pub;
+ struct HRSS_private_key priv;
+ uint8_t key_entropy[HRSS_GENERATE_KEY_BYTES];
+ RAND_bytes(key_entropy, sizeof(key_entropy));
+ HRSS_generate_key(&pub, &priv, key_entropy);
+
+ uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES];
+ if (!TimeFunction(&results, [&pub, &ciphertext]() -> bool {
+ uint8_t entropy[HRSS_ENCAP_BYTES];
+ uint8_t shared_key[HRSS_KEY_BYTES];
+ RAND_bytes(entropy, sizeof(entropy));
+ HRSS_encap(ciphertext, shared_key, &pub, entropy);
+ return true;
+ })) {
+ fprintf(stderr, "Failed to time HRSS_encap.\n");
+ return false;
+ }
+
+ results.Print("HRSS encap");
+
+ if (!TimeFunction(&results, [&pub, &priv, &ciphertext]() -> bool {
+ uint8_t shared_key[HRSS_KEY_BYTES];
+ HRSS_decap(shared_key, &pub, &priv, ciphertext, sizeof(ciphertext));
+ return true;
+ })) {
+    fprintf(stderr, "Failed to time HRSS_decap.\n");
+ return false;
+ }
+
+ results.Print("HRSS decap");
+
+ return true;
+}
+
static const struct argument kArguments[] = {
{
"-filter", kOptionalArgument,
@@ -817,7 +873,8 @@ bool Speed(const std::vector<std::string> &args) {
!Speed25519(selected) ||
!SpeedSPAKE2(selected) ||
!SpeedScrypt(selected) ||
- !SpeedRSAKeyGen(selected)) {
+ !SpeedRSAKeyGen(selected) ||
+ !SpeedHRSS(selected)) {
return false;
}
diff --git a/src/util/generate_build_files.py b/src/util/generate_build_files.py
index 9c635dcf..2a6fe3f2 100644
--- a/src/util/generate_build_files.py
+++ b/src/util/generate_build_files.py
@@ -44,6 +44,9 @@ NON_PERL_FILES = {
'src/crypto/curve25519/asm/x25519-asm-arm.S',
'src/crypto/poly1305/poly1305_arm_asm.S',
],
+ ('linux', 'x86_64'): [
+ 'src/crypto/hrss/asm/poly_rq_mul.S',
+ ],
}
PREFIX = None
diff --git a/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm b/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm
index 7dc0c5ac..923c9fa9 100644
--- a/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm
+++ b/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm
@@ -1609,1020 +1609,6 @@ $L$ctr_enc_epilogue:
DB 0F3h,0C3h ;repret
-global bsaes_xts_encrypt
-
-ALIGN 16
-bsaes_xts_encrypt:
-
- mov rax,rsp
-$L$xts_enc_prologue:
- push rbp
-
- push rbx
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- lea rsp,[((-72))+rsp]
-
- mov r10,QWORD[160+rsp]
- mov r11,QWORD[168+rsp]
- lea rsp,[((-160))+rsp]
- movaps XMMWORD[64+rsp],xmm6
- movaps XMMWORD[80+rsp],xmm7
- movaps XMMWORD[96+rsp],xmm8
- movaps XMMWORD[112+rsp],xmm9
- movaps XMMWORD[128+rsp],xmm10
- movaps XMMWORD[144+rsp],xmm11
- movaps XMMWORD[160+rsp],xmm12
- movaps XMMWORD[176+rsp],xmm13
- movaps XMMWORD[192+rsp],xmm14
- movaps XMMWORD[208+rsp],xmm15
-$L$xts_enc_body:
- mov rbp,rsp
-
- mov r12,rcx
- mov r13,rdx
- mov r14,r8
- mov r15,r9
-
- lea rcx,[r11]
- lea rdx,[32+rbp]
- lea r8,[r10]
- call aes_nohw_encrypt
-
- mov eax,DWORD[240+r15]
- mov rbx,r14
-
- mov edx,eax
- shl rax,7
- sub rax,96
- sub rsp,rax
-
- mov rax,rsp
- mov rcx,r15
- mov r10d,edx
- call _bsaes_key_convert
- pxor xmm7,xmm6
- movdqa XMMWORD[rax],xmm7
-
- and r14,-16
- sub rsp,0x80
- movdqa xmm6,XMMWORD[32+rbp]
-
- pxor xmm14,xmm14
- movdqa xmm12,XMMWORD[$L$xts_magic]
- pcmpgtd xmm14,xmm6
-
- sub r14,0x80
- jc NEAR $L$xts_enc_short
- jmp NEAR $L$xts_enc_loop
-
-ALIGN 16
-$L$xts_enc_loop:
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm15,xmm6
- movdqa XMMWORD[rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm0,xmm6
- movdqa XMMWORD[16+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm7,XMMWORD[r12]
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm1,xmm6
- movdqa XMMWORD[32+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm8,XMMWORD[16+r12]
- pxor xmm15,xmm7
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm2,xmm6
- movdqa XMMWORD[48+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm9,XMMWORD[32+r12]
- pxor xmm0,xmm8
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm3,xmm6
- movdqa XMMWORD[64+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm10,XMMWORD[48+r12]
- pxor xmm1,xmm9
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm4,xmm6
- movdqa XMMWORD[80+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm11,XMMWORD[64+r12]
- pxor xmm2,xmm10
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm5,xmm6
- movdqa XMMWORD[96+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm12,XMMWORD[80+r12]
- pxor xmm3,xmm11
- movdqu xmm13,XMMWORD[96+r12]
- pxor xmm4,xmm12
- movdqu xmm14,XMMWORD[112+r12]
- lea r12,[128+r12]
- movdqa XMMWORD[112+rsp],xmm6
- pxor xmm5,xmm13
- lea rax,[128+rsp]
- pxor xmm6,xmm14
- mov r10d,edx
-
- call _bsaes_encrypt8
-
- pxor xmm15,XMMWORD[rsp]
- pxor xmm0,XMMWORD[16+rsp]
- movdqu XMMWORD[r13],xmm15
- pxor xmm3,XMMWORD[32+rsp]
- movdqu XMMWORD[16+r13],xmm0
- pxor xmm5,XMMWORD[48+rsp]
- movdqu XMMWORD[32+r13],xmm3
- pxor xmm2,XMMWORD[64+rsp]
- movdqu XMMWORD[48+r13],xmm5
- pxor xmm6,XMMWORD[80+rsp]
- movdqu XMMWORD[64+r13],xmm2
- pxor xmm1,XMMWORD[96+rsp]
- movdqu XMMWORD[80+r13],xmm6
- pxor xmm4,XMMWORD[112+rsp]
- movdqu XMMWORD[96+r13],xmm1
- movdqu XMMWORD[112+r13],xmm4
- lea r13,[128+r13]
-
- movdqa xmm6,XMMWORD[112+rsp]
- pxor xmm14,xmm14
- movdqa xmm12,XMMWORD[$L$xts_magic]
- pcmpgtd xmm14,xmm6
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
-
- sub r14,0x80
- jnc NEAR $L$xts_enc_loop
-
-$L$xts_enc_short:
- add r14,0x80
- jz NEAR $L$xts_enc_done
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm15,xmm6
- movdqa XMMWORD[rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm0,xmm6
- movdqa XMMWORD[16+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm7,XMMWORD[r12]
- cmp r14,16
- je NEAR $L$xts_enc_1
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm1,xmm6
- movdqa XMMWORD[32+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm8,XMMWORD[16+r12]
- cmp r14,32
- je NEAR $L$xts_enc_2
- pxor xmm15,xmm7
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm2,xmm6
- movdqa XMMWORD[48+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm9,XMMWORD[32+r12]
- cmp r14,48
- je NEAR $L$xts_enc_3
- pxor xmm0,xmm8
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm3,xmm6
- movdqa XMMWORD[64+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm10,XMMWORD[48+r12]
- cmp r14,64
- je NEAR $L$xts_enc_4
- pxor xmm1,xmm9
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm4,xmm6
- movdqa XMMWORD[80+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm11,XMMWORD[64+r12]
- cmp r14,80
- je NEAR $L$xts_enc_5
- pxor xmm2,xmm10
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm5,xmm6
- movdqa XMMWORD[96+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm12,XMMWORD[80+r12]
- cmp r14,96
- je NEAR $L$xts_enc_6
- pxor xmm3,xmm11
- movdqu xmm13,XMMWORD[96+r12]
- pxor xmm4,xmm12
- movdqa XMMWORD[112+rsp],xmm6
- lea r12,[112+r12]
- pxor xmm5,xmm13
- lea rax,[128+rsp]
- mov r10d,edx
-
- call _bsaes_encrypt8
-
- pxor xmm15,XMMWORD[rsp]
- pxor xmm0,XMMWORD[16+rsp]
- movdqu XMMWORD[r13],xmm15
- pxor xmm3,XMMWORD[32+rsp]
- movdqu XMMWORD[16+r13],xmm0
- pxor xmm5,XMMWORD[48+rsp]
- movdqu XMMWORD[32+r13],xmm3
- pxor xmm2,XMMWORD[64+rsp]
- movdqu XMMWORD[48+r13],xmm5
- pxor xmm6,XMMWORD[80+rsp]
- movdqu XMMWORD[64+r13],xmm2
- pxor xmm1,XMMWORD[96+rsp]
- movdqu XMMWORD[80+r13],xmm6
- movdqu XMMWORD[96+r13],xmm1
- lea r13,[112+r13]
-
- movdqa xmm6,XMMWORD[112+rsp]
- jmp NEAR $L$xts_enc_done
-ALIGN 16
-$L$xts_enc_6:
- pxor xmm3,xmm11
- lea r12,[96+r12]
- pxor xmm4,xmm12
- lea rax,[128+rsp]
- mov r10d,edx
-
- call _bsaes_encrypt8
-
- pxor xmm15,XMMWORD[rsp]
- pxor xmm0,XMMWORD[16+rsp]
- movdqu XMMWORD[r13],xmm15
- pxor xmm3,XMMWORD[32+rsp]
- movdqu XMMWORD[16+r13],xmm0
- pxor xmm5,XMMWORD[48+rsp]
- movdqu XMMWORD[32+r13],xmm3
- pxor xmm2,XMMWORD[64+rsp]
- movdqu XMMWORD[48+r13],xmm5
- pxor xmm6,XMMWORD[80+rsp]
- movdqu XMMWORD[64+r13],xmm2
- movdqu XMMWORD[80+r13],xmm6
- lea r13,[96+r13]
-
- movdqa xmm6,XMMWORD[96+rsp]
- jmp NEAR $L$xts_enc_done
-ALIGN 16
-$L$xts_enc_5:
- pxor xmm2,xmm10
- lea r12,[80+r12]
- pxor xmm3,xmm11
- lea rax,[128+rsp]
- mov r10d,edx
-
- call _bsaes_encrypt8
-
- pxor xmm15,XMMWORD[rsp]
- pxor xmm0,XMMWORD[16+rsp]
- movdqu XMMWORD[r13],xmm15
- pxor xmm3,XMMWORD[32+rsp]
- movdqu XMMWORD[16+r13],xmm0
- pxor xmm5,XMMWORD[48+rsp]
- movdqu XMMWORD[32+r13],xmm3
- pxor xmm2,XMMWORD[64+rsp]
- movdqu XMMWORD[48+r13],xmm5
- movdqu XMMWORD[64+r13],xmm2
- lea r13,[80+r13]
-
- movdqa xmm6,XMMWORD[80+rsp]
- jmp NEAR $L$xts_enc_done
-ALIGN 16
-$L$xts_enc_4:
- pxor xmm1,xmm9
- lea r12,[64+r12]
- pxor xmm2,xmm10
- lea rax,[128+rsp]
- mov r10d,edx
-
- call _bsaes_encrypt8
-
- pxor xmm15,XMMWORD[rsp]
- pxor xmm0,XMMWORD[16+rsp]
- movdqu XMMWORD[r13],xmm15
- pxor xmm3,XMMWORD[32+rsp]
- movdqu XMMWORD[16+r13],xmm0
- pxor xmm5,XMMWORD[48+rsp]
- movdqu XMMWORD[32+r13],xmm3
- movdqu XMMWORD[48+r13],xmm5
- lea r13,[64+r13]
-
- movdqa xmm6,XMMWORD[64+rsp]
- jmp NEAR $L$xts_enc_done
-ALIGN 16
-$L$xts_enc_3:
- pxor xmm0,xmm8
- lea r12,[48+r12]
- pxor xmm1,xmm9
- lea rax,[128+rsp]
- mov r10d,edx
-
- call _bsaes_encrypt8
-
- pxor xmm15,XMMWORD[rsp]
- pxor xmm0,XMMWORD[16+rsp]
- movdqu XMMWORD[r13],xmm15
- pxor xmm3,XMMWORD[32+rsp]
- movdqu XMMWORD[16+r13],xmm0
- movdqu XMMWORD[32+r13],xmm3
- lea r13,[48+r13]
-
- movdqa xmm6,XMMWORD[48+rsp]
- jmp NEAR $L$xts_enc_done
-ALIGN 16
-$L$xts_enc_2:
- pxor xmm15,xmm7
- lea r12,[32+r12]
- pxor xmm0,xmm8
- lea rax,[128+rsp]
- mov r10d,edx
-
- call _bsaes_encrypt8
-
- pxor xmm15,XMMWORD[rsp]
- pxor xmm0,XMMWORD[16+rsp]
- movdqu XMMWORD[r13],xmm15
- movdqu XMMWORD[16+r13],xmm0
- lea r13,[32+r13]
-
- movdqa xmm6,XMMWORD[32+rsp]
- jmp NEAR $L$xts_enc_done
-ALIGN 16
-$L$xts_enc_1:
- pxor xmm7,xmm15
- lea r12,[16+r12]
- movdqa XMMWORD[32+rbp],xmm7
- lea rcx,[32+rbp]
- lea rdx,[32+rbp]
- lea r8,[r15]
- call aes_nohw_encrypt
- pxor xmm15,XMMWORD[32+rbp]
-
-
-
-
-
- movdqu XMMWORD[r13],xmm15
- lea r13,[16+r13]
-
- movdqa xmm6,XMMWORD[16+rsp]
-
-$L$xts_enc_done:
- and ebx,15
- jz NEAR $L$xts_enc_ret
- mov rdx,r13
-
-$L$xts_enc_steal:
- movzx eax,BYTE[r12]
- movzx ecx,BYTE[((-16))+rdx]
- lea r12,[1+r12]
- mov BYTE[((-16))+rdx],al
- mov BYTE[rdx],cl
- lea rdx,[1+rdx]
- sub ebx,1
- jnz NEAR $L$xts_enc_steal
-
- movdqu xmm15,XMMWORD[((-16))+r13]
- lea rcx,[32+rbp]
- pxor xmm15,xmm6
- lea rdx,[32+rbp]
- movdqa XMMWORD[32+rbp],xmm15
- lea r8,[r15]
- call aes_nohw_encrypt
- pxor xmm6,XMMWORD[32+rbp]
- movdqu XMMWORD[(-16)+r13],xmm6
-
-$L$xts_enc_ret:
- lea rax,[rsp]
- pxor xmm0,xmm0
-$L$xts_enc_bzero:
- movdqa XMMWORD[rax],xmm0
- movdqa XMMWORD[16+rax],xmm0
- lea rax,[32+rax]
- cmp rbp,rax
- ja NEAR $L$xts_enc_bzero
-
- lea rax,[120+rbp]
-
- movaps xmm6,XMMWORD[64+rbp]
- movaps xmm7,XMMWORD[80+rbp]
- movaps xmm8,XMMWORD[96+rbp]
- movaps xmm9,XMMWORD[112+rbp]
- movaps xmm10,XMMWORD[128+rbp]
- movaps xmm11,XMMWORD[144+rbp]
- movaps xmm12,XMMWORD[160+rbp]
- movaps xmm13,XMMWORD[176+rbp]
- movaps xmm14,XMMWORD[192+rbp]
- movaps xmm15,XMMWORD[208+rbp]
- lea rax,[160+rax]
-$L$xts_enc_tail:
- mov r15,QWORD[((-48))+rax]
-
- mov r14,QWORD[((-40))+rax]
-
- mov r13,QWORD[((-32))+rax]
-
- mov r12,QWORD[((-24))+rax]
-
- mov rbx,QWORD[((-16))+rax]
-
- mov rbp,QWORD[((-8))+rax]
-
- lea rsp,[rax]
-
-$L$xts_enc_epilogue:
- DB 0F3h,0C3h ;repret
-
-
-
-global bsaes_xts_decrypt
-
-ALIGN 16
-bsaes_xts_decrypt:
-
- mov rax,rsp
-$L$xts_dec_prologue:
- push rbp
-
- push rbx
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- lea rsp,[((-72))+rsp]
-
- mov r10,QWORD[160+rsp]
- mov r11,QWORD[168+rsp]
- lea rsp,[((-160))+rsp]
- movaps XMMWORD[64+rsp],xmm6
- movaps XMMWORD[80+rsp],xmm7
- movaps XMMWORD[96+rsp],xmm8
- movaps XMMWORD[112+rsp],xmm9
- movaps XMMWORD[128+rsp],xmm10
- movaps XMMWORD[144+rsp],xmm11
- movaps XMMWORD[160+rsp],xmm12
- movaps XMMWORD[176+rsp],xmm13
- movaps XMMWORD[192+rsp],xmm14
- movaps XMMWORD[208+rsp],xmm15
-$L$xts_dec_body:
- mov rbp,rsp
- mov r12,rcx
- mov r13,rdx
- mov r14,r8
- mov r15,r9
-
- lea rcx,[r11]
- lea rdx,[32+rbp]
- lea r8,[r10]
- call aes_nohw_encrypt
-
- mov eax,DWORD[240+r15]
- mov rbx,r14
-
- mov edx,eax
- shl rax,7
- sub rax,96
- sub rsp,rax
-
- mov rax,rsp
- mov rcx,r15
- mov r10d,edx
- call _bsaes_key_convert
- pxor xmm7,XMMWORD[rsp]
- movdqa XMMWORD[rax],xmm6
- movdqa XMMWORD[rsp],xmm7
-
- xor eax,eax
- and r14,-16
- test ebx,15
- setnz al
- shl rax,4
- sub r14,rax
-
- sub rsp,0x80
- movdqa xmm6,XMMWORD[32+rbp]
-
- pxor xmm14,xmm14
- movdqa xmm12,XMMWORD[$L$xts_magic]
- pcmpgtd xmm14,xmm6
-
- sub r14,0x80
- jc NEAR $L$xts_dec_short
- jmp NEAR $L$xts_dec_loop
-
-ALIGN 16
-$L$xts_dec_loop:
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm15,xmm6
- movdqa XMMWORD[rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm0,xmm6
- movdqa XMMWORD[16+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm7,XMMWORD[r12]
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm1,xmm6
- movdqa XMMWORD[32+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm8,XMMWORD[16+r12]
- pxor xmm15,xmm7
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm2,xmm6
- movdqa XMMWORD[48+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm9,XMMWORD[32+r12]
- pxor xmm0,xmm8
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm3,xmm6
- movdqa XMMWORD[64+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm10,XMMWORD[48+r12]
- pxor xmm1,xmm9
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm4,xmm6
- movdqa XMMWORD[80+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm11,XMMWORD[64+r12]
- pxor xmm2,xmm10
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm5,xmm6
- movdqa XMMWORD[96+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm12,XMMWORD[80+r12]
- pxor xmm3,xmm11
- movdqu xmm13,XMMWORD[96+r12]
- pxor xmm4,xmm12
- movdqu xmm14,XMMWORD[112+r12]
- lea r12,[128+r12]
- movdqa XMMWORD[112+rsp],xmm6
- pxor xmm5,xmm13
- lea rax,[128+rsp]
- pxor xmm6,xmm14
- mov r10d,edx
-
- call _bsaes_decrypt8
-
- pxor xmm15,XMMWORD[rsp]
- pxor xmm0,XMMWORD[16+rsp]
- movdqu XMMWORD[r13],xmm15
- pxor xmm5,XMMWORD[32+rsp]
- movdqu XMMWORD[16+r13],xmm0
- pxor xmm3,XMMWORD[48+rsp]
- movdqu XMMWORD[32+r13],xmm5
- pxor xmm1,XMMWORD[64+rsp]
- movdqu XMMWORD[48+r13],xmm3
- pxor xmm6,XMMWORD[80+rsp]
- movdqu XMMWORD[64+r13],xmm1
- pxor xmm2,XMMWORD[96+rsp]
- movdqu XMMWORD[80+r13],xmm6
- pxor xmm4,XMMWORD[112+rsp]
- movdqu XMMWORD[96+r13],xmm2
- movdqu XMMWORD[112+r13],xmm4
- lea r13,[128+r13]
-
- movdqa xmm6,XMMWORD[112+rsp]
- pxor xmm14,xmm14
- movdqa xmm12,XMMWORD[$L$xts_magic]
- pcmpgtd xmm14,xmm6
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
-
- sub r14,0x80
- jnc NEAR $L$xts_dec_loop
-
-$L$xts_dec_short:
- add r14,0x80
- jz NEAR $L$xts_dec_done
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm15,xmm6
- movdqa XMMWORD[rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm0,xmm6
- movdqa XMMWORD[16+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm7,XMMWORD[r12]
- cmp r14,16
- je NEAR $L$xts_dec_1
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm1,xmm6
- movdqa XMMWORD[32+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm8,XMMWORD[16+r12]
- cmp r14,32
- je NEAR $L$xts_dec_2
- pxor xmm15,xmm7
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm2,xmm6
- movdqa XMMWORD[48+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm9,XMMWORD[32+r12]
- cmp r14,48
- je NEAR $L$xts_dec_3
- pxor xmm0,xmm8
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm3,xmm6
- movdqa XMMWORD[64+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm10,XMMWORD[48+r12]
- cmp r14,64
- je NEAR $L$xts_dec_4
- pxor xmm1,xmm9
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm4,xmm6
- movdqa XMMWORD[80+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm11,XMMWORD[64+r12]
- cmp r14,80
- je NEAR $L$xts_dec_5
- pxor xmm2,xmm10
- pshufd xmm13,xmm14,0x13
- pxor xmm14,xmm14
- movdqa xmm5,xmm6
- movdqa XMMWORD[96+rsp],xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- pcmpgtd xmm14,xmm6
- pxor xmm6,xmm13
- movdqu xmm12,XMMWORD[80+r12]
- cmp r14,96
- je NEAR $L$xts_dec_6
- pxor xmm3,xmm11
- movdqu xmm13,XMMWORD[96+r12]
- pxor xmm4,xmm12
- movdqa XMMWORD[112+rsp],xmm6
- lea r12,[112+r12]
- pxor xmm5,xmm13
- lea rax,[128+rsp]
- mov r10d,edx
-
- call _bsaes_decrypt8
-
- pxor xmm15,XMMWORD[rsp]
- pxor xmm0,XMMWORD[16+rsp]
- movdqu XMMWORD[r13],xmm15
- pxor xmm5,XMMWORD[32+rsp]
- movdqu XMMWORD[16+r13],xmm0
- pxor xmm3,XMMWORD[48+rsp]
- movdqu XMMWORD[32+r13],xmm5
- pxor xmm1,XMMWORD[64+rsp]
- movdqu XMMWORD[48+r13],xmm3
- pxor xmm6,XMMWORD[80+rsp]
- movdqu XMMWORD[64+r13],xmm1
- pxor xmm2,XMMWORD[96+rsp]
- movdqu XMMWORD[80+r13],xmm6
- movdqu XMMWORD[96+r13],xmm2
- lea r13,[112+r13]
-
- movdqa xmm6,XMMWORD[112+rsp]
- jmp NEAR $L$xts_dec_done
-ALIGN 16
-$L$xts_dec_6:
- pxor xmm3,xmm11
- lea r12,[96+r12]
- pxor xmm4,xmm12
- lea rax,[128+rsp]
- mov r10d,edx
-
- call _bsaes_decrypt8
-
- pxor xmm15,XMMWORD[rsp]
- pxor xmm0,XMMWORD[16+rsp]
- movdqu XMMWORD[r13],xmm15
- pxor xmm5,XMMWORD[32+rsp]
- movdqu XMMWORD[16+r13],xmm0
- pxor xmm3,XMMWORD[48+rsp]
- movdqu XMMWORD[32+r13],xmm5
- pxor xmm1,XMMWORD[64+rsp]
- movdqu XMMWORD[48+r13],xmm3
- pxor xmm6,XMMWORD[80+rsp]
- movdqu XMMWORD[64+r13],xmm1
- movdqu XMMWORD[80+r13],xmm6
- lea r13,[96+r13]
-
- movdqa xmm6,XMMWORD[96+rsp]
- jmp NEAR $L$xts_dec_done
-ALIGN 16
-$L$xts_dec_5:
- pxor xmm2,xmm10
- lea r12,[80+r12]
- pxor xmm3,xmm11
- lea rax,[128+rsp]
- mov r10d,edx
-
- call _bsaes_decrypt8
-
- pxor xmm15,XMMWORD[rsp]
- pxor xmm0,XMMWORD[16+rsp]
- movdqu XMMWORD[r13],xmm15
- pxor xmm5,XMMWORD[32+rsp]
- movdqu XMMWORD[16+r13],xmm0
- pxor xmm3,XMMWORD[48+rsp]
- movdqu XMMWORD[32+r13],xmm5
- pxor xmm1,XMMWORD[64+rsp]
- movdqu XMMWORD[48+r13],xmm3
- movdqu XMMWORD[64+r13],xmm1
- lea r13,[80+r13]
-
- movdqa xmm6,XMMWORD[80+rsp]
- jmp NEAR $L$xts_dec_done
-ALIGN 16
-$L$xts_dec_4:
- pxor xmm1,xmm9
- lea r12,[64+r12]
- pxor xmm2,xmm10
- lea rax,[128+rsp]
- mov r10d,edx
-
- call _bsaes_decrypt8
-
- pxor xmm15,XMMWORD[rsp]
- pxor xmm0,XMMWORD[16+rsp]
- movdqu XMMWORD[r13],xmm15
- pxor xmm5,XMMWORD[32+rsp]
- movdqu XMMWORD[16+r13],xmm0
- pxor xmm3,XMMWORD[48+rsp]
- movdqu XMMWORD[32+r13],xmm5
- movdqu XMMWORD[48+r13],xmm3
- lea r13,[64+r13]
-
- movdqa xmm6,XMMWORD[64+rsp]
- jmp NEAR $L$xts_dec_done
-ALIGN 16
-$L$xts_dec_3:
- pxor xmm0,xmm8
- lea r12,[48+r12]
- pxor xmm1,xmm9
- lea rax,[128+rsp]
- mov r10d,edx
-
- call _bsaes_decrypt8
-
- pxor xmm15,XMMWORD[rsp]
- pxor xmm0,XMMWORD[16+rsp]
- movdqu XMMWORD[r13],xmm15
- pxor xmm5,XMMWORD[32+rsp]
- movdqu XMMWORD[16+r13],xmm0
- movdqu XMMWORD[32+r13],xmm5
- lea r13,[48+r13]
-
- movdqa xmm6,XMMWORD[48+rsp]
- jmp NEAR $L$xts_dec_done
-ALIGN 16
-$L$xts_dec_2:
- pxor xmm15,xmm7
- lea r12,[32+r12]
- pxor xmm0,xmm8
- lea rax,[128+rsp]
- mov r10d,edx
-
- call _bsaes_decrypt8
-
- pxor xmm15,XMMWORD[rsp]
- pxor xmm0,XMMWORD[16+rsp]
- movdqu XMMWORD[r13],xmm15
- movdqu XMMWORD[16+r13],xmm0
- lea r13,[32+r13]
-
- movdqa xmm6,XMMWORD[32+rsp]
- jmp NEAR $L$xts_dec_done
-ALIGN 16
-$L$xts_dec_1:
- pxor xmm7,xmm15
- lea r12,[16+r12]
- movdqa XMMWORD[32+rbp],xmm7
- lea rcx,[32+rbp]
- lea rdx,[32+rbp]
- lea r8,[r15]
- call aes_nohw_decrypt
- pxor xmm15,XMMWORD[32+rbp]
-
-
-
-
-
- movdqu XMMWORD[r13],xmm15
- lea r13,[16+r13]
-
- movdqa xmm6,XMMWORD[16+rsp]
-
-$L$xts_dec_done:
- and ebx,15
- jz NEAR $L$xts_dec_ret
-
- pxor xmm14,xmm14
- movdqa xmm12,XMMWORD[$L$xts_magic]
- pcmpgtd xmm14,xmm6
- pshufd xmm13,xmm14,0x13
- movdqa xmm5,xmm6
- paddq xmm6,xmm6
- pand xmm13,xmm12
- movdqu xmm15,XMMWORD[r12]
- pxor xmm6,xmm13
-
- lea rcx,[32+rbp]
- pxor xmm15,xmm6
- lea rdx,[32+rbp]
- movdqa XMMWORD[32+rbp],xmm15
- lea r8,[r15]
- call aes_nohw_decrypt
- pxor xmm6,XMMWORD[32+rbp]
- mov rdx,r13
- movdqu XMMWORD[r13],xmm6
-
-$L$xts_dec_steal:
- movzx eax,BYTE[16+r12]
- movzx ecx,BYTE[rdx]
- lea r12,[1+r12]
- mov BYTE[rdx],al
- mov BYTE[16+rdx],cl
- lea rdx,[1+rdx]
- sub ebx,1
- jnz NEAR $L$xts_dec_steal
-
- movdqu xmm15,XMMWORD[r13]
- lea rcx,[32+rbp]
- pxor xmm15,xmm5
- lea rdx,[32+rbp]
- movdqa XMMWORD[32+rbp],xmm15
- lea r8,[r15]
- call aes_nohw_decrypt
- pxor xmm5,XMMWORD[32+rbp]
- movdqu XMMWORD[r13],xmm5
-
-$L$xts_dec_ret:
- lea rax,[rsp]
- pxor xmm0,xmm0
-$L$xts_dec_bzero:
- movdqa XMMWORD[rax],xmm0
- movdqa XMMWORD[16+rax],xmm0
- lea rax,[32+rax]
- cmp rbp,rax
- ja NEAR $L$xts_dec_bzero
-
- lea rax,[120+rbp]
-
- movaps xmm6,XMMWORD[64+rbp]
- movaps xmm7,XMMWORD[80+rbp]
- movaps xmm8,XMMWORD[96+rbp]
- movaps xmm9,XMMWORD[112+rbp]
- movaps xmm10,XMMWORD[128+rbp]
- movaps xmm11,XMMWORD[144+rbp]
- movaps xmm12,XMMWORD[160+rbp]
- movaps xmm13,XMMWORD[176+rbp]
- movaps xmm14,XMMWORD[192+rbp]
- movaps xmm15,XMMWORD[208+rbp]
- lea rax,[160+rax]
-$L$xts_dec_tail:
- mov r15,QWORD[((-48))+rax]
-
- mov r14,QWORD[((-40))+rax]
-
- mov r13,QWORD[((-32))+rax]
-
- mov r12,QWORD[((-24))+rax]
-
- mov rbx,QWORD[((-16))+rax]
-
- mov rbp,QWORD[((-8))+rax]
-
- lea rsp,[rax]
-
-$L$xts_dec_epilogue:
- DB 0F3h,0C3h ;repret
-
-
ALIGN 64
_bsaes_const:
@@ -2786,14 +1772,6 @@ ALIGN 4
DD $L$ctr_enc_epilogue wrt ..imagebase
DD $L$ctr_enc_info wrt ..imagebase
- DD $L$xts_enc_prologue wrt ..imagebase
- DD $L$xts_enc_epilogue wrt ..imagebase
- DD $L$xts_enc_info wrt ..imagebase
-
- DD $L$xts_dec_prologue wrt ..imagebase
- DD $L$xts_dec_epilogue wrt ..imagebase
- DD $L$xts_dec_info wrt ..imagebase
-
section .xdata rdata align=8
ALIGN 8
$L$cbc_dec_info:
@@ -2808,15 +1786,3 @@ DB 9,0,0,0
DD $L$ctr_enc_body wrt ..imagebase,$L$ctr_enc_epilogue wrt ..imagebase
DD $L$ctr_enc_tail wrt ..imagebase
DD 0
-$L$xts_enc_info:
-DB 9,0,0,0
- DD se_handler wrt ..imagebase
- DD $L$xts_enc_body wrt ..imagebase,$L$xts_enc_epilogue wrt ..imagebase
- DD $L$xts_enc_tail wrt ..imagebase
- DD 0
-$L$xts_dec_info:
-DB 9,0,0,0
- DD se_handler wrt ..imagebase
- DD $L$xts_dec_body wrt ..imagebase,$L$xts_dec_epilogue wrt ..imagebase
- DD $L$xts_dec_tail wrt ..imagebase
- DD 0
diff --git a/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm b/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm
index ea6c4f17..33dc2c2e 100644
--- a/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm
+++ b/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm
@@ -31,8 +31,6 @@ $L$SEH_begin_sha512_block_data_order:
mov r9d,DWORD[r11]
mov r10d,DWORD[4+r11]
mov r11d,DWORD[8+r11]
- test r10d,2048
- jnz NEAR $L$xop_shortcut
and r9d,1073741824
and r10d,268435968
or r10d,r9d
@@ -1839,1130 +1837,6 @@ DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
DB 111,114,103,62,0
ALIGN 64
-sha512_block_data_order_xop:
- mov QWORD[8+rsp],rdi ;WIN64 prologue
- mov QWORD[16+rsp],rsi
- mov rax,rsp
-$L$SEH_begin_sha512_block_data_order_xop:
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-
-
-
-$L$xop_shortcut:
- mov rax,rsp
-
- push rbx
-
- push rbp
-
- push r12
-
- push r13
-
- push r14
-
- push r15
-
- shl rdx,4
- sub rsp,256
- lea rdx,[rdx*8+rsi]
- and rsp,-64
- mov QWORD[((128+0))+rsp],rdi
- mov QWORD[((128+8))+rsp],rsi
- mov QWORD[((128+16))+rsp],rdx
- mov QWORD[152+rsp],rax
-
- movaps XMMWORD[(128+32)+rsp],xmm6
- movaps XMMWORD[(128+48)+rsp],xmm7
- movaps XMMWORD[(128+64)+rsp],xmm8
- movaps XMMWORD[(128+80)+rsp],xmm9
- movaps XMMWORD[(128+96)+rsp],xmm10
- movaps XMMWORD[(128+112)+rsp],xmm11
-$L$prologue_xop:
-
- vzeroupper
- mov rax,QWORD[rdi]
- mov rbx,QWORD[8+rdi]
- mov rcx,QWORD[16+rdi]
- mov rdx,QWORD[24+rdi]
- mov r8,QWORD[32+rdi]
- mov r9,QWORD[40+rdi]
- mov r10,QWORD[48+rdi]
- mov r11,QWORD[56+rdi]
- jmp NEAR $L$loop_xop
-ALIGN 16
-$L$loop_xop:
- vmovdqa xmm11,XMMWORD[((K512+1280))]
- vmovdqu xmm0,XMMWORD[rsi]
- lea rbp,[((K512+128))]
- vmovdqu xmm1,XMMWORD[16+rsi]
- vmovdqu xmm2,XMMWORD[32+rsi]
- vpshufb xmm0,xmm0,xmm11
- vmovdqu xmm3,XMMWORD[48+rsi]
- vpshufb xmm1,xmm1,xmm11
- vmovdqu xmm4,XMMWORD[64+rsi]
- vpshufb xmm2,xmm2,xmm11
- vmovdqu xmm5,XMMWORD[80+rsi]
- vpshufb xmm3,xmm3,xmm11
- vmovdqu xmm6,XMMWORD[96+rsi]
- vpshufb xmm4,xmm4,xmm11
- vmovdqu xmm7,XMMWORD[112+rsi]
- vpshufb xmm5,xmm5,xmm11
- vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp]
- vpshufb xmm6,xmm6,xmm11
- vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp]
- vpshufb xmm7,xmm7,xmm11
- vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
- vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp]
- vmovdqa XMMWORD[rsp],xmm8
- vpaddq xmm8,xmm4,XMMWORD[rbp]
- vmovdqa XMMWORD[16+rsp],xmm9
- vpaddq xmm9,xmm5,XMMWORD[32+rbp]
- vmovdqa XMMWORD[32+rsp],xmm10
- vpaddq xmm10,xmm6,XMMWORD[64+rbp]
- vmovdqa XMMWORD[48+rsp],xmm11
- vpaddq xmm11,xmm7,XMMWORD[96+rbp]
- vmovdqa XMMWORD[64+rsp],xmm8
- mov r14,rax
- vmovdqa XMMWORD[80+rsp],xmm9
- mov rdi,rbx
- vmovdqa XMMWORD[96+rsp],xmm10
- xor rdi,rcx
- vmovdqa XMMWORD[112+rsp],xmm11
- mov r13,r8
- jmp NEAR $L$xop_00_47
-
-ALIGN 16
-$L$xop_00_47:
- add rbp,256
- vpalignr xmm8,xmm1,xmm0,8
- ror r13,23
- mov rax,r14
- vpalignr xmm11,xmm5,xmm4,8
- mov r12,r9
- ror r14,5
-DB 143,72,120,195,200,56
- xor r13,r8
- xor r12,r10
- vpsrlq xmm8,xmm8,7
- ror r13,4
- xor r14,rax
- vpaddq xmm0,xmm0,xmm11
- and r12,r8
- xor r13,r8
- add r11,QWORD[rsp]
- mov r15,rax
-DB 143,72,120,195,209,7
- xor r12,r10
- ror r14,6
- vpxor xmm8,xmm8,xmm9
- xor r15,rbx
- add r11,r12
- ror r13,14
- and rdi,r15
-DB 143,104,120,195,223,3
- xor r14,rax
- add r11,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,rbx
- ror r14,28
- vpsrlq xmm10,xmm7,6
- add rdx,r11
- add r11,rdi
- vpaddq xmm0,xmm0,xmm8
- mov r13,rdx
- add r14,r11
-DB 143,72,120,195,203,42
- ror r13,23
- mov r11,r14
- vpxor xmm11,xmm11,xmm10
- mov r12,r8
- ror r14,5
- xor r13,rdx
- xor r12,r9
- vpxor xmm11,xmm11,xmm9
- ror r13,4
- xor r14,r11
- and r12,rdx
- xor r13,rdx
- vpaddq xmm0,xmm0,xmm11
- add r10,QWORD[8+rsp]
- mov rdi,r11
- xor r12,r9
- ror r14,6
- vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp]
- xor rdi,rax
- add r10,r12
- ror r13,14
- and r15,rdi
- xor r14,r11
- add r10,r13
- xor r15,rax
- ror r14,28
- add rcx,r10
- add r10,r15
- mov r13,rcx
- add r14,r10
- vmovdqa XMMWORD[rsp],xmm10
- vpalignr xmm8,xmm2,xmm1,8
- ror r13,23
- mov r10,r14
- vpalignr xmm11,xmm6,xmm5,8
- mov r12,rdx
- ror r14,5
-DB 143,72,120,195,200,56
- xor r13,rcx
- xor r12,r8
- vpsrlq xmm8,xmm8,7
- ror r13,4
- xor r14,r10
- vpaddq xmm1,xmm1,xmm11
- and r12,rcx
- xor r13,rcx
- add r9,QWORD[16+rsp]
- mov r15,r10
-DB 143,72,120,195,209,7
- xor r12,r8
- ror r14,6
- vpxor xmm8,xmm8,xmm9
- xor r15,r11
- add r9,r12
- ror r13,14
- and rdi,r15
-DB 143,104,120,195,216,3
- xor r14,r10
- add r9,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,r11
- ror r14,28
- vpsrlq xmm10,xmm0,6
- add rbx,r9
- add r9,rdi
- vpaddq xmm1,xmm1,xmm8
- mov r13,rbx
- add r14,r9
-DB 143,72,120,195,203,42
- ror r13,23
- mov r9,r14
- vpxor xmm11,xmm11,xmm10
- mov r12,rcx
- ror r14,5
- xor r13,rbx
- xor r12,rdx
- vpxor xmm11,xmm11,xmm9
- ror r13,4
- xor r14,r9
- and r12,rbx
- xor r13,rbx
- vpaddq xmm1,xmm1,xmm11
- add r8,QWORD[24+rsp]
- mov rdi,r9
- xor r12,rdx
- ror r14,6
- vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp]
- xor rdi,r10
- add r8,r12
- ror r13,14
- and r15,rdi
- xor r14,r9
- add r8,r13
- xor r15,r10
- ror r14,28
- add rax,r8
- add r8,r15
- mov r13,rax
- add r14,r8
- vmovdqa XMMWORD[16+rsp],xmm10
- vpalignr xmm8,xmm3,xmm2,8
- ror r13,23
- mov r8,r14
- vpalignr xmm11,xmm7,xmm6,8
- mov r12,rbx
- ror r14,5
-DB 143,72,120,195,200,56
- xor r13,rax
- xor r12,rcx
- vpsrlq xmm8,xmm8,7
- ror r13,4
- xor r14,r8
- vpaddq xmm2,xmm2,xmm11
- and r12,rax
- xor r13,rax
- add rdx,QWORD[32+rsp]
- mov r15,r8
-DB 143,72,120,195,209,7
- xor r12,rcx
- ror r14,6
- vpxor xmm8,xmm8,xmm9
- xor r15,r9
- add rdx,r12
- ror r13,14
- and rdi,r15
-DB 143,104,120,195,217,3
- xor r14,r8
- add rdx,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,r9
- ror r14,28
- vpsrlq xmm10,xmm1,6
- add r11,rdx
- add rdx,rdi
- vpaddq xmm2,xmm2,xmm8
- mov r13,r11
- add r14,rdx
-DB 143,72,120,195,203,42
- ror r13,23
- mov rdx,r14
- vpxor xmm11,xmm11,xmm10
- mov r12,rax
- ror r14,5
- xor r13,r11
- xor r12,rbx
- vpxor xmm11,xmm11,xmm9
- ror r13,4
- xor r14,rdx
- and r12,r11
- xor r13,r11
- vpaddq xmm2,xmm2,xmm11
- add rcx,QWORD[40+rsp]
- mov rdi,rdx
- xor r12,rbx
- ror r14,6
- vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
- xor rdi,r8
- add rcx,r12
- ror r13,14
- and r15,rdi
- xor r14,rdx
- add rcx,r13
- xor r15,r8
- ror r14,28
- add r10,rcx
- add rcx,r15
- mov r13,r10
- add r14,rcx
- vmovdqa XMMWORD[32+rsp],xmm10
- vpalignr xmm8,xmm4,xmm3,8
- ror r13,23
- mov rcx,r14
- vpalignr xmm11,xmm0,xmm7,8
- mov r12,r11
- ror r14,5
-DB 143,72,120,195,200,56
- xor r13,r10
- xor r12,rax
- vpsrlq xmm8,xmm8,7
- ror r13,4
- xor r14,rcx
- vpaddq xmm3,xmm3,xmm11
- and r12,r10
- xor r13,r10
- add rbx,QWORD[48+rsp]
- mov r15,rcx
-DB 143,72,120,195,209,7
- xor r12,rax
- ror r14,6
- vpxor xmm8,xmm8,xmm9
- xor r15,rdx
- add rbx,r12
- ror r13,14
- and rdi,r15
-DB 143,104,120,195,218,3
- xor r14,rcx
- add rbx,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,rdx
- ror r14,28
- vpsrlq xmm10,xmm2,6
- add r9,rbx
- add rbx,rdi
- vpaddq xmm3,xmm3,xmm8
- mov r13,r9
- add r14,rbx
-DB 143,72,120,195,203,42
- ror r13,23
- mov rbx,r14
- vpxor xmm11,xmm11,xmm10
- mov r12,r10
- ror r14,5
- xor r13,r9
- xor r12,r11
- vpxor xmm11,xmm11,xmm9
- ror r13,4
- xor r14,rbx
- and r12,r9
- xor r13,r9
- vpaddq xmm3,xmm3,xmm11
- add rax,QWORD[56+rsp]
- mov rdi,rbx
- xor r12,r11
- ror r14,6
- vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp]
- xor rdi,rcx
- add rax,r12
- ror r13,14
- and r15,rdi
- xor r14,rbx
- add rax,r13
- xor r15,rcx
- ror r14,28
- add r8,rax
- add rax,r15
- mov r13,r8
- add r14,rax
- vmovdqa XMMWORD[48+rsp],xmm10
- vpalignr xmm8,xmm5,xmm4,8
- ror r13,23
- mov rax,r14
- vpalignr xmm11,xmm1,xmm0,8
- mov r12,r9
- ror r14,5
-DB 143,72,120,195,200,56
- xor r13,r8
- xor r12,r10
- vpsrlq xmm8,xmm8,7
- ror r13,4
- xor r14,rax
- vpaddq xmm4,xmm4,xmm11
- and r12,r8
- xor r13,r8
- add r11,QWORD[64+rsp]
- mov r15,rax
-DB 143,72,120,195,209,7
- xor r12,r10
- ror r14,6
- vpxor xmm8,xmm8,xmm9
- xor r15,rbx
- add r11,r12
- ror r13,14
- and rdi,r15
-DB 143,104,120,195,219,3
- xor r14,rax
- add r11,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,rbx
- ror r14,28
- vpsrlq xmm10,xmm3,6
- add rdx,r11
- add r11,rdi
- vpaddq xmm4,xmm4,xmm8
- mov r13,rdx
- add r14,r11
-DB 143,72,120,195,203,42
- ror r13,23
- mov r11,r14
- vpxor xmm11,xmm11,xmm10
- mov r12,r8
- ror r14,5
- xor r13,rdx
- xor r12,r9
- vpxor xmm11,xmm11,xmm9
- ror r13,4
- xor r14,r11
- and r12,rdx
- xor r13,rdx
- vpaddq xmm4,xmm4,xmm11
- add r10,QWORD[72+rsp]
- mov rdi,r11
- xor r12,r9
- ror r14,6
- vpaddq xmm10,xmm4,XMMWORD[rbp]
- xor rdi,rax
- add r10,r12
- ror r13,14
- and r15,rdi
- xor r14,r11
- add r10,r13
- xor r15,rax
- ror r14,28
- add rcx,r10
- add r10,r15
- mov r13,rcx
- add r14,r10
- vmovdqa XMMWORD[64+rsp],xmm10
- vpalignr xmm8,xmm6,xmm5,8
- ror r13,23
- mov r10,r14
- vpalignr xmm11,xmm2,xmm1,8
- mov r12,rdx
- ror r14,5
-DB 143,72,120,195,200,56
- xor r13,rcx
- xor r12,r8
- vpsrlq xmm8,xmm8,7
- ror r13,4
- xor r14,r10
- vpaddq xmm5,xmm5,xmm11
- and r12,rcx
- xor r13,rcx
- add r9,QWORD[80+rsp]
- mov r15,r10
-DB 143,72,120,195,209,7
- xor r12,r8
- ror r14,6
- vpxor xmm8,xmm8,xmm9
- xor r15,r11
- add r9,r12
- ror r13,14
- and rdi,r15
-DB 143,104,120,195,220,3
- xor r14,r10
- add r9,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,r11
- ror r14,28
- vpsrlq xmm10,xmm4,6
- add rbx,r9
- add r9,rdi
- vpaddq xmm5,xmm5,xmm8
- mov r13,rbx
- add r14,r9
-DB 143,72,120,195,203,42
- ror r13,23
- mov r9,r14
- vpxor xmm11,xmm11,xmm10
- mov r12,rcx
- ror r14,5
- xor r13,rbx
- xor r12,rdx
- vpxor xmm11,xmm11,xmm9
- ror r13,4
- xor r14,r9
- and r12,rbx
- xor r13,rbx
- vpaddq xmm5,xmm5,xmm11
- add r8,QWORD[88+rsp]
- mov rdi,r9
- xor r12,rdx
- ror r14,6
- vpaddq xmm10,xmm5,XMMWORD[32+rbp]
- xor rdi,r10
- add r8,r12
- ror r13,14
- and r15,rdi
- xor r14,r9
- add r8,r13
- xor r15,r10
- ror r14,28
- add rax,r8
- add r8,r15
- mov r13,rax
- add r14,r8
- vmovdqa XMMWORD[80+rsp],xmm10
- vpalignr xmm8,xmm7,xmm6,8
- ror r13,23
- mov r8,r14
- vpalignr xmm11,xmm3,xmm2,8
- mov r12,rbx
- ror r14,5
-DB 143,72,120,195,200,56
- xor r13,rax
- xor r12,rcx
- vpsrlq xmm8,xmm8,7
- ror r13,4
- xor r14,r8
- vpaddq xmm6,xmm6,xmm11
- and r12,rax
- xor r13,rax
- add rdx,QWORD[96+rsp]
- mov r15,r8
-DB 143,72,120,195,209,7
- xor r12,rcx
- ror r14,6
- vpxor xmm8,xmm8,xmm9
- xor r15,r9
- add rdx,r12
- ror r13,14
- and rdi,r15
-DB 143,104,120,195,221,3
- xor r14,r8
- add rdx,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,r9
- ror r14,28
- vpsrlq xmm10,xmm5,6
- add r11,rdx
- add rdx,rdi
- vpaddq xmm6,xmm6,xmm8
- mov r13,r11
- add r14,rdx
-DB 143,72,120,195,203,42
- ror r13,23
- mov rdx,r14
- vpxor xmm11,xmm11,xmm10
- mov r12,rax
- ror r14,5
- xor r13,r11
- xor r12,rbx
- vpxor xmm11,xmm11,xmm9
- ror r13,4
- xor r14,rdx
- and r12,r11
- xor r13,r11
- vpaddq xmm6,xmm6,xmm11
- add rcx,QWORD[104+rsp]
- mov rdi,rdx
- xor r12,rbx
- ror r14,6
- vpaddq xmm10,xmm6,XMMWORD[64+rbp]
- xor rdi,r8
- add rcx,r12
- ror r13,14
- and r15,rdi
- xor r14,rdx
- add rcx,r13
- xor r15,r8
- ror r14,28
- add r10,rcx
- add rcx,r15
- mov r13,r10
- add r14,rcx
- vmovdqa XMMWORD[96+rsp],xmm10
- vpalignr xmm8,xmm0,xmm7,8
- ror r13,23
- mov rcx,r14
- vpalignr xmm11,xmm4,xmm3,8
- mov r12,r11
- ror r14,5
-DB 143,72,120,195,200,56
- xor r13,r10
- xor r12,rax
- vpsrlq xmm8,xmm8,7
- ror r13,4
- xor r14,rcx
- vpaddq xmm7,xmm7,xmm11
- and r12,r10
- xor r13,r10
- add rbx,QWORD[112+rsp]
- mov r15,rcx
-DB 143,72,120,195,209,7
- xor r12,rax
- ror r14,6
- vpxor xmm8,xmm8,xmm9
- xor r15,rdx
- add rbx,r12
- ror r13,14
- and rdi,r15
-DB 143,104,120,195,222,3
- xor r14,rcx
- add rbx,r13
- vpxor xmm8,xmm8,xmm10
- xor rdi,rdx
- ror r14,28
- vpsrlq xmm10,xmm6,6
- add r9,rbx
- add rbx,rdi
- vpaddq xmm7,xmm7,xmm8
- mov r13,r9
- add r14,rbx
-DB 143,72,120,195,203,42
- ror r13,23
- mov rbx,r14
- vpxor xmm11,xmm11,xmm10
- mov r12,r10
- ror r14,5
- xor r13,r9
- xor r12,r11
- vpxor xmm11,xmm11,xmm9
- ror r13,4
- xor r14,rbx
- and r12,r9
- xor r13,r9
- vpaddq xmm7,xmm7,xmm11
- add rax,QWORD[120+rsp]
- mov rdi,rbx
- xor r12,r11
- ror r14,6
- vpaddq xmm10,xmm7,XMMWORD[96+rbp]
- xor rdi,rcx
- add rax,r12
- ror r13,14
- and r15,rdi
- xor r14,rbx
- add rax,r13
- xor r15,rcx
- ror r14,28
- add r8,rax
- add rax,r15
- mov r13,r8
- add r14,rax
- vmovdqa XMMWORD[112+rsp],xmm10
- cmp BYTE[135+rbp],0
- jne NEAR $L$xop_00_47
- ror r13,23
- mov rax,r14
- mov r12,r9
- ror r14,5
- xor r13,r8
- xor r12,r10
- ror r13,4
- xor r14,rax
- and r12,r8
- xor r13,r8
- add r11,QWORD[rsp]
- mov r15,rax
- xor r12,r10
- ror r14,6
- xor r15,rbx
- add r11,r12
- ror r13,14
- and rdi,r15
- xor r14,rax
- add r11,r13
- xor rdi,rbx
- ror r14,28
- add rdx,r11
- add r11,rdi
- mov r13,rdx
- add r14,r11
- ror r13,23
- mov r11,r14
- mov r12,r8
- ror r14,5
- xor r13,rdx
- xor r12,r9
- ror r13,4
- xor r14,r11
- and r12,rdx
- xor r13,rdx
- add r10,QWORD[8+rsp]
- mov rdi,r11
- xor r12,r9
- ror r14,6
- xor rdi,rax
- add r10,r12
- ror r13,14
- and r15,rdi
- xor r14,r11
- add r10,r13
- xor r15,rax
- ror r14,28
- add rcx,r10
- add r10,r15
- mov r13,rcx
- add r14,r10
- ror r13,23
- mov r10,r14
- mov r12,rdx
- ror r14,5
- xor r13,rcx
- xor r12,r8
- ror r13,4
- xor r14,r10
- and r12,rcx
- xor r13,rcx
- add r9,QWORD[16+rsp]
- mov r15,r10
- xor r12,r8
- ror r14,6
- xor r15,r11
- add r9,r12
- ror r13,14
- and rdi,r15
- xor r14,r10
- add r9,r13
- xor rdi,r11
- ror r14,28
- add rbx,r9
- add r9,rdi
- mov r13,rbx
- add r14,r9
- ror r13,23
- mov r9,r14
- mov r12,rcx
- ror r14,5
- xor r13,rbx
- xor r12,rdx
- ror r13,4
- xor r14,r9
- and r12,rbx
- xor r13,rbx
- add r8,QWORD[24+rsp]
- mov rdi,r9
- xor r12,rdx
- ror r14,6
- xor rdi,r10
- add r8,r12
- ror r13,14
- and r15,rdi
- xor r14,r9
- add r8,r13
- xor r15,r10
- ror r14,28
- add rax,r8
- add r8,r15
- mov r13,rax
- add r14,r8
- ror r13,23
- mov r8,r14
- mov r12,rbx
- ror r14,5
- xor r13,rax
- xor r12,rcx
- ror r13,4
- xor r14,r8
- and r12,rax
- xor r13,rax
- add rdx,QWORD[32+rsp]
- mov r15,r8
- xor r12,rcx
- ror r14,6
- xor r15,r9
- add rdx,r12
- ror r13,14
- and rdi,r15
- xor r14,r8
- add rdx,r13
- xor rdi,r9
- ror r14,28
- add r11,rdx
- add rdx,rdi
- mov r13,r11
- add r14,rdx
- ror r13,23
- mov rdx,r14
- mov r12,rax
- ror r14,5
- xor r13,r11
- xor r12,rbx
- ror r13,4
- xor r14,rdx
- and r12,r11
- xor r13,r11
- add rcx,QWORD[40+rsp]
- mov rdi,rdx
- xor r12,rbx
- ror r14,6
- xor rdi,r8
- add rcx,r12
- ror r13,14
- and r15,rdi
- xor r14,rdx
- add rcx,r13
- xor r15,r8
- ror r14,28
- add r10,rcx
- add rcx,r15
- mov r13,r10
- add r14,rcx
- ror r13,23
- mov rcx,r14
- mov r12,r11
- ror r14,5
- xor r13,r10
- xor r12,rax
- ror r13,4
- xor r14,rcx
- and r12,r10
- xor r13,r10
- add rbx,QWORD[48+rsp]
- mov r15,rcx
- xor r12,rax
- ror r14,6
- xor r15,rdx
- add rbx,r12
- ror r13,14
- and rdi,r15
- xor r14,rcx
- add rbx,r13
- xor rdi,rdx
- ror r14,28
- add r9,rbx
- add rbx,rdi
- mov r13,r9
- add r14,rbx
- ror r13,23
- mov rbx,r14
- mov r12,r10
- ror r14,5
- xor r13,r9
- xor r12,r11
- ror r13,4
- xor r14,rbx
- and r12,r9
- xor r13,r9
- add rax,QWORD[56+rsp]
- mov rdi,rbx
- xor r12,r11
- ror r14,6
- xor rdi,rcx
- add rax,r12
- ror r13,14
- and r15,rdi
- xor r14,rbx
- add rax,r13
- xor r15,rcx
- ror r14,28
- add r8,rax
- add rax,r15
- mov r13,r8
- add r14,rax
- ror r13,23
- mov rax,r14
- mov r12,r9
- ror r14,5
- xor r13,r8
- xor r12,r10
- ror r13,4
- xor r14,rax
- and r12,r8
- xor r13,r8
- add r11,QWORD[64+rsp]
- mov r15,rax
- xor r12,r10
- ror r14,6
- xor r15,rbx
- add r11,r12
- ror r13,14
- and rdi,r15
- xor r14,rax
- add r11,r13
- xor rdi,rbx
- ror r14,28
- add rdx,r11
- add r11,rdi
- mov r13,rdx
- add r14,r11
- ror r13,23
- mov r11,r14
- mov r12,r8
- ror r14,5
- xor r13,rdx
- xor r12,r9
- ror r13,4
- xor r14,r11
- and r12,rdx
- xor r13,rdx
- add r10,QWORD[72+rsp]
- mov rdi,r11
- xor r12,r9
- ror r14,6
- xor rdi,rax
- add r10,r12
- ror r13,14
- and r15,rdi
- xor r14,r11
- add r10,r13
- xor r15,rax
- ror r14,28
- add rcx,r10
- add r10,r15
- mov r13,rcx
- add r14,r10
- ror r13,23
- mov r10,r14
- mov r12,rdx
- ror r14,5
- xor r13,rcx
- xor r12,r8
- ror r13,4
- xor r14,r10
- and r12,rcx
- xor r13,rcx
- add r9,QWORD[80+rsp]
- mov r15,r10
- xor r12,r8
- ror r14,6
- xor r15,r11
- add r9,r12
- ror r13,14
- and rdi,r15
- xor r14,r10
- add r9,r13
- xor rdi,r11
- ror r14,28
- add rbx,r9
- add r9,rdi
- mov r13,rbx
- add r14,r9
- ror r13,23
- mov r9,r14
- mov r12,rcx
- ror r14,5
- xor r13,rbx
- xor r12,rdx
- ror r13,4
- xor r14,r9
- and r12,rbx
- xor r13,rbx
- add r8,QWORD[88+rsp]
- mov rdi,r9
- xor r12,rdx
- ror r14,6
- xor rdi,r10
- add r8,r12
- ror r13,14
- and r15,rdi
- xor r14,r9
- add r8,r13
- xor r15,r10
- ror r14,28
- add rax,r8
- add r8,r15
- mov r13,rax
- add r14,r8
- ror r13,23
- mov r8,r14
- mov r12,rbx
- ror r14,5
- xor r13,rax
- xor r12,rcx
- ror r13,4
- xor r14,r8
- and r12,rax
- xor r13,rax
- add rdx,QWORD[96+rsp]
- mov r15,r8
- xor r12,rcx
- ror r14,6
- xor r15,r9
- add rdx,r12
- ror r13,14
- and rdi,r15
- xor r14,r8
- add rdx,r13
- xor rdi,r9
- ror r14,28
- add r11,rdx
- add rdx,rdi
- mov r13,r11
- add r14,rdx
- ror r13,23
- mov rdx,r14
- mov r12,rax
- ror r14,5
- xor r13,r11
- xor r12,rbx
- ror r13,4
- xor r14,rdx
- and r12,r11
- xor r13,r11
- add rcx,QWORD[104+rsp]
- mov rdi,rdx
- xor r12,rbx
- ror r14,6
- xor rdi,r8
- add rcx,r12
- ror r13,14
- and r15,rdi
- xor r14,rdx
- add rcx,r13
- xor r15,r8
- ror r14,28
- add r10,rcx
- add rcx,r15
- mov r13,r10
- add r14,rcx
- ror r13,23
- mov rcx,r14
- mov r12,r11
- ror r14,5
- xor r13,r10
- xor r12,rax
- ror r13,4
- xor r14,rcx
- and r12,r10
- xor r13,r10
- add rbx,QWORD[112+rsp]
- mov r15,rcx
- xor r12,rax
- ror r14,6
- xor r15,rdx
- add rbx,r12
- ror r13,14
- and rdi,r15
- xor r14,rcx
- add rbx,r13
- xor rdi,rdx
- ror r14,28
- add r9,rbx
- add rbx,rdi
- mov r13,r9
- add r14,rbx
- ror r13,23
- mov rbx,r14
- mov r12,r10
- ror r14,5
- xor r13,r9
- xor r12,r11
- ror r13,4
- xor r14,rbx
- and r12,r9
- xor r13,r9
- add rax,QWORD[120+rsp]
- mov rdi,rbx
- xor r12,r11
- ror r14,6
- xor rdi,rcx
- add rax,r12
- ror r13,14
- and r15,rdi
- xor r14,rbx
- add rax,r13
- xor r15,rcx
- ror r14,28
- add r8,rax
- add rax,r15
- mov r13,r8
- add r14,rax
- mov rdi,QWORD[((128+0))+rsp]
- mov rax,r14
-
- add rax,QWORD[rdi]
- lea rsi,[128+rsi]
- add rbx,QWORD[8+rdi]
- add rcx,QWORD[16+rdi]
- add rdx,QWORD[24+rdi]
- add r8,QWORD[32+rdi]
- add r9,QWORD[40+rdi]
- add r10,QWORD[48+rdi]
- add r11,QWORD[56+rdi]
-
- cmp rsi,QWORD[((128+16))+rsp]
-
- mov QWORD[rdi],rax
- mov QWORD[8+rdi],rbx
- mov QWORD[16+rdi],rcx
- mov QWORD[24+rdi],rdx
- mov QWORD[32+rdi],r8
- mov QWORD[40+rdi],r9
- mov QWORD[48+rdi],r10
- mov QWORD[56+rdi],r11
- jb NEAR $L$loop_xop
-
- mov rsi,QWORD[152+rsp]
-
- vzeroupper
- movaps xmm6,XMMWORD[((128+32))+rsp]
- movaps xmm7,XMMWORD[((128+48))+rsp]
- movaps xmm8,XMMWORD[((128+64))+rsp]
- movaps xmm9,XMMWORD[((128+80))+rsp]
- movaps xmm10,XMMWORD[((128+96))+rsp]
- movaps xmm11,XMMWORD[((128+112))+rsp]
- mov r15,QWORD[((-48))+rsi]
-
- mov r14,QWORD[((-40))+rsi]
-
- mov r13,QWORD[((-32))+rsi]
-
- mov r12,QWORD[((-24))+rsi]
-
- mov rbp,QWORD[((-16))+rsi]
-
- mov rbx,QWORD[((-8))+rsi]
-
- lea rsp,[rsi]
-
-$L$epilogue_xop:
- mov rdi,QWORD[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD[16+rsp]
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_sha512_block_data_order_xop:
-
-ALIGN 64
sha512_block_data_order_avx:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
@@ -4250,9 +3124,6 @@ ALIGN 4
DD $L$SEH_begin_sha512_block_data_order wrt ..imagebase
DD $L$SEH_end_sha512_block_data_order wrt ..imagebase
DD $L$SEH_info_sha512_block_data_order wrt ..imagebase
- DD $L$SEH_begin_sha512_block_data_order_xop wrt ..imagebase
- DD $L$SEH_end_sha512_block_data_order_xop wrt ..imagebase
- DD $L$SEH_info_sha512_block_data_order_xop wrt ..imagebase
DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase
DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase
DD $L$SEH_info_sha512_block_data_order_avx wrt ..imagebase
@@ -4262,10 +3133,6 @@ $L$SEH_info_sha512_block_data_order:
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase
-$L$SEH_info_sha512_block_data_order_xop:
-DB 9,0,0,0
- DD se_handler wrt ..imagebase
- DD $L$prologue_xop wrt ..imagebase,$L$epilogue_xop wrt ..imagebase
$L$SEH_info_sha512_block_data_order_avx:
DB 9,0,0,0
DD se_handler wrt ..imagebase