diff options
author | Robert Sloan <varomodt@google.com> | 2018-12-19 01:46:14 -0800 |
---|---|---|
committer | android-build-merger <android-build-merger@google.com> | 2018-12-19 01:46:14 -0800 |
commit | 8aa0d177a330be513f42081bc0017a354ca2b2c0 (patch) | |
tree | 37e1b45a3f85d38626637fe6dd2bdca7426d4ba5 | |
parent | de1a5d938d7ae916949f39a7c4cc4dc41356f66e (diff) | |
parent | 11c28bd346323429220e1d3de42163868d83d0cd (diff) | |
download | boringssl-8aa0d177a330be513f42081bc0017a354ca2b2c0.tar.gz |
external/boringssl: Sync to 41c10e2b5f37edce8b9f292f7f3bacb7e30e25c4.
am: 11c28bd346
Change-Id: I3708b6284ba6ad5de6d3b87ab64179922c5126f1
58 files changed, 13604 insertions, 6970 deletions
diff --git a/BORINGSSL_REVISION b/BORINGSSL_REVISION index d6108a3e..f3ca0a33 100644 --- a/BORINGSSL_REVISION +++ b/BORINGSSL_REVISION @@ -1 +1 @@ -0f5ecd3a854546d943104e1f7421e489b7f4d5aa +41c10e2b5f37edce8b9f292f7f3bacb7e30e25c4 @@ -118,6 +118,7 @@ crypto_sources := \ src/crypto/fipsmodule/bcm.c\ src/crypto/fipsmodule/is_fips.c\ src/crypto/hkdf/hkdf.c\ + src/crypto/hrss/hrss.c\ src/crypto/lhash/lhash.c\ src/crypto/mem.c\ src/crypto/obj/obj.c\ @@ -354,4 +355,5 @@ linux_x86_64_sources := \ linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S\ linux-x86_64/crypto/fipsmodule/x86_64-mont.S\ linux-x86_64/crypto/fipsmodule/x86_64-mont5.S\ + src/crypto/hrss/asm/poly_rq_mul.S\ diff --git a/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S index 7dd3161b..36c01ef9 100644 --- a/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S +++ b/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S @@ -1576,982 +1576,6 @@ bsaes_ctr32_encrypt_blocks: .byte 0xf3,0xc3 .cfi_endproc .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks -.globl bsaes_xts_encrypt -.hidden bsaes_xts_encrypt -.type bsaes_xts_encrypt,@function -.align 16 -bsaes_xts_encrypt: -.cfi_startproc - movq %rsp,%rax -.Lxts_enc_prologue: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - leaq -72(%rsp),%rsp -.cfi_adjust_cfa_offset 0x48 - movq %rsp,%rbp -.cfi_def_cfa_register %rbp - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - - leaq (%r9),%rdi - leaq 32(%rbp),%rsi - leaq (%r8),%rdx - call aes_nohw_encrypt - - movl 240(%r15),%eax - movq %r14,%rbx - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl 
%edx,%r10d - call _bsaes_key_convert - pxor %xmm6,%xmm7 - movdqa %xmm7,(%rax) - - andq $-16,%r14 - subq $0x80,%rsp - movdqa 32(%rbp),%xmm6 - - pxor %xmm14,%xmm14 - movdqa .Lxts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - - subq $0x80,%r14 - jc .Lxts_enc_short - jmp .Lxts_enc_loop - -.align 16 -.Lxts_enc_loop: - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - movdqa %xmm6,112(%rsp) - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - pxor %xmm14,%xmm6 - movl 
%edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - pxor 96(%rsp),%xmm1 - movdqu %xmm6,80(%r13) - pxor 112(%rsp),%xmm4 - movdqu %xmm1,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - pxor %xmm14,%xmm14 - movdqa .Lxts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - - subq $0x80,%r14 - jnc .Lxts_enc_loop - -.Lxts_enc_short: - addq $0x80,%r14 - jz .Lxts_enc_done - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - cmpq $16,%r14 - je .Lxts_enc_1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - cmpq $32,%r14 - je .Lxts_enc_2 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - cmpq $48,%r14 - je .Lxts_enc_3 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - cmpq $64,%r14 - je .Lxts_enc_4 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 
- movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - cmpq $80,%r14 - je .Lxts_enc_5 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - cmpq $96,%r14 - je .Lxts_enc_6 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqa %xmm6,112(%rsp) - leaq 112(%r12),%r12 - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - pxor 96(%rsp),%xmm1 - movdqu %xmm6,80(%r13) - movdqu %xmm1,96(%r13) - leaq 112(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_6: - pxor %xmm11,%xmm3 - leaq 96(%r12),%r12 - pxor %xmm12,%xmm4 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - movdqu %xmm6,80(%r13) - leaq 96(%r13),%r13 - - movdqa 96(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_5: - pxor %xmm10,%xmm2 - leaq 80(%r12),%r12 - pxor %xmm11,%xmm3 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - movdqu %xmm2,64(%r13) - leaq 80(%r13),%r13 - - movdqa 80(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 
-.Lxts_enc_4: - pxor %xmm9,%xmm1 - leaq 64(%r12),%r12 - pxor %xmm10,%xmm2 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - movdqu %xmm5,48(%r13) - leaq 64(%r13),%r13 - - movdqa 64(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_3: - pxor %xmm8,%xmm0 - leaq 48(%r12),%r12 - pxor %xmm9,%xmm1 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - movdqu %xmm3,32(%r13) - leaq 48(%r13),%r13 - - movdqa 48(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_2: - pxor %xmm7,%xmm15 - leaq 32(%r12),%r12 - pxor %xmm8,%xmm0 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - leaq 32(%r13),%r13 - - movdqa 32(%rsp),%xmm6 - jmp .Lxts_enc_done -.align 16 -.Lxts_enc_1: - pxor %xmm15,%xmm7 - leaq 16(%r12),%r12 - movdqa %xmm7,32(%rbp) - leaq 32(%rbp),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call aes_nohw_encrypt - pxor 32(%rbp),%xmm15 - - - - - - movdqu %xmm15,0(%r13) - leaq 16(%r13),%r13 - - movdqa 16(%rsp),%xmm6 - -.Lxts_enc_done: - andl $15,%ebx - jz .Lxts_enc_ret - movq %r13,%rdx - -.Lxts_enc_steal: - movzbl (%r12),%eax - movzbl -16(%rdx),%ecx - leaq 1(%r12),%r12 - movb %al,-16(%rdx) - movb %cl,0(%rdx) - leaq 1(%rdx),%rdx - subl $1,%ebx - jnz .Lxts_enc_steal - - movdqu -16(%r13),%xmm15 - leaq 32(%rbp),%rdi - pxor %xmm6,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call aes_nohw_encrypt - pxor 32(%rbp),%xmm6 - movdqu %xmm6,-16(%r13) - -.Lxts_enc_ret: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -.Lxts_enc_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja .Lxts_enc_bzero - - leaq 
120(%rbp),%rax -.cfi_def_cfa %rax,8 - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbx -.cfi_restore %rbx - movq -8(%rax),%rbp -.cfi_restore %rbp - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lxts_enc_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size bsaes_xts_encrypt,.-bsaes_xts_encrypt - -.globl bsaes_xts_decrypt -.hidden bsaes_xts_decrypt -.type bsaes_xts_decrypt,@function -.align 16 -bsaes_xts_decrypt: -.cfi_startproc - movq %rsp,%rax -.Lxts_dec_prologue: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - leaq -72(%rsp),%rsp -.cfi_adjust_cfa_offset 0x48 - movq %rsp,%rbp - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - - leaq (%r9),%rdi - leaq 32(%rbp),%rsi - leaq (%r8),%rdx - call aes_nohw_encrypt - - movl 240(%r15),%eax - movq %r14,%rbx - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %edx,%r10d - call _bsaes_key_convert - pxor (%rsp),%xmm7 - movdqa %xmm6,(%rax) - movdqa %xmm7,(%rsp) - - xorl %eax,%eax - andq $-16,%r14 - testl $15,%ebx - setnz %al - shlq $4,%rax - subq %rax,%r14 - - subq $0x80,%rsp - movdqa 32(%rbp),%xmm6 - - pxor %xmm14,%xmm14 - movdqa .Lxts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - - subq $0x80,%r14 - jc .Lxts_dec_short - jmp .Lxts_dec_loop - -.align 16 -.Lxts_dec_loop: - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa 
%xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - movdqa %xmm6,112(%rsp) - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - pxor %xmm14,%xmm6 - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - pxor 96(%rsp),%xmm2 - movdqu %xmm6,80(%r13) - pxor 112(%rsp),%xmm4 - movdqu %xmm2,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - pxor %xmm14,%xmm14 - movdqa .Lxts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd 
$0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - - subq $0x80,%r14 - jnc .Lxts_dec_loop - -.Lxts_dec_short: - addq $0x80,%r14 - jz .Lxts_dec_done - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - cmpq $16,%r14 - je .Lxts_dec_1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - cmpq $32,%r14 - je .Lxts_dec_2 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - cmpq $48,%r14 - je .Lxts_dec_3 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - cmpq $64,%r14 - je .Lxts_dec_4 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - cmpq $80,%r14 - je .Lxts_dec_5 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - cmpq $96,%r14 - je .Lxts_dec_6 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqa %xmm6,112(%rsp) - leaq 
112(%r12),%r12 - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - pxor 96(%rsp),%xmm2 - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - leaq 112(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_6: - pxor %xmm11,%xmm3 - leaq 96(%r12),%r12 - pxor %xmm12,%xmm4 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - leaq 96(%r13),%r13 - - movdqa 96(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_5: - pxor %xmm10,%xmm2 - leaq 80(%r12),%r12 - pxor %xmm11,%xmm3 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - leaq 80(%r13),%r13 - - movdqa 80(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_4: - pxor %xmm9,%xmm1 - leaq 64(%r12),%r12 - pxor %xmm10,%xmm2 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - leaq 64(%r13),%r13 - - movdqa 64(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_3: - pxor %xmm8,%xmm0 - leaq 48(%r12),%r12 - pxor %xmm9,%xmm1 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - 
pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - leaq 48(%r13),%r13 - - movdqa 48(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_2: - pxor %xmm7,%xmm15 - leaq 32(%r12),%r12 - pxor %xmm8,%xmm0 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - leaq 32(%r13),%r13 - - movdqa 32(%rsp),%xmm6 - jmp .Lxts_dec_done -.align 16 -.Lxts_dec_1: - pxor %xmm15,%xmm7 - leaq 16(%r12),%r12 - movdqa %xmm7,32(%rbp) - leaq 32(%rbp),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call aes_nohw_decrypt - pxor 32(%rbp),%xmm15 - - - - - - movdqu %xmm15,0(%r13) - leaq 16(%r13),%r13 - - movdqa 16(%rsp),%xmm6 - -.Lxts_dec_done: - andl $15,%ebx - jz .Lxts_dec_ret - - pxor %xmm14,%xmm14 - movdqa .Lxts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd $0x13,%xmm14,%xmm13 - movdqa %xmm6,%xmm5 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - movdqu (%r12),%xmm15 - pxor %xmm13,%xmm6 - - leaq 32(%rbp),%rdi - pxor %xmm6,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call aes_nohw_decrypt - pxor 32(%rbp),%xmm6 - movq %r13,%rdx - movdqu %xmm6,(%r13) - -.Lxts_dec_steal: - movzbl 16(%r12),%eax - movzbl (%rdx),%ecx - leaq 1(%r12),%r12 - movb %al,(%rdx) - movb %cl,16(%rdx) - leaq 1(%rdx),%rdx - subl $1,%ebx - jnz .Lxts_dec_steal - - movdqu (%r13),%xmm15 - leaq 32(%rbp),%rdi - pxor %xmm5,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call aes_nohw_decrypt - pxor 32(%rbp),%xmm5 - movdqu %xmm5,(%r13) - -.Lxts_dec_ret: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -.Lxts_dec_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja .Lxts_dec_bzero - - leaq 120(%rbp),%rax -.cfi_def_cfa %rax,8 - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq 
-24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbx -.cfi_restore %rbx - movq -8(%rax),%rbp -.cfi_restore %rbp - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lxts_dec_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size bsaes_xts_decrypt,.-bsaes_xts_decrypt .type _bsaes_const,@object .align 64 _bsaes_const: diff --git a/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S b/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S index 3c47199f..509e144e 100644 --- a/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S +++ b/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S @@ -25,8 +25,6 @@ sha512_block_data_order: movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d - testl $2048,%r10d - jnz .Lxop_shortcut andl $1073741824,%r9d andl $268435968,%r10d orl %r9d,%r10d @@ -1825,1107 +1823,6 @@ K512: .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.type sha512_block_data_order_xop,@function -.align 64 -sha512_block_data_order_xop: -.cfi_startproc -.Lxop_shortcut: - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - shlq $4,%rdx - subq $160,%rsp - leaq (%rsi,%rdx,8),%rdx - andq $-64,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %rax,152(%rsp) -.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 -.Lprologue_xop: - - vzeroupper - movq 0(%rdi),%rax - movq 8(%rdi),%rbx - movq 16(%rdi),%rcx - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp .Lloop_xop -.align 16 -.Lloop_xop: - vmovdqa K512+1280(%rip),%xmm11 - vmovdqu 
0(%rsi),%xmm0 - leaq K512+128(%rip),%rbp - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vpshufb %xmm11,%xmm0,%xmm0 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm11,%xmm1,%xmm1 - vmovdqu 64(%rsi),%xmm4 - vpshufb %xmm11,%xmm2,%xmm2 - vmovdqu 80(%rsi),%xmm5 - vpshufb %xmm11,%xmm3,%xmm3 - vmovdqu 96(%rsi),%xmm6 - vpshufb %xmm11,%xmm4,%xmm4 - vmovdqu 112(%rsi),%xmm7 - vpshufb %xmm11,%xmm5,%xmm5 - vpaddq -128(%rbp),%xmm0,%xmm8 - vpshufb %xmm11,%xmm6,%xmm6 - vpaddq -96(%rbp),%xmm1,%xmm9 - vpshufb %xmm11,%xmm7,%xmm7 - vpaddq -64(%rbp),%xmm2,%xmm10 - vpaddq -32(%rbp),%xmm3,%xmm11 - vmovdqa %xmm8,0(%rsp) - vpaddq 0(%rbp),%xmm4,%xmm8 - vmovdqa %xmm9,16(%rsp) - vpaddq 32(%rbp),%xmm5,%xmm9 - vmovdqa %xmm10,32(%rsp) - vpaddq 64(%rbp),%xmm6,%xmm10 - vmovdqa %xmm11,48(%rsp) - vpaddq 96(%rbp),%xmm7,%xmm11 - vmovdqa %xmm8,64(%rsp) - movq %rax,%r14 - vmovdqa %xmm9,80(%rsp) - movq %rbx,%rdi - vmovdqa %xmm10,96(%rsp) - xorq %rcx,%rdi - vmovdqa %xmm11,112(%rsp) - movq %r8,%r13 - jmp .Lxop_00_47 - -.align 16 -.Lxop_00_47: - addq $256,%rbp - vpalignr $8,%xmm0,%xmm1,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr $8,%xmm4,%xmm5,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm0,%xmm0 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,223,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm7,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm0,%xmm0 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq 
%rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm0,%xmm0 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq -128(%rbp),%xmm0,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,0(%rsp) - vpalignr $8,%xmm1,%xmm2,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm5,%xmm6,%xmm11 - movq %rdx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r10,%r14 - vpaddq %xmm11,%xmm1,%xmm1 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,216,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm0,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm1,%xmm1 - movq %rbx,%r13 - addq %r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r9 - vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm1,%xmm1 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq -96(%rbp),%xmm1,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,16(%rsp) - vpalignr $8,%xmm2,%xmm3,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm6,%xmm7,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm2,%xmm2 - 
andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,217,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm1,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - vpaddq %xmm8,%xmm2,%xmm2 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm2,%xmm2 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq -64(%rbp),%xmm2,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,32(%rsp) - vpalignr $8,%xmm3,%xmm4,%xmm8 - rorq $23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm7,%xmm0,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm3,%xmm3 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,218,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm2,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm3,%xmm3 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - 
rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm3,%xmm3 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - vpaddq -32(%rbp),%xmm3,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,48(%rsp) - vpalignr $8,%xmm4,%xmm5,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr $8,%xmm0,%xmm1,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm4,%xmm4 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,219,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm3,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm4,%xmm4 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm4,%xmm4 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq 0(%rbp),%xmm4,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,64(%rsp) - vpalignr $8,%xmm5,%xmm6,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm1,%xmm2,%xmm11 - movq %rdx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - 
xorq %r10,%r14 - vpaddq %xmm11,%xmm5,%xmm5 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,220,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm4,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm5,%xmm5 - movq %rbx,%r13 - addq %r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r9 - vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm5,%xmm5 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq 32(%rbp),%xmm5,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,80(%rsp) - vpalignr $8,%xmm6,%xmm7,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm2,%xmm3,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm6,%xmm6 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,221,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm5,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - vpaddq %xmm8,%xmm6,%xmm6 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - 
vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm6,%xmm6 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq 64(%rbp),%xmm6,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,96(%rsp) - vpalignr $8,%xmm7,%xmm0,%xmm8 - rorq $23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm3,%xmm4,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm7,%xmm7 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,222,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm6,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm7,%xmm7 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm7,%xmm7 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - vpaddq 96(%rbp),%xmm7,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,112(%rsp) - cmpb $0,135(%rbp) - jne .Lxop_00_47 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - 
addq 0(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq 
$23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq $5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq $4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - 
rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq $23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq $5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq 
$4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - movq 128+0(%rsp),%rdi - movq %r14,%rax - - addq 0(%rdi),%rax - leaq 128(%rsi),%rsi - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - jb .Lloop_xop - - movq 152(%rsp),%rsi -.cfi_def_cfa %rsi,8 - vzeroupper - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_xop: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha512_block_data_order_xop,.-sha512_block_data_order_xop .type sha512_block_data_order_avx,@function .align 64 sha512_block_data_order_avx: diff --git a/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S b/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S index d0668ca2..0149e0e5 100644 --- a/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S +++ b/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S @@ -1561,970 
+1561,6 @@ L$ctr_enc_epilogue: .byte 0xf3,0xc3 -.globl _bsaes_xts_encrypt -.private_extern _bsaes_xts_encrypt - -.p2align 4 -_bsaes_xts_encrypt: - - movq %rsp,%rax -L$xts_enc_prologue: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - leaq -72(%rsp),%rsp - - movq %rsp,%rbp - - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - - leaq (%r9),%rdi - leaq 32(%rbp),%rsi - leaq (%r8),%rdx - call _aes_nohw_encrypt - - movl 240(%r15),%eax - movq %r14,%rbx - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %edx,%r10d - call _bsaes_key_convert - pxor %xmm6,%xmm7 - movdqa %xmm7,(%rax) - - andq $-16,%r14 - subq $0x80,%rsp - movdqa 32(%rbp),%xmm6 - - pxor %xmm14,%xmm14 - movdqa L$xts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - - subq $0x80,%r14 - jc L$xts_enc_short - jmp L$xts_enc_loop - -.p2align 4 -L$xts_enc_loop: - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm1 - pshufd 
$0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - movdqa %xmm6,112(%rsp) - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - pxor %xmm14,%xmm6 - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - pxor 96(%rsp),%xmm1 - movdqu %xmm6,80(%r13) - pxor 112(%rsp),%xmm4 - movdqu %xmm1,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - pxor %xmm14,%xmm14 - movdqa L$xts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - - subq $0x80,%r14 - jnc L$xts_enc_loop - -L$xts_enc_short: - addq $0x80,%r14 - jz L$xts_enc_done - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - cmpq $16,%r14 - je L$xts_enc_1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 
16(%r12),%xmm8 - cmpq $32,%r14 - je L$xts_enc_2 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - cmpq $48,%r14 - je L$xts_enc_3 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - cmpq $64,%r14 - je L$xts_enc_4 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - cmpq $80,%r14 - je L$xts_enc_5 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - cmpq $96,%r14 - je L$xts_enc_6 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqa %xmm6,112(%rsp) - leaq 112(%r12),%r12 - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - pxor 96(%rsp),%xmm1 - movdqu %xmm6,80(%r13) - movdqu %xmm1,96(%r13) - leaq 112(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_6: - pxor %xmm11,%xmm3 - leaq 96(%r12),%r12 - pxor %xmm12,%xmm4 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 
- movdqu %xmm5,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm2,64(%r13) - movdqu %xmm6,80(%r13) - leaq 96(%r13),%r13 - - movdqa 96(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_5: - pxor %xmm10,%xmm2 - leaq 80(%r12),%r12 - pxor %xmm11,%xmm3 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - pxor 64(%rsp),%xmm2 - movdqu %xmm5,48(%r13) - movdqu %xmm2,64(%r13) - leaq 80(%r13),%r13 - - movdqa 80(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_4: - pxor %xmm9,%xmm1 - leaq 64(%r12),%r12 - pxor %xmm10,%xmm2 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm5 - movdqu %xmm3,32(%r13) - movdqu %xmm5,48(%r13) - leaq 64(%r13),%r13 - - movdqa 64(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_3: - pxor %xmm8,%xmm0 - leaq 48(%r12),%r12 - pxor %xmm9,%xmm1 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm3 - movdqu %xmm0,16(%r13) - movdqu %xmm3,32(%r13) - leaq 48(%r13),%r13 - - movdqa 48(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_2: - pxor %xmm7,%xmm15 - leaq 32(%r12),%r12 - pxor %xmm8,%xmm0 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_encrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - leaq 32(%r13),%r13 - - movdqa 32(%rsp),%xmm6 - jmp L$xts_enc_done -.p2align 4 -L$xts_enc_1: - pxor %xmm15,%xmm7 - leaq 16(%r12),%r12 - movdqa %xmm7,32(%rbp) - leaq 32(%rbp),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call _aes_nohw_encrypt - pxor 32(%rbp),%xmm15 - - - - - - movdqu %xmm15,0(%r13) - leaq 16(%r13),%r13 - - movdqa 16(%rsp),%xmm6 - -L$xts_enc_done: - andl 
$15,%ebx - jz L$xts_enc_ret - movq %r13,%rdx - -L$xts_enc_steal: - movzbl (%r12),%eax - movzbl -16(%rdx),%ecx - leaq 1(%r12),%r12 - movb %al,-16(%rdx) - movb %cl,0(%rdx) - leaq 1(%rdx),%rdx - subl $1,%ebx - jnz L$xts_enc_steal - - movdqu -16(%r13),%xmm15 - leaq 32(%rbp),%rdi - pxor %xmm6,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call _aes_nohw_encrypt - pxor 32(%rbp),%xmm6 - movdqu %xmm6,-16(%r13) - -L$xts_enc_ret: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -L$xts_enc_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja L$xts_enc_bzero - - leaq 120(%rbp),%rax - - movq -48(%rax),%r15 - - movq -40(%rax),%r14 - - movq -32(%rax),%r13 - - movq -24(%rax),%r12 - - movq -16(%rax),%rbx - - movq -8(%rax),%rbp - - leaq (%rax),%rsp - -L$xts_enc_epilogue: - .byte 0xf3,0xc3 - - - -.globl _bsaes_xts_decrypt -.private_extern _bsaes_xts_decrypt - -.p2align 4 -_bsaes_xts_decrypt: - - movq %rsp,%rax -L$xts_dec_prologue: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - leaq -72(%rsp),%rsp - - movq %rsp,%rbp - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %rcx,%r15 - - leaq (%r9),%rdi - leaq 32(%rbp),%rsi - leaq (%r8),%rdx - call _aes_nohw_encrypt - - movl 240(%r15),%eax - movq %r14,%rbx - - movl %eax,%edx - shlq $7,%rax - subq $96,%rax - subq %rax,%rsp - - movq %rsp,%rax - movq %r15,%rcx - movl %edx,%r10d - call _bsaes_key_convert - pxor (%rsp),%xmm7 - movdqa %xmm6,(%rax) - movdqa %xmm7,(%rsp) - - xorl %eax,%eax - andq $-16,%r14 - testl $15,%ebx - setnz %al - shlq $4,%rax - subq %rax,%r14 - - subq $0x80,%rsp - movdqa 32(%rbp),%xmm6 - - pxor %xmm14,%xmm14 - movdqa L$xts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - - subq $0x80,%r14 - jc L$xts_dec_short - jmp L$xts_dec_loop - -.p2align 4 -L$xts_dec_loop: - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - 
pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - pxor %xmm11,%xmm3 - movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqu 112(%r12),%xmm14 - leaq 128(%r12),%r12 - movdqa %xmm6,112(%rsp) - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - pxor %xmm14,%xmm6 - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - pxor 96(%rsp),%xmm2 - movdqu %xmm6,80(%r13) - pxor 112(%rsp),%xmm4 - movdqu %xmm2,96(%r13) - movdqu %xmm4,112(%r13) - leaq 128(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - pxor 
%xmm14,%xmm14 - movdqa L$xts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - - subq $0x80,%r14 - jnc L$xts_dec_loop - -L$xts_dec_short: - addq $0x80,%r14 - jz L$xts_dec_done - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm15 - movdqa %xmm6,0(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm0 - movdqa %xmm6,16(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 0(%r12),%xmm7 - cmpq $16,%r14 - je L$xts_dec_1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm1 - movdqa %xmm6,32(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 16(%r12),%xmm8 - cmpq $32,%r14 - je L$xts_dec_2 - pxor %xmm7,%xmm15 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm2 - movdqa %xmm6,48(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 32(%r12),%xmm9 - cmpq $48,%r14 - je L$xts_dec_3 - pxor %xmm8,%xmm0 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm3 - movdqa %xmm6,64(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 48(%r12),%xmm10 - cmpq $64,%r14 - je L$xts_dec_4 - pxor %xmm9,%xmm1 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm4 - movdqa %xmm6,80(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 64(%r12),%xmm11 - cmpq $80,%r14 - je L$xts_dec_5 - pxor %xmm10,%xmm2 - pshufd $0x13,%xmm14,%xmm13 - pxor %xmm14,%xmm14 - movdqa %xmm6,%xmm5 - movdqa %xmm6,96(%rsp) - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - pcmpgtd %xmm6,%xmm14 - pxor %xmm13,%xmm6 - movdqu 80(%r12),%xmm12 - cmpq $96,%r14 - je L$xts_dec_6 - pxor %xmm11,%xmm3 
- movdqu 96(%r12),%xmm13 - pxor %xmm12,%xmm4 - movdqa %xmm6,112(%rsp) - leaq 112(%r12),%r12 - pxor %xmm13,%xmm5 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - pxor 96(%rsp),%xmm2 - movdqu %xmm6,80(%r13) - movdqu %xmm2,96(%r13) - leaq 112(%r13),%r13 - - movdqa 112(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_6: - pxor %xmm11,%xmm3 - leaq 96(%r12),%r12 - pxor %xmm12,%xmm4 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - pxor 80(%rsp),%xmm6 - movdqu %xmm1,64(%r13) - movdqu %xmm6,80(%r13) - leaq 96(%r13),%r13 - - movdqa 96(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_5: - pxor %xmm10,%xmm2 - leaq 80(%r12),%r12 - pxor %xmm11,%xmm3 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - pxor 64(%rsp),%xmm1 - movdqu %xmm3,48(%r13) - movdqu %xmm1,64(%r13) - leaq 80(%r13),%r13 - - movdqa 80(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_4: - pxor %xmm9,%xmm1 - leaq 64(%r12),%r12 - pxor %xmm10,%xmm2 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - pxor 48(%rsp),%xmm3 - movdqu %xmm5,32(%r13) - movdqu %xmm3,48(%r13) - leaq 64(%r13),%r13 - - movdqa 64(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_3: - pxor %xmm8,%xmm0 - leaq 48(%r12),%r12 - pxor 
%xmm9,%xmm1 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - pxor 32(%rsp),%xmm5 - movdqu %xmm0,16(%r13) - movdqu %xmm5,32(%r13) - leaq 48(%r13),%r13 - - movdqa 48(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_2: - pxor %xmm7,%xmm15 - leaq 32(%r12),%r12 - pxor %xmm8,%xmm0 - leaq 128(%rsp),%rax - movl %edx,%r10d - - call _bsaes_decrypt8 - - pxor 0(%rsp),%xmm15 - pxor 16(%rsp),%xmm0 - movdqu %xmm15,0(%r13) - movdqu %xmm0,16(%r13) - leaq 32(%r13),%r13 - - movdqa 32(%rsp),%xmm6 - jmp L$xts_dec_done -.p2align 4 -L$xts_dec_1: - pxor %xmm15,%xmm7 - leaq 16(%r12),%r12 - movdqa %xmm7,32(%rbp) - leaq 32(%rbp),%rdi - leaq 32(%rbp),%rsi - leaq (%r15),%rdx - call _aes_nohw_decrypt - pxor 32(%rbp),%xmm15 - - - - - - movdqu %xmm15,0(%r13) - leaq 16(%r13),%r13 - - movdqa 16(%rsp),%xmm6 - -L$xts_dec_done: - andl $15,%ebx - jz L$xts_dec_ret - - pxor %xmm14,%xmm14 - movdqa L$xts_magic(%rip),%xmm12 - pcmpgtd %xmm6,%xmm14 - pshufd $0x13,%xmm14,%xmm13 - movdqa %xmm6,%xmm5 - paddq %xmm6,%xmm6 - pand %xmm12,%xmm13 - movdqu (%r12),%xmm15 - pxor %xmm13,%xmm6 - - leaq 32(%rbp),%rdi - pxor %xmm6,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call _aes_nohw_decrypt - pxor 32(%rbp),%xmm6 - movq %r13,%rdx - movdqu %xmm6,(%r13) - -L$xts_dec_steal: - movzbl 16(%r12),%eax - movzbl (%rdx),%ecx - leaq 1(%r12),%r12 - movb %al,(%rdx) - movb %cl,16(%rdx) - leaq 1(%rdx),%rdx - subl $1,%ebx - jnz L$xts_dec_steal - - movdqu (%r13),%xmm15 - leaq 32(%rbp),%rdi - pxor %xmm5,%xmm15 - leaq 32(%rbp),%rsi - movdqa %xmm15,32(%rbp) - leaq (%r15),%rdx - call _aes_nohw_decrypt - pxor 32(%rbp),%xmm5 - movdqu %xmm5,(%r13) - -L$xts_dec_ret: - leaq (%rsp),%rax - pxor %xmm0,%xmm0 -L$xts_dec_bzero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - leaq 32(%rax),%rax - cmpq %rax,%rbp - ja L$xts_dec_bzero - - leaq 120(%rbp),%rax - - movq -48(%rax),%r15 - - movq -40(%rax),%r14 - - movq -32(%rax),%r13 - - 
movq -24(%rax),%r12 - - movq -16(%rax),%rbx - - movq -8(%rax),%rbp - - leaq (%rax),%rsp - -L$xts_dec_epilogue: - .byte 0xf3,0xc3 - - .p2align 6 _bsaes_const: diff --git a/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S b/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S index 8a6d16cd..c550e794 100644 --- a/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S +++ b/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S @@ -24,8 +24,6 @@ _sha512_block_data_order: movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d - testl $2048,%r10d - jnz L$xop_shortcut andl $1073741824,%r9d andl $268435968,%r10d orl %r9d,%r10d @@ -1826,1107 +1824,6 @@ K512: .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 -sha512_block_data_order_xop: - -L$xop_shortcut: - movq %rsp,%rax - - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - shlq $4,%rdx - subq $160,%rsp - leaq (%rsi,%rdx,8),%rdx - andq $-64,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %rax,152(%rsp) - -L$prologue_xop: - - vzeroupper - movq 0(%rdi),%rax - movq 8(%rdi),%rbx - movq 16(%rdi),%rcx - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp L$loop_xop -.p2align 4 -L$loop_xop: - vmovdqa K512+1280(%rip),%xmm11 - vmovdqu 0(%rsi),%xmm0 - leaq K512+128(%rip),%rbp - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vpshufb %xmm11,%xmm0,%xmm0 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm11,%xmm1,%xmm1 - vmovdqu 64(%rsi),%xmm4 - vpshufb %xmm11,%xmm2,%xmm2 - vmovdqu 80(%rsi),%xmm5 - vpshufb %xmm11,%xmm3,%xmm3 - vmovdqu 96(%rsi),%xmm6 - vpshufb %xmm11,%xmm4,%xmm4 - vmovdqu 112(%rsi),%xmm7 - vpshufb %xmm11,%xmm5,%xmm5 - vpaddq -128(%rbp),%xmm0,%xmm8 - vpshufb %xmm11,%xmm6,%xmm6 - vpaddq -96(%rbp),%xmm1,%xmm9 - vpshufb 
%xmm11,%xmm7,%xmm7 - vpaddq -64(%rbp),%xmm2,%xmm10 - vpaddq -32(%rbp),%xmm3,%xmm11 - vmovdqa %xmm8,0(%rsp) - vpaddq 0(%rbp),%xmm4,%xmm8 - vmovdqa %xmm9,16(%rsp) - vpaddq 32(%rbp),%xmm5,%xmm9 - vmovdqa %xmm10,32(%rsp) - vpaddq 64(%rbp),%xmm6,%xmm10 - vmovdqa %xmm11,48(%rsp) - vpaddq 96(%rbp),%xmm7,%xmm11 - vmovdqa %xmm8,64(%rsp) - movq %rax,%r14 - vmovdqa %xmm9,80(%rsp) - movq %rbx,%rdi - vmovdqa %xmm10,96(%rsp) - xorq %rcx,%rdi - vmovdqa %xmm11,112(%rsp) - movq %r8,%r13 - jmp L$xop_00_47 - -.p2align 4 -L$xop_00_47: - addq $256,%rbp - vpalignr $8,%xmm0,%xmm1,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr $8,%xmm4,%xmm5,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm0,%xmm0 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,223,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm7,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm0,%xmm0 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm0,%xmm0 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq -128(%rbp),%xmm0,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,0(%rsp) - vpalignr $8,%xmm1,%xmm2,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm5,%xmm6,%xmm11 - movq %rdx,%r12 - 
rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r10,%r14 - vpaddq %xmm11,%xmm1,%xmm1 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,216,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm0,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm1,%xmm1 - movq %rbx,%r13 - addq %r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r9 - vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm1,%xmm1 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq -96(%rbp),%xmm1,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,16(%rsp) - vpalignr $8,%xmm2,%xmm3,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm6,%xmm7,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm2,%xmm2 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,217,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm1,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - vpaddq %xmm8,%xmm2,%xmm2 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq 
$23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm2,%xmm2 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq -64(%rbp),%xmm2,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,32(%rsp) - vpalignr $8,%xmm3,%xmm4,%xmm8 - rorq $23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm7,%xmm0,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm3,%xmm3 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,218,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm2,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm3,%xmm3 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm3,%xmm3 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - vpaddq -32(%rbp),%xmm3,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,48(%rsp) - vpalignr $8,%xmm4,%xmm5,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr 
$8,%xmm0,%xmm1,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm4,%xmm4 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,219,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm3,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm4,%xmm4 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm4,%xmm4 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq 0(%rbp),%xmm4,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,64(%rsp) - vpalignr $8,%xmm5,%xmm6,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm1,%xmm2,%xmm11 - movq %rdx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r10,%r14 - vpaddq %xmm11,%xmm5,%xmm5 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,220,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm4,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm5,%xmm5 - movq %rbx,%r13 - addq 
%r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r9 - vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm5,%xmm5 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq 32(%rbp),%xmm5,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,80(%rsp) - vpalignr $8,%xmm6,%xmm7,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm2,%xmm3,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm6,%xmm6 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,221,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm5,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - vpaddq %xmm8,%xmm6,%xmm6 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm6,%xmm6 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq 64(%rbp),%xmm6,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,96(%rsp) - vpalignr $8,%xmm7,%xmm0,%xmm8 - rorq 
$23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm3,%xmm4,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm7,%xmm7 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,222,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm6,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm7,%xmm7 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm7,%xmm7 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - vpaddq 96(%rbp),%xmm7,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,112(%rsp) - cmpb $0,135(%rbp) - jne L$xop_00_47 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - 
rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq $23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq 
$5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq $4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq 
%r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq $23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq $5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq $4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 
- addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - movq 128+0(%rsp),%rdi - movq %r14,%rax - - addq 0(%rdi),%rax - leaq 128(%rsi),%rsi - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - jb L$loop_xop - - movq 152(%rsp),%rsi - - vzeroupper - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbp - - movq -8(%rsi),%rbx - - leaq (%rsi),%rsp - -L$epilogue_xop: - .byte 0xf3,0xc3 - - - -.p2align 6 sha512_block_data_order_avx: L$avx_shortcut: @@ -120,6 +120,7 @@ cc_defaults { "src/crypto/fipsmodule/bcm.c", "src/crypto/fipsmodule/is_fips.c", "src/crypto/hkdf/hkdf.c", + "src/crypto/hrss/hrss.c", "src/crypto/lhash/lhash.c", "src/crypto/mem.c", "src/crypto/obj/obj.c", @@ -305,6 +306,7 @@ cc_defaults { "linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S", "linux-x86_64/crypto/fipsmodule/x86_64-mont.S", "linux-x86_64/crypto/fipsmodule/x86_64-mont5.S", + "src/crypto/hrss/asm/poly_rq_mul.S", ], }, }, @@ -420,6 +422,7 @@ cc_defaults { "src/crypto/fipsmodule/rand/ctrdrbg_test.cc", "src/crypto/hkdf/hkdf_test.cc", "src/crypto/hmac_extra/hmac_test.cc", + "src/crypto/hrss/hrss_test.cc", "src/crypto/lhash/lhash_test.cc", "src/crypto/obj/obj_test.cc", "src/crypto/pem/pem_test.cc", @@ -118,6 +118,7 @@ crypto_sources := \ src/crypto/fipsmodule/bcm.c\ src/crypto/fipsmodule/is_fips.c\ src/crypto/hkdf/hkdf.c\ + src/crypto/hrss/hrss.c\ src/crypto/lhash/lhash.c\ src/crypto/mem.c\ src/crypto/obj/obj.c\ @@ -299,4 +300,5 @@ 
linux_x86_64_sources := \ linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S\ linux-x86_64/crypto/fipsmodule/x86_64-mont.S\ linux-x86_64/crypto/fipsmodule/x86_64-mont5.S\ + src/crypto/hrss/asm/poly_rq_mul.S\ diff --git a/src/crypto/CMakeLists.txt b/src/crypto/CMakeLists.txt index b1ca70e1..bf696493 100644 --- a/src/crypto/CMakeLists.txt +++ b/src/crypto/CMakeLists.txt @@ -131,6 +131,7 @@ if(${ARCH} STREQUAL "x86_64") chacha/chacha-x86_64.${ASM_EXT} cipher_extra/aes128gcmsiv-x86_64.${ASM_EXT} cipher_extra/chacha20_poly1305_x86_64.${ASM_EXT} + hrss/asm/poly_rq_mul.S ) endif() @@ -275,6 +276,7 @@ add_library( evp/sign.c ex_data.c hkdf/hkdf.c + hrss/hrss.c lhash/lhash.c mem.c obj/obj.c @@ -455,6 +457,7 @@ add_executable( fipsmodule/rand/ctrdrbg_test.cc hkdf/hkdf_test.cc hmac_extra/hmac_test.cc + hrss/hrss_test.cc lhash/lhash_test.cc obj/obj_test.cc pem/pem_test.cc diff --git a/src/crypto/bio/bio.c b/src/crypto/bio/bio.c index fe40578b..7d97c3e7 100644 --- a/src/crypto/bio/bio.c +++ b/src/crypto/bio/bio.c @@ -482,6 +482,31 @@ static int bio_read_all(BIO *bio, uint8_t **out, size_t *out_len, } } +// bio_read_full reads |len| bytes |bio| and writes them into |out|. It +// tolerates partial reads from |bio| and returns one on success or zero if a +// read fails before |len| bytes are read. On failure, it additionally sets +// |*out_eof_on_first_read| to whether the error was due to |bio| returning zero +// on the first read. |out_eof_on_first_read| may be NULL to discard the value. +static int bio_read_full(BIO *bio, uint8_t *out, int *out_eof_on_first_read, + size_t len) { + int first_read = 1; + while (len > 0) { + int todo = len <= INT_MAX ? 
(int)len : INT_MAX; + int ret = BIO_read(bio, out, todo); + if (ret <= 0) { + if (out_eof_on_first_read != NULL) { + *out_eof_on_first_read = first_read && ret == 0; + } + return 0; + } + out += ret; + len -= (size_t)ret; + first_read = 0; + } + + return 1; +} + // For compatibility with existing |d2i_*_bio| callers, |BIO_read_asn1| uses // |ERR_LIB_ASN1| errors. OPENSSL_DECLARE_ERROR_REASON(ASN1, ASN1_R_DECODE_ERROR) @@ -493,17 +518,16 @@ int BIO_read_asn1(BIO *bio, uint8_t **out, size_t *out_len, size_t max_len) { uint8_t header[6]; static const size_t kInitialHeaderLen = 2; - int ret = BIO_read(bio, header, kInitialHeaderLen); - if (ret == 0) { - // Historically, OpenSSL returned |ASN1_R_HEADER_TOO_LONG| when |d2i_*_bio| - // could not read anything. CPython conditions on this to determine if |bio| - // was empty. - OPENSSL_PUT_ERROR(ASN1, ASN1_R_HEADER_TOO_LONG); - return 0; - } - - if (ret != (int) kInitialHeaderLen) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ENOUGH_DATA); + int eof_on_first_read; + if (!bio_read_full(bio, header, &eof_on_first_read, kInitialHeaderLen)) { + if (eof_on_first_read) { + // Historically, OpenSSL returned |ASN1_R_HEADER_TOO_LONG| when + // |d2i_*_bio| could not read anything. CPython conditions on this to + // determine if |bio| was empty. 
+ OPENSSL_PUT_ERROR(ASN1, ASN1_R_HEADER_TOO_LONG); + } else { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ENOUGH_DATA); + } return 0; } @@ -539,8 +563,7 @@ int BIO_read_asn1(BIO *bio, uint8_t **out, size_t *out_len, size_t max_len) { return 0; } - if (BIO_read(bio, header + kInitialHeaderLen, num_bytes) != - (int)num_bytes) { + if (!bio_read_full(bio, header + kInitialHeaderLen, NULL, num_bytes)) { OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ENOUGH_DATA); return 0; } @@ -582,8 +605,7 @@ int BIO_read_asn1(BIO *bio, uint8_t **out, size_t *out_len, size_t max_len) { return 0; } OPENSSL_memcpy(*out, header, header_len); - if (BIO_read(bio, (*out) + header_len, len - header_len) != - (int) (len - header_len)) { + if (!bio_read_full(bio, (*out) + header_len, NULL, len - header_len)) { OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ENOUGH_DATA); OPENSSL_free(*out); return 0; diff --git a/src/crypto/cipher_extra/e_aesgcmsiv.c b/src/crypto/cipher_extra/e_aesgcmsiv.c index 1deb9181..71a71fac 100644 --- a/src/crypto/cipher_extra/e_aesgcmsiv.c +++ b/src/crypto/cipher_extra/e_aesgcmsiv.c @@ -27,7 +27,11 @@ #define EVP_AEAD_AES_GCM_SIV_NONCE_LEN 12 #define EVP_AEAD_AES_GCM_SIV_TAG_LEN 16 -#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) +// TODO(davidben): AES-GCM-SIV assembly is not correct for Windows. It must save +// and restore xmm6 through xmm15. +#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) && \ + !defined(OPENSSL_WINDOWS) +#define AES_GCM_SIV_ASM // Optimised AES-GCM-SIV @@ -60,10 +64,10 @@ static struct aead_aes_gcm_siv_asm_ctx *asm_ctx_from_ctx( extern void aes128gcmsiv_aes_ks( const uint8_t key[16], uint8_t out_expanded_key[16*15]); -// aes128gcmsiv_aes_ks writes an AES-128 key schedule for |key| to +// aes256gcmsiv_aes_ks writes an AES-256 key schedule for |key| to // |out_expanded_key|. 
extern void aes256gcmsiv_aes_ks( - const uint8_t key[16], uint8_t out_expanded_key[16*15]); + const uint8_t key[32], uint8_t out_expanded_key[16*15]); static int aead_aes_gcm_siv_asm_init(EVP_AEAD_CTX *ctx, const uint8_t *key, size_t key_len, size_t tag_len) { @@ -549,7 +553,7 @@ static const EVP_AEAD aead_aes_256_gcm_siv_asm = { NULL /* tag_len */, }; -#endif // X86_64 && !NO_ASM +#endif // X86_64 && !NO_ASM && !WINDOWS struct aead_aes_gcm_siv_ctx { union { @@ -838,7 +842,7 @@ static const EVP_AEAD aead_aes_256_gcm_siv = { NULL /* tag_len */, }; -#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) +#if defined(AES_GCM_SIV_ASM) static char avx_aesni_capable(void) { const uint32_t ecx = OPENSSL_ia32cap_P[1]; @@ -871,4 +875,4 @@ const EVP_AEAD *EVP_aead_aes_256_gcm_siv(void) { return &aead_aes_256_gcm_siv; } -#endif // X86_64 && !NO_ASM +#endif // AES_GCM_SIV_ASM diff --git a/src/crypto/cpu-intel.c b/src/crypto/cpu-intel.c index 20cfbe8c..98d8d4e5 100644 --- a/src/crypto/cpu-intel.c +++ b/src/crypto/cpu-intel.c @@ -148,23 +148,6 @@ void OPENSSL_cpuid_setup(void) { int is_intel = ebx == 0x756e6547 /* Genu */ && edx == 0x49656e69 /* ineI */ && ecx == 0x6c65746e /* ntel */; - int is_amd = ebx == 0x68747541 /* Auth */ && - edx == 0x69746e65 /* enti */ && - ecx == 0x444d4163 /* cAMD */; - - int has_amd_xop = 0; - if (is_amd) { - // AMD-specific logic. - // See http://developer.amd.com/wordpress/media/2012/10/254811.pdf - OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0x80000000); - uint32_t num_extended_ids = eax; - if (num_extended_ids >= 0x80000001) { - OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0x80000001); - if (ecx & (1u << 11)) { - has_amd_xop = 1; - } - } - } uint32_t extended_features[2] = {0}; if (num_ids >= 7) { @@ -173,29 +156,11 @@ void OPENSSL_cpuid_setup(void) { extended_features[1] = ecx; } - // Determine the number of cores sharing an L1 data cache to adjust the - // hyper-threading bit. 
- uint32_t cores_per_cache = 0; - if (is_amd) { - // AMD CPUs never share an L1 data cache between threads but do set the HTT - // bit on multi-core CPUs. - cores_per_cache = 1; - } else if (num_ids >= 4) { - // TODO(davidben): The Intel manual says this CPUID leaf enumerates all - // caches using ECX and doesn't say which is first. Does this matter? - OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 4); - cores_per_cache = 1 + ((eax >> 14) & 0xfff); - } - OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 1); - // Adjust the hyper-threading bit. - if (edx & (1u << 28)) { - uint32_t num_logical_cores = (ebx >> 16) & 0xff; - if (cores_per_cache == 1 || num_logical_cores <= 1) { - edx &= ~(1u << 28); - } - } + // Force the hyper-threading bit so that the more conservative path is always + // chosen. + edx |= 1u << 28; // Reserved bit #20 was historically repurposed to control the in-memory // representation of RC4 state. Always set it to zero. @@ -216,12 +181,9 @@ void OPENSSL_cpuid_setup(void) { edx &= ~(1u << 30); } - // The SDBG bit is repurposed to denote AMD XOP support. - if (has_amd_xop) { - ecx |= (1u << 11); - } else { - ecx &= ~(1u << 11); - } + // The SDBG bit is repurposed to denote AMD XOP support. Don't ever use AMD + // XOP code paths. + ecx &= ~(1u << 11); uint64_t xcr0 = 0; if (ecx & (1u << 27)) { diff --git a/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl b/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl index 11b7a9d6..c0ade374 100644 --- a/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl +++ b/src/crypto/fipsmodule/aes/asm/bsaes-x86_64.pl @@ -114,6 +114,7 @@ open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... +my $xts=0; # Also patch out the XTS subroutines. 
{ my ($key,$rounds,$const)=("%rax","%r10d","%r11"); @@ -2163,6 +2164,8 @@ ___ # const AES_KEY *key1, const AES_KEY *key2, # const unsigned char iv[16]); # +# We patch out the XTS implementation in BoringSSL. +if ($xts) { my ($twmask,$twres,$twtmp)=@XMM[13..15]; $arg6=~s/d$//; @@ -2991,6 +2994,7 @@ $code.=<<___; .size bsaes_xts_decrypt,.-bsaes_xts_decrypt ___ } +} # $xts $code.=<<___; .type _bsaes_const,\@object .align 64 @@ -3172,7 +3176,8 @@ $code.=<<___; .rva .Lctr_enc_prologue .rva .Lctr_enc_epilogue .rva .Lctr_enc_info - +___ +$code.=<<___ if ($xts); .rva .Lxts_enc_prologue .rva .Lxts_enc_epilogue .rva .Lxts_enc_info @@ -3180,6 +3185,8 @@ $code.=<<___; .rva .Lxts_dec_prologue .rva .Lxts_dec_epilogue .rva .Lxts_dec_info +___ +$code.=<<___; .section .xdata .align 8 @@ -3211,6 +3218,8 @@ $code.=<<___; .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] .rva .Lctr_enc_tail .long 0 +___ +$code.=<<___ if ($xts); .Lxts_enc_info: .byte 9,0,0,0 .rva se_handler diff --git a/src/crypto/fipsmodule/aes/internal.h b/src/crypto/fipsmodule/aes/internal.h index a9f8a8c7..5f9ee312 100644 --- a/src/crypto/fipsmodule/aes/internal.h +++ b/src/crypto/fipsmodule/aes/internal.h @@ -31,7 +31,7 @@ extern "C" { #define HWAES_ECB static int hwaes_capable(void) { - return (OPENSSL_ia32cap_P[1] & (1 << (57 - 32))) != 0; + return (OPENSSL_ia32cap_get()[1] & (1 << (57 - 32))) != 0; } #elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) #define HWAES diff --git a/src/crypto/fipsmodule/cipher/e_aes.c b/src/crypto/fipsmodule/cipher/e_aes.c index 068465b3..2ccec442 100644 --- a/src/crypto/fipsmodule/cipher/e_aes.c +++ b/src/crypto/fipsmodule/cipher/e_aes.c @@ -102,7 +102,7 @@ typedef struct { (defined(OPENSSL_X86_64) || defined(OPENSSL_X86)) #define VPAES static char vpaes_capable(void) { - return (OPENSSL_ia32cap_P[1] & (1 << (41 - 32))) != 0; + return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0; } #if defined(OPENSSL_X86_64) diff --git a/src/crypto/fipsmodule/ec/p256-x86_64.c 
b/src/crypto/fipsmodule/ec/p256-x86_64.c index b3422149..ef1ccef7 100644 --- a/src/crypto/fipsmodule/ec/p256-x86_64.c +++ b/src/crypto/fipsmodule/ec/p256-x86_64.c @@ -581,7 +581,7 @@ static void ecp_nistz256_inv_mod_ord(const EC_GROUP *group, EC_SCALAR *out, static int ecp_nistz256_mont_inv_mod_ord_vartime(const EC_GROUP *group, EC_SCALAR *out, const EC_SCALAR *in) { - if ((OPENSSL_ia32cap_P[1] & (1 << 28)) == 0) { + if ((OPENSSL_ia32cap_get()[1] & (1 << 28)) == 0) { // No AVX support; fallback to generic code. return ec_GFp_simple_mont_inv_mod_ord_vartime(group, out, in); } diff --git a/src/crypto/fipsmodule/sha/asm/sha512-x86_64.pl b/src/crypto/fipsmodule/sha/asm/sha512-x86_64.pl index 9d53ec47..49278506 100755 --- a/src/crypto/fipsmodule/sha/asm/sha512-x86_64.pl +++ b/src/crypto/fipsmodule/sha/asm/sha512-x86_64.pl @@ -108,6 +108,8 @@ # part, body_00_15; reducing the amount of SIMD instructions # below certain limit makes no difference/sense; to conserve # space SHA256 XOP code path is therefore omitted; +# +# Modified from upstream OpenSSL to remove the XOP code. $flavour = shift; $output = shift; @@ -275,9 +277,7 @@ $code.=<<___ if ($SZ==4 && $shaext); test \$`1<<29`,%r11d # check for SHA jnz _shaext_shortcut ___ -$code.=<<___ if ($avx && $SZ==8); - test \$`1<<11`,%r10d # check for XOP - jnz .Lxop_shortcut + # XOP codepath removed. 
___ $code.=<<___ if ($avx>1); and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 @@ -1127,399 +1127,6 @@ ___ if ($avx) {{ ###################################################################### -# XOP code path -# -if ($SZ==8) { # SHA512 only -$code.=<<___; -.type ${func}_xop,\@function,3 -.align 64 -${func}_xop: -.cfi_startproc -.Lxop_shortcut: - mov %rsp,%rax # copy %rsp -.cfi_def_cfa_register %rax - push %rbx -.cfi_push %rbx - push %rbp -.cfi_push %rbp - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - shl \$4,%rdx # num*16 - sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp - lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ - and \$-64,%rsp # align stack frame - mov $ctx,$_ctx # save ctx, 1st arg - mov $inp,$_inp # save inp, 2nd arh - mov %rdx,$_end # save end pointer, "3rd" arg - mov %rax,$_rsp # save copy of %rsp -.cfi_cfa_expression $_rsp,deref,+8 -___ -$code.=<<___ if ($win64); - movaps %xmm6,16*$SZ+32(%rsp) - movaps %xmm7,16*$SZ+48(%rsp) - movaps %xmm8,16*$SZ+64(%rsp) - movaps %xmm9,16*$SZ+80(%rsp) -___ -$code.=<<___ if ($win64 && $SZ>4); - movaps %xmm10,16*$SZ+96(%rsp) - movaps %xmm11,16*$SZ+112(%rsp) -___ -$code.=<<___; -.Lprologue_xop: - - vzeroupper - mov $SZ*0($ctx),$A - mov $SZ*1($ctx),$B - mov $SZ*2($ctx),$C - mov $SZ*3($ctx),$D - mov $SZ*4($ctx),$E - mov $SZ*5($ctx),$F - mov $SZ*6($ctx),$G - mov $SZ*7($ctx),$H - jmp .Lloop_xop -___ - if ($SZ==4) { # SHA256 - my @X = map("%xmm$_",(0..3)); - my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); - -$code.=<<___; -.align 16 -.Lloop_xop: - vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 - vmovdqu 0x00($inp),@X[0] - vmovdqu 0x10($inp),@X[1] - vmovdqu 0x20($inp),@X[2] - vmovdqu 0x30($inp),@X[3] - vpshufb $t3,@X[0],@X[0] - lea $TABLE(%rip),$Tbl - vpshufb $t3,@X[1],@X[1] - vpshufb $t3,@X[2],@X[2] - vpaddd 0x00($Tbl),@X[0],$t0 - vpshufb $t3,@X[3],@X[3] - vpaddd 0x20($Tbl),@X[1],$t1 - vpaddd 0x40($Tbl),@X[2],$t2 - vpaddd 0x60($Tbl),@X[3],$t3 - vmovdqa $t0,0x00(%rsp) - 
mov $A,$a1 - vmovdqa $t1,0x10(%rsp) - mov $B,$a3 - vmovdqa $t2,0x20(%rsp) - xor $C,$a3 # magic - vmovdqa $t3,0x30(%rsp) - mov $E,$a0 - jmp .Lxop_00_47 - -.align 16 -.Lxop_00_47: - sub \$`-16*2*$SZ`,$Tbl # size optimization -___ -sub XOP_256_00_47 () { -my $j = shift; -my $body = shift; -my @X = @_; -my @insns = (&$body,&$body,&$body,&$body); # 104 instructions - - &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] - eval(shift(@insns)); - eval(shift(@insns)); - &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrld ($t0,$t0,$sigma0[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t0,$t0,$t1); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrld ($t2,@X[3],$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrldq ($t3,$t3,8); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - 
eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrld ($t2,@X[0],$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpslldq ($t3,$t3,8); # 22 instructions - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); - foreach (@insns) { eval; } # remaining instructions - &vmovdqa (16*$j."(%rsp)",$t2); -} - - for ($i=0,$j=0; $j<4; $j++) { - &XOP_256_00_47($j,\&body_00_15,@X); - push(@X,shift(@X)); # rotate(@X) - } - &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); - &jne (".Lxop_00_47"); - - for ($i=0; $i<16; ) { - foreach(body_00_15()) { eval; } - } - - } else { # SHA512 - my @X = map("%xmm$_",(0..7)); - my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); - -$code.=<<___; -.align 16 -.Lloop_xop: - vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 - vmovdqu 0x00($inp),@X[0] - lea $TABLE+0x80(%rip),$Tbl # size optimization - vmovdqu 0x10($inp),@X[1] - vmovdqu 0x20($inp),@X[2] - vpshufb $t3,@X[0],@X[0] - vmovdqu 0x30($inp),@X[3] - vpshufb $t3,@X[1],@X[1] - vmovdqu 0x40($inp),@X[4] - vpshufb $t3,@X[2],@X[2] - vmovdqu 0x50($inp),@X[5] - vpshufb $t3,@X[3],@X[3] - vmovdqu 0x60($inp),@X[6] - vpshufb $t3,@X[4],@X[4] - vmovdqu 0x70($inp),@X[7] - vpshufb $t3,@X[5],@X[5] - vpaddq -0x80($Tbl),@X[0],$t0 - vpshufb $t3,@X[6],@X[6] - vpaddq -0x60($Tbl),@X[1],$t1 - vpshufb $t3,@X[7],@X[7] - vpaddq -0x40($Tbl),@X[2],$t2 - vpaddq -0x20($Tbl),@X[3],$t3 - vmovdqa 
$t0,0x00(%rsp) - vpaddq 0x00($Tbl),@X[4],$t0 - vmovdqa $t1,0x10(%rsp) - vpaddq 0x20($Tbl),@X[5],$t1 - vmovdqa $t2,0x20(%rsp) - vpaddq 0x40($Tbl),@X[6],$t2 - vmovdqa $t3,0x30(%rsp) - vpaddq 0x60($Tbl),@X[7],$t3 - vmovdqa $t0,0x40(%rsp) - mov $A,$a1 - vmovdqa $t1,0x50(%rsp) - mov $B,$a3 - vmovdqa $t2,0x60(%rsp) - xor $C,$a3 # magic - vmovdqa $t3,0x70(%rsp) - mov $E,$a0 - jmp .Lxop_00_47 - -.align 16 -.Lxop_00_47: - add \$`16*2*$SZ`,$Tbl -___ -sub XOP_512_00_47 () { -my $j = shift; -my $body = shift; -my @X = @_; -my @insns = (&$body,&$body); # 52 instructions - - &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2] - eval(shift(@insns)); - eval(shift(@insns)); - &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10] - eval(shift(@insns)); - eval(shift(@insns)); - &vprotq ($t1,$t0,8*$SZ-$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrlq ($t0,$t0,$sigma0[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t0,$t0,$t1); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t0,$t0,$t2); # sigma0(X[1..2]) - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrlq ($t2,@X[7],$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2]) - eval(shift(@insns)); - eval(shift(@insns)); - &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddq (@X[0],@X[0],$t3); # X[0..1] += 
sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); - foreach (@insns) { eval; } # remaining instructions - &vmovdqa (16*$j."(%rsp)",$t2); -} - - for ($i=0,$j=0; $j<8; $j++) { - &XOP_512_00_47($j,\&body_00_15,@X); - push(@X,shift(@X)); # rotate(@X) - } - &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); - &jne (".Lxop_00_47"); - - for ($i=0; $i<16; ) { - foreach(body_00_15()) { eval; } - } -} -$code.=<<___; - mov $_ctx,$ctx - mov $a1,$A - - add $SZ*0($ctx),$A - lea 16*$SZ($inp),$inp - add $SZ*1($ctx),$B - add $SZ*2($ctx),$C - add $SZ*3($ctx),$D - add $SZ*4($ctx),$E - add $SZ*5($ctx),$F - add $SZ*6($ctx),$G - add $SZ*7($ctx),$H - - cmp $_end,$inp - - mov $A,$SZ*0($ctx) - mov $B,$SZ*1($ctx) - mov $C,$SZ*2($ctx) - mov $D,$SZ*3($ctx) - mov $E,$SZ*4($ctx) - mov $F,$SZ*5($ctx) - mov $G,$SZ*6($ctx) - mov $H,$SZ*7($ctx) - jb .Lloop_xop - - mov $_rsp,%rsi -.cfi_def_cfa %rsi,8 - vzeroupper -___ -$code.=<<___ if ($win64); - movaps 16*$SZ+32(%rsp),%xmm6 - movaps 16*$SZ+48(%rsp),%xmm7 - movaps 16*$SZ+64(%rsp),%xmm8 - movaps 16*$SZ+80(%rsp),%xmm9 -___ -$code.=<<___ if ($win64 && $SZ>4); - movaps 16*$SZ+96(%rsp),%xmm10 - movaps 16*$SZ+112(%rsp),%xmm11 -___ -$code.=<<___; - mov -48(%rsi),%r15 -.cfi_restore %r15 - mov -40(%rsi),%r14 -.cfi_restore %r14 - mov -32(%rsi),%r13 -.cfi_restore %r13 - mov -24(%rsi),%r12 -.cfi_restore %r12 - mov -16(%rsi),%rbp -.cfi_restore %rbp - mov -8(%rsi),%rbx -.cfi_restore %rbx - lea (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_xop: - ret -.cfi_endproc -.size ${func}_xop,.-${func}_xop -___ -} -###################################################################### # AVX+shrd code path # local *ror = sub { &shrd(@_[0],@_) }; @@ -2409,11 +2016,6 @@ $code.=<<___ if ($SZ==4); .rva .LSEH_end_${func}_ssse3 .rva .LSEH_info_${func}_ssse3 ___ -$code.=<<___ if ($avx && $SZ==8); - .rva .LSEH_begin_${func}_xop - .rva .LSEH_end_${func}_xop - .rva 
.LSEH_info_${func}_xop -___ $code.=<<___ if ($avx); .rva .LSEH_begin_${func}_avx .rva .LSEH_end_${func}_avx @@ -2443,12 +2045,6 @@ $code.=<<___ if ($SZ==4); .rva se_handler .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] ___ -$code.=<<___ if ($avx && $SZ==8); -.LSEH_info_${func}_xop: - .byte 9,0,0,0 - .rva se_handler - .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] -___ $code.=<<___ if ($avx); .LSEH_info_${func}_avx: .byte 9,0,0,0 diff --git a/src/crypto/hrss/asm/poly_rq_mul.S b/src/crypto/hrss/asm/poly_rq_mul.S new file mode 100644 index 00000000..0ad0fb51 --- /dev/null +++ b/src/crypto/hrss/asm/poly_rq_mul.S @@ -0,0 +1,8457 @@ +// Copyright (c) 2017, the HRSS authors. +// +// Permission to use, copy, modify, and/or distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && defined(__linux__) + +// This is the polynomial multiplication function from [HRSS], provided by kind +// permission of the authors. 
+// +// HRSS: https://eprint.iacr.org/2017/1005 + +# This file was generated by poly_rq_mul.py +.text +.align 32 +mask_low9words: +.word 0xffff +.word 0xffff +.word 0xffff +.word 0xffff +.word 0xffff +.word 0xffff +.word 0xffff +.word 0xffff +.word 0xffff +.word 0x0 +.word 0x0 +.word 0x0 +.word 0x0 +.word 0x0 +.word 0x0 +.word 0x0 +const3: +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +.word 3 +const9: +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +.word 9 +const0: +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +const729: +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +.word 729 +const3_inv: +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +.word 43691 +const5_inv: +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +.word 52429 +shuf48_16: +.byte 10 +.byte 11 +.byte 12 +.byte 13 +.byte 14 +.byte 15 +.byte 0 +.byte 1 +.byte 2 +.byte 3 +.byte 4 +.byte 5 +.byte 6 +.byte 7 +.byte 8 +.byte 9 +.byte 10 +.byte 11 +.byte 12 +.byte 13 +.byte 14 +.byte 15 +.byte 0 +.byte 1 +.byte 2 +.byte 3 +.byte 4 +.byte 5 +.byte 6 +.byte 7 +.byte 8 +.byte 9 +shufmin1_mask3: +.byte 2 +.byte 3 +.byte 4 +.byte 5 +.byte 6 +.byte 7 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 
255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +.byte 255 +mask32_to_16: +.word 0xffff +.word 0x0 +.word 0xffff +.word 0x0 +.word 0xffff +.word 0x0 +.word 0xffff +.word 0x0 +.word 0xffff +.word 0x0 +.word 0xffff +.word 0x0 +.word 0xffff +.word 0x0 +.word 0xffff +.word 0x0 +mask5_3_5_3: +.word 0 +.word 0 +.word 0 +.word 65535 +.word 65535 +.word 65535 +.word 65535 +.word 65535 +.word 0 +.word 0 +.word 0 +.word 65535 +.word 65535 +.word 65535 +.word 65535 +.word 65535 +mask3_5_3_5: +.word 65535 +.word 65535 +.word 65535 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 65535 +.word 65535 +.word 65535 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +mask3_5_4_3_1: +.word 65535 +.word 65535 +.word 65535 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 65535 +.word 65535 +.word 65535 +.word 0 +mask_keephigh: +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 0 +.word 65535 +.word 65535 +.word 65535 +.word 65535 +.word 65535 +.word 65535 +.word 65535 +.word 65535 +mask_mod8192: +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.word 8191 +.text +.global poly_Rq_mul +.hidden poly_Rq_mul +.att_syntax prefix +poly_Rq_mul: +.cfi_startproc +push %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset rbp, -16 +movq %rsp, %rbp +.cfi_def_cfa_register rbp +push %r12 +.cfi_offset r12, -24 +mov %rsp, %r8 +andq $-32, %rsp +subq $6144, %rsp +mov %rsp, %rax +subq $6144, %rsp +mov %rsp, %r11 +subq $12288, %rsp +mov %rsp, %r12 +subq $512, %rsp +vmovdqa const3(%rip), %ymm3 +vmovdqu 0(%rsi), %ymm0 +vmovdqu 88(%rsi), %ymm1 +vmovdqu 176(%rsi), %ymm2 +vmovdqu 264(%rsi), %ymm12 +vmovdqu 1056(%rsi), %ymm4 +vmovdqu 1144(%rsi), %ymm5 +vmovdqu 1232(%rsi), %ymm6 +vmovdqu 1320(%rsi), %ymm7 +vmovdqu 352(%rsi), %ymm8 +vmovdqu 440(%rsi), %ymm9 +vmovdqu 528(%rsi), %ymm10 +vmovdqu 616(%rsi), 
%ymm11 +vmovdqa %ymm0, 0(%rax) +vmovdqa %ymm1, 96(%rax) +vpaddw %ymm0, %ymm1, %ymm14 +vmovdqa %ymm14, 192(%rax) +vmovdqa %ymm2, 288(%rax) +vmovdqa %ymm12, 384(%rax) +vpaddw %ymm2, %ymm12, %ymm14 +vmovdqa %ymm14, 480(%rax) +vpaddw %ymm0, %ymm2, %ymm14 +vmovdqa %ymm14, 576(%rax) +vpaddw %ymm1, %ymm12, %ymm15 +vmovdqa %ymm15, 672(%rax) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 768(%rax) +vmovdqa %ymm4, 5184(%rax) +vmovdqa %ymm5, 5280(%rax) +vpaddw %ymm4, %ymm5, %ymm14 +vmovdqa %ymm14, 5376(%rax) +vmovdqa %ymm6, 5472(%rax) +vmovdqa %ymm7, 5568(%rax) +vpaddw %ymm6, %ymm7, %ymm14 +vmovdqa %ymm14, 5664(%rax) +vpaddw %ymm4, %ymm6, %ymm14 +vmovdqa %ymm14, 5760(%rax) +vpaddw %ymm5, %ymm7, %ymm15 +vmovdqa %ymm15, 5856(%rax) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 5952(%rax) +vmovdqa %ymm0, 0(%rsp) +vmovdqa %ymm1, 32(%rsp) +vmovdqa %ymm2, 64(%rsp) +vmovdqa %ymm12, 96(%rsp) +vmovdqa %ymm8, 128(%rsp) +vmovdqa %ymm9, 160(%rsp) +vmovdqa %ymm10, 192(%rsp) +vmovdqa %ymm11, 224(%rsp) +vmovdqu 704(%rsi), %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm1 +vpaddw 128(%rsp), %ymm4, %ymm2 +vpaddw %ymm2, %ymm1, %ymm8 +vpsubw %ymm2, %ymm1, %ymm12 +vmovdqa %ymm0, 256(%rsp) +vmovdqu 792(%rsi), %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm1 +vpaddw 160(%rsp), %ymm5, %ymm2 +vpaddw %ymm2, %ymm1, %ymm9 +vpsubw %ymm2, %ymm1, %ymm13 +vmovdqa %ymm0, 288(%rsp) +vmovdqu 880(%rsi), %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm1 +vpaddw 192(%rsp), %ymm6, %ymm2 +vpaddw %ymm2, %ymm1, %ymm10 +vpsubw %ymm2, %ymm1, %ymm14 +vmovdqa %ymm0, 320(%rsp) +vmovdqu 968(%rsi), %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm1 +vpaddw 224(%rsp), %ymm7, %ymm2 +vpaddw %ymm2, %ymm1, %ymm11 +vpsubw %ymm2, %ymm1, %ymm15 +vmovdqa %ymm0, 352(%rsp) +vmovdqa %ymm8, 864(%rax) +vmovdqa %ymm9, 960(%rax) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 1056(%rax) +vmovdqa %ymm10, 1152(%rax) +vmovdqa %ymm11, 1248(%rax) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 1344(%rax) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 1440(%rax) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa 
%ymm1, 1536(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 1632(%rax) +vmovdqa %ymm12, 1728(%rax) +vmovdqa %ymm13, 1824(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 1920(%rax) +vmovdqa %ymm14, 2016(%rax) +vmovdqa %ymm15, 2112(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 2208(%rax) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 2304(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 2400(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 2496(%rax) +vmovdqa 256(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm4, %ymm1 +vpaddw 128(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm8 +vpsubw %ymm1, %ymm0, %ymm12 +vmovdqa 288(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm5, %ymm1 +vpaddw 160(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm9 +vpsubw %ymm1, %ymm0, %ymm13 +vmovdqa 320(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm6, %ymm1 +vpaddw 192(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm10 +vpsubw %ymm1, %ymm0, %ymm14 +vmovdqa 352(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm7, %ymm1 +vpaddw 224(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm11 +vpsubw %ymm1, %ymm0, %ymm15 +vmovdqa %ymm8, 2592(%rax) +vmovdqa %ymm9, 2688(%rax) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 2784(%rax) +vmovdqa %ymm10, 2880(%rax) +vmovdqa %ymm11, 2976(%rax) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 3072(%rax) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 3168(%rax) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 3264(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 3360(%rax) +vmovdqa %ymm12, 3456(%rax) +vmovdqa %ymm13, 3552(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 3648(%rax) +vmovdqa %ymm14, 3744(%rax) +vmovdqa %ymm15, 3840(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 3936(%rax) +vpaddw %ymm12, %ymm14, %ymm0 
+vmovdqa %ymm0, 4032(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 4128(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 4224(%rax) +vpmullw %ymm3, %ymm4, %ymm0 +vpaddw 256(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 128(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm12 +vpmullw %ymm3, %ymm5, %ymm0 +vpaddw 288(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 160(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm13 +vpmullw %ymm3, %ymm6, %ymm0 +vpaddw 320(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 192(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm14 +vpmullw %ymm3, %ymm7, %ymm0 +vpaddw 352(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 224(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm15 +vmovdqa %ymm12, 4320(%rax) +vmovdqa %ymm13, 4416(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 4512(%rax) +vmovdqa %ymm14, 4608(%rax) +vmovdqa %ymm15, 4704(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 4800(%rax) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4896(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 4992(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 5088(%rax) +vmovdqu 32(%rsi), %ymm0 +vmovdqu 120(%rsi), %ymm1 +vmovdqu 208(%rsi), %ymm2 +vmovdqu 296(%rsi), %ymm12 +vmovdqu 1088(%rsi), %ymm4 +vmovdqu 1176(%rsi), %ymm5 +vmovdqu 1264(%rsi), %ymm6 +vmovdqu 1352(%rsi), %ymm7 +vmovdqu 384(%rsi), %ymm8 +vmovdqu 472(%rsi), %ymm9 +vmovdqu 560(%rsi), %ymm10 +vmovdqu 648(%rsi), %ymm11 +vmovdqa %ymm0, 32(%rax) +vmovdqa %ymm1, 128(%rax) +vpaddw %ymm0, %ymm1, %ymm14 +vmovdqa %ymm14, 224(%rax) +vmovdqa %ymm2, 320(%rax) +vmovdqa %ymm12, 416(%rax) +vpaddw %ymm2, %ymm12, %ymm14 +vmovdqa %ymm14, 512(%rax) +vpaddw %ymm0, %ymm2, %ymm14 +vmovdqa %ymm14, 608(%rax) +vpaddw %ymm1, %ymm12, %ymm15 +vmovdqa %ymm15, 704(%rax) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 800(%rax) +vmovdqa %ymm4, 5216(%rax) 
+vmovdqa %ymm5, 5312(%rax) +vpaddw %ymm4, %ymm5, %ymm14 +vmovdqa %ymm14, 5408(%rax) +vmovdqa %ymm6, 5504(%rax) +vmovdqa %ymm7, 5600(%rax) +vpaddw %ymm6, %ymm7, %ymm14 +vmovdqa %ymm14, 5696(%rax) +vpaddw %ymm4, %ymm6, %ymm14 +vmovdqa %ymm14, 5792(%rax) +vpaddw %ymm5, %ymm7, %ymm15 +vmovdqa %ymm15, 5888(%rax) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 5984(%rax) +vmovdqa %ymm0, 0(%rsp) +vmovdqa %ymm1, 32(%rsp) +vmovdqa %ymm2, 64(%rsp) +vmovdqa %ymm12, 96(%rsp) +vmovdqa %ymm8, 128(%rsp) +vmovdqa %ymm9, 160(%rsp) +vmovdqa %ymm10, 192(%rsp) +vmovdqa %ymm11, 224(%rsp) +vmovdqu 736(%rsi), %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm1 +vpaddw 128(%rsp), %ymm4, %ymm2 +vpaddw %ymm2, %ymm1, %ymm8 +vpsubw %ymm2, %ymm1, %ymm12 +vmovdqa %ymm0, 256(%rsp) +vmovdqu 824(%rsi), %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm1 +vpaddw 160(%rsp), %ymm5, %ymm2 +vpaddw %ymm2, %ymm1, %ymm9 +vpsubw %ymm2, %ymm1, %ymm13 +vmovdqa %ymm0, 288(%rsp) +vmovdqu 912(%rsi), %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm1 +vpaddw 192(%rsp), %ymm6, %ymm2 +vpaddw %ymm2, %ymm1, %ymm10 +vpsubw %ymm2, %ymm1, %ymm14 +vmovdqa %ymm0, 320(%rsp) +vmovdqu 1000(%rsi), %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm1 +vpaddw 224(%rsp), %ymm7, %ymm2 +vpaddw %ymm2, %ymm1, %ymm11 +vpsubw %ymm2, %ymm1, %ymm15 +vmovdqa %ymm0, 352(%rsp) +vmovdqa %ymm8, 896(%rax) +vmovdqa %ymm9, 992(%rax) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 1088(%rax) +vmovdqa %ymm10, 1184(%rax) +vmovdqa %ymm11, 1280(%rax) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 1376(%rax) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 1472(%rax) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 1568(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 1664(%rax) +vmovdqa %ymm12, 1760(%rax) +vmovdqa %ymm13, 1856(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 1952(%rax) +vmovdqa %ymm14, 2048(%rax) +vmovdqa %ymm15, 2144(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 2240(%rax) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 2336(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 2432(%rax) +vpaddw 
%ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 2528(%rax) +vmovdqa 256(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm4, %ymm1 +vpaddw 128(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm8 +vpsubw %ymm1, %ymm0, %ymm12 +vmovdqa 288(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm5, %ymm1 +vpaddw 160(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm9 +vpsubw %ymm1, %ymm0, %ymm13 +vmovdqa 320(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm6, %ymm1 +vpaddw 192(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm10 +vpsubw %ymm1, %ymm0, %ymm14 +vmovdqa 352(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm7, %ymm1 +vpaddw 224(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm11 +vpsubw %ymm1, %ymm0, %ymm15 +vmovdqa %ymm8, 2624(%rax) +vmovdqa %ymm9, 2720(%rax) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 2816(%rax) +vmovdqa %ymm10, 2912(%rax) +vmovdqa %ymm11, 3008(%rax) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 3104(%rax) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 3200(%rax) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 3296(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 3392(%rax) +vmovdqa %ymm12, 3488(%rax) +vmovdqa %ymm13, 3584(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 3680(%rax) +vmovdqa %ymm14, 3776(%rax) +vmovdqa %ymm15, 3872(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 3968(%rax) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4064(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 4160(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 4256(%rax) +vpmullw %ymm3, %ymm4, %ymm0 +vpaddw 256(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 128(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm12 +vpmullw %ymm3, %ymm5, %ymm0 +vpaddw 288(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 
160(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm13 +vpmullw %ymm3, %ymm6, %ymm0 +vpaddw 320(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 192(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm14 +vpmullw %ymm3, %ymm7, %ymm0 +vpaddw 352(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 224(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm15 +vmovdqa %ymm12, 4352(%rax) +vmovdqa %ymm13, 4448(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 4544(%rax) +vmovdqa %ymm14, 4640(%rax) +vmovdqa %ymm15, 4736(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 4832(%rax) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4928(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 5024(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 5120(%rax) +vmovdqu 64(%rsi), %ymm0 +vmovdqu 152(%rsi), %ymm1 +vmovdqu 240(%rsi), %ymm2 +vmovdqu 328(%rsi), %ymm12 +vmovdqu 1120(%rsi), %ymm4 +vmovdqu 1208(%rsi), %ymm5 +vmovdqu 1296(%rsi), %ymm6 +vmovdqu 1384(%rsi), %ymm7 +vpand mask_low9words(%rip), %ymm7, %ymm7 +vmovdqu 416(%rsi), %ymm8 +vmovdqu 504(%rsi), %ymm9 +vmovdqu 592(%rsi), %ymm10 +vmovdqu 680(%rsi), %ymm11 +vmovdqa %ymm0, 64(%rax) +vmovdqa %ymm1, 160(%rax) +vpaddw %ymm0, %ymm1, %ymm14 +vmovdqa %ymm14, 256(%rax) +vmovdqa %ymm2, 352(%rax) +vmovdqa %ymm12, 448(%rax) +vpaddw %ymm2, %ymm12, %ymm14 +vmovdqa %ymm14, 544(%rax) +vpaddw %ymm0, %ymm2, %ymm14 +vmovdqa %ymm14, 640(%rax) +vpaddw %ymm1, %ymm12, %ymm15 +vmovdqa %ymm15, 736(%rax) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 832(%rax) +vmovdqa %ymm4, 5248(%rax) +vmovdqa %ymm5, 5344(%rax) +vpaddw %ymm4, %ymm5, %ymm14 +vmovdqa %ymm14, 5440(%rax) +vmovdqa %ymm6, 5536(%rax) +vmovdqa %ymm7, 5632(%rax) +vpaddw %ymm6, %ymm7, %ymm14 +vmovdqa %ymm14, 5728(%rax) +vpaddw %ymm4, %ymm6, %ymm14 +vmovdqa %ymm14, 5824(%rax) +vpaddw %ymm5, %ymm7, %ymm15 +vmovdqa %ymm15, 5920(%rax) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 6016(%rax) +vmovdqa 
%ymm0, 0(%rsp) +vmovdqa %ymm1, 32(%rsp) +vmovdqa %ymm2, 64(%rsp) +vmovdqa %ymm12, 96(%rsp) +vmovdqa %ymm8, 128(%rsp) +vmovdqa %ymm9, 160(%rsp) +vmovdqa %ymm10, 192(%rsp) +vmovdqa %ymm11, 224(%rsp) +vmovdqu 768(%rsi), %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm1 +vpaddw 128(%rsp), %ymm4, %ymm2 +vpaddw %ymm2, %ymm1, %ymm8 +vpsubw %ymm2, %ymm1, %ymm12 +vmovdqa %ymm0, 256(%rsp) +vmovdqu 856(%rsi), %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm1 +vpaddw 160(%rsp), %ymm5, %ymm2 +vpaddw %ymm2, %ymm1, %ymm9 +vpsubw %ymm2, %ymm1, %ymm13 +vmovdqa %ymm0, 288(%rsp) +vmovdqu 944(%rsi), %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm1 +vpaddw 192(%rsp), %ymm6, %ymm2 +vpaddw %ymm2, %ymm1, %ymm10 +vpsubw %ymm2, %ymm1, %ymm14 +vmovdqa %ymm0, 320(%rsp) +vmovdqu 1032(%rsi), %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm1 +vpaddw 224(%rsp), %ymm7, %ymm2 +vpaddw %ymm2, %ymm1, %ymm11 +vpsubw %ymm2, %ymm1, %ymm15 +vmovdqa %ymm0, 352(%rsp) +vmovdqa %ymm8, 928(%rax) +vmovdqa %ymm9, 1024(%rax) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 1120(%rax) +vmovdqa %ymm10, 1216(%rax) +vmovdqa %ymm11, 1312(%rax) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 1408(%rax) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 1504(%rax) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 1600(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 1696(%rax) +vmovdqa %ymm12, 1792(%rax) +vmovdqa %ymm13, 1888(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 1984(%rax) +vmovdqa %ymm14, 2080(%rax) +vmovdqa %ymm15, 2176(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 2272(%rax) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 2368(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 2464(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 2560(%rax) +vmovdqa 256(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm4, %ymm1 +vpaddw 128(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm8 +vpsubw %ymm1, %ymm0, %ymm12 +vmovdqa 288(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm5, %ymm1 
+vpaddw 160(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm9 +vpsubw %ymm1, %ymm0, %ymm13 +vmovdqa 320(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm6, %ymm1 +vpaddw 192(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm10 +vpsubw %ymm1, %ymm0, %ymm14 +vmovdqa 352(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm7, %ymm1 +vpaddw 224(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm11 +vpsubw %ymm1, %ymm0, %ymm15 +vmovdqa %ymm8, 2656(%rax) +vmovdqa %ymm9, 2752(%rax) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 2848(%rax) +vmovdqa %ymm10, 2944(%rax) +vmovdqa %ymm11, 3040(%rax) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 3136(%rax) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 3232(%rax) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 3328(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 3424(%rax) +vmovdqa %ymm12, 3520(%rax) +vmovdqa %ymm13, 3616(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 3712(%rax) +vmovdqa %ymm14, 3808(%rax) +vmovdqa %ymm15, 3904(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 4000(%rax) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4096(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 4192(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 4288(%rax) +vpmullw %ymm3, %ymm4, %ymm0 +vpaddw 256(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 128(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm12 +vpmullw %ymm3, %ymm5, %ymm0 +vpaddw 288(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 160(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm13 +vpmullw %ymm3, %ymm6, %ymm0 +vpaddw 320(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 192(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm14 +vpmullw %ymm3, %ymm7, %ymm0 +vpaddw 352(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 224(%rsp), 
%ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm15 +vmovdqa %ymm12, 4384(%rax) +vmovdqa %ymm13, 4480(%rax) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 4576(%rax) +vmovdqa %ymm14, 4672(%rax) +vmovdqa %ymm15, 4768(%rax) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 4864(%rax) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4960(%rax) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 5056(%rax) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 5152(%rax) +vmovdqu 0(%rdx), %ymm0 +vmovdqu 88(%rdx), %ymm1 +vmovdqu 176(%rdx), %ymm2 +vmovdqu 264(%rdx), %ymm12 +vmovdqu 1056(%rdx), %ymm4 +vmovdqu 1144(%rdx), %ymm5 +vmovdqu 1232(%rdx), %ymm6 +vmovdqu 1320(%rdx), %ymm7 +vmovdqu 352(%rdx), %ymm8 +vmovdqu 440(%rdx), %ymm9 +vmovdqu 528(%rdx), %ymm10 +vmovdqu 616(%rdx), %ymm11 +vmovdqa %ymm0, 0(%r11) +vmovdqa %ymm1, 96(%r11) +vpaddw %ymm0, %ymm1, %ymm14 +vmovdqa %ymm14, 192(%r11) +vmovdqa %ymm2, 288(%r11) +vmovdqa %ymm12, 384(%r11) +vpaddw %ymm2, %ymm12, %ymm14 +vmovdqa %ymm14, 480(%r11) +vpaddw %ymm0, %ymm2, %ymm14 +vmovdqa %ymm14, 576(%r11) +vpaddw %ymm1, %ymm12, %ymm15 +vmovdqa %ymm15, 672(%r11) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 768(%r11) +vmovdqa %ymm4, 5184(%r11) +vmovdqa %ymm5, 5280(%r11) +vpaddw %ymm4, %ymm5, %ymm14 +vmovdqa %ymm14, 5376(%r11) +vmovdqa %ymm6, 5472(%r11) +vmovdqa %ymm7, 5568(%r11) +vpaddw %ymm6, %ymm7, %ymm14 +vmovdqa %ymm14, 5664(%r11) +vpaddw %ymm4, %ymm6, %ymm14 +vmovdqa %ymm14, 5760(%r11) +vpaddw %ymm5, %ymm7, %ymm15 +vmovdqa %ymm15, 5856(%r11) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 5952(%r11) +vmovdqa %ymm0, 0(%rsp) +vmovdqa %ymm1, 32(%rsp) +vmovdqa %ymm2, 64(%rsp) +vmovdqa %ymm12, 96(%rsp) +vmovdqa %ymm8, 128(%rsp) +vmovdqa %ymm9, 160(%rsp) +vmovdqa %ymm10, 192(%rsp) +vmovdqa %ymm11, 224(%rsp) +vmovdqu 704(%rdx), %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm1 +vpaddw 128(%rsp), %ymm4, %ymm2 +vpaddw %ymm2, %ymm1, %ymm8 +vpsubw %ymm2, %ymm1, %ymm12 +vmovdqa %ymm0, 256(%rsp) +vmovdqu 792(%rdx), %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm1 
+vpaddw 160(%rsp), %ymm5, %ymm2 +vpaddw %ymm2, %ymm1, %ymm9 +vpsubw %ymm2, %ymm1, %ymm13 +vmovdqa %ymm0, 288(%rsp) +vmovdqu 880(%rdx), %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm1 +vpaddw 192(%rsp), %ymm6, %ymm2 +vpaddw %ymm2, %ymm1, %ymm10 +vpsubw %ymm2, %ymm1, %ymm14 +vmovdqa %ymm0, 320(%rsp) +vmovdqu 968(%rdx), %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm1 +vpaddw 224(%rsp), %ymm7, %ymm2 +vpaddw %ymm2, %ymm1, %ymm11 +vpsubw %ymm2, %ymm1, %ymm15 +vmovdqa %ymm0, 352(%rsp) +vmovdqa %ymm8, 864(%r11) +vmovdqa %ymm9, 960(%r11) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 1056(%r11) +vmovdqa %ymm10, 1152(%r11) +vmovdqa %ymm11, 1248(%r11) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 1344(%r11) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 1440(%r11) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 1536(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 1632(%r11) +vmovdqa %ymm12, 1728(%r11) +vmovdqa %ymm13, 1824(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 1920(%r11) +vmovdqa %ymm14, 2016(%r11) +vmovdqa %ymm15, 2112(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 2208(%r11) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 2304(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 2400(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 2496(%r11) +vmovdqa 256(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm4, %ymm1 +vpaddw 128(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm8 +vpsubw %ymm1, %ymm0, %ymm12 +vmovdqa 288(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm5, %ymm1 +vpaddw 160(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm9 +vpsubw %ymm1, %ymm0, %ymm13 +vmovdqa 320(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm6, %ymm1 +vpaddw 192(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm10 +vpsubw %ymm1, %ymm0, %ymm14 +vmovdqa 352(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm0 +vpsllw $2, 
%ymm7, %ymm1 +vpaddw 224(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm11 +vpsubw %ymm1, %ymm0, %ymm15 +vmovdqa %ymm8, 2592(%r11) +vmovdqa %ymm9, 2688(%r11) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 2784(%r11) +vmovdqa %ymm10, 2880(%r11) +vmovdqa %ymm11, 2976(%r11) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 3072(%r11) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 3168(%r11) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 3264(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 3360(%r11) +vmovdqa %ymm12, 3456(%r11) +vmovdqa %ymm13, 3552(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 3648(%r11) +vmovdqa %ymm14, 3744(%r11) +vmovdqa %ymm15, 3840(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 3936(%r11) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4032(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 4128(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 4224(%r11) +vpmullw %ymm3, %ymm4, %ymm0 +vpaddw 256(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 128(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm12 +vpmullw %ymm3, %ymm5, %ymm0 +vpaddw 288(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 160(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm13 +vpmullw %ymm3, %ymm6, %ymm0 +vpaddw 320(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 192(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm14 +vpmullw %ymm3, %ymm7, %ymm0 +vpaddw 352(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 224(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm15 +vmovdqa %ymm12, 4320(%r11) +vmovdqa %ymm13, 4416(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 4512(%r11) +vmovdqa %ymm14, 4608(%r11) +vmovdqa %ymm15, 4704(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 4800(%r11) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4896(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 4992(%r11) +vpaddw %ymm0, 
%ymm1, %ymm0 +vmovdqa %ymm0, 5088(%r11) +vmovdqu 32(%rdx), %ymm0 +vmovdqu 120(%rdx), %ymm1 +vmovdqu 208(%rdx), %ymm2 +vmovdqu 296(%rdx), %ymm12 +vmovdqu 1088(%rdx), %ymm4 +vmovdqu 1176(%rdx), %ymm5 +vmovdqu 1264(%rdx), %ymm6 +vmovdqu 1352(%rdx), %ymm7 +vmovdqu 384(%rdx), %ymm8 +vmovdqu 472(%rdx), %ymm9 +vmovdqu 560(%rdx), %ymm10 +vmovdqu 648(%rdx), %ymm11 +vmovdqa %ymm0, 32(%r11) +vmovdqa %ymm1, 128(%r11) +vpaddw %ymm0, %ymm1, %ymm14 +vmovdqa %ymm14, 224(%r11) +vmovdqa %ymm2, 320(%r11) +vmovdqa %ymm12, 416(%r11) +vpaddw %ymm2, %ymm12, %ymm14 +vmovdqa %ymm14, 512(%r11) +vpaddw %ymm0, %ymm2, %ymm14 +vmovdqa %ymm14, 608(%r11) +vpaddw %ymm1, %ymm12, %ymm15 +vmovdqa %ymm15, 704(%r11) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 800(%r11) +vmovdqa %ymm4, 5216(%r11) +vmovdqa %ymm5, 5312(%r11) +vpaddw %ymm4, %ymm5, %ymm14 +vmovdqa %ymm14, 5408(%r11) +vmovdqa %ymm6, 5504(%r11) +vmovdqa %ymm7, 5600(%r11) +vpaddw %ymm6, %ymm7, %ymm14 +vmovdqa %ymm14, 5696(%r11) +vpaddw %ymm4, %ymm6, %ymm14 +vmovdqa %ymm14, 5792(%r11) +vpaddw %ymm5, %ymm7, %ymm15 +vmovdqa %ymm15, 5888(%r11) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 5984(%r11) +vmovdqa %ymm0, 0(%rsp) +vmovdqa %ymm1, 32(%rsp) +vmovdqa %ymm2, 64(%rsp) +vmovdqa %ymm12, 96(%rsp) +vmovdqa %ymm8, 128(%rsp) +vmovdqa %ymm9, 160(%rsp) +vmovdqa %ymm10, 192(%rsp) +vmovdqa %ymm11, 224(%rsp) +vmovdqu 736(%rdx), %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm1 +vpaddw 128(%rsp), %ymm4, %ymm2 +vpaddw %ymm2, %ymm1, %ymm8 +vpsubw %ymm2, %ymm1, %ymm12 +vmovdqa %ymm0, 256(%rsp) +vmovdqu 824(%rdx), %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm1 +vpaddw 160(%rsp), %ymm5, %ymm2 +vpaddw %ymm2, %ymm1, %ymm9 +vpsubw %ymm2, %ymm1, %ymm13 +vmovdqa %ymm0, 288(%rsp) +vmovdqu 912(%rdx), %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm1 +vpaddw 192(%rsp), %ymm6, %ymm2 +vpaddw %ymm2, %ymm1, %ymm10 +vpsubw %ymm2, %ymm1, %ymm14 +vmovdqa %ymm0, 320(%rsp) +vmovdqu 1000(%rdx), %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm1 +vpaddw 224(%rsp), %ymm7, %ymm2 +vpaddw %ymm2, %ymm1, %ymm11 +vpsubw %ymm2, 
%ymm1, %ymm15 +vmovdqa %ymm0, 352(%rsp) +vmovdqa %ymm8, 896(%r11) +vmovdqa %ymm9, 992(%r11) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 1088(%r11) +vmovdqa %ymm10, 1184(%r11) +vmovdqa %ymm11, 1280(%r11) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 1376(%r11) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 1472(%r11) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 1568(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 1664(%r11) +vmovdqa %ymm12, 1760(%r11) +vmovdqa %ymm13, 1856(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 1952(%r11) +vmovdqa %ymm14, 2048(%r11) +vmovdqa %ymm15, 2144(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 2240(%r11) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 2336(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 2432(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 2528(%r11) +vmovdqa 256(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm4, %ymm1 +vpaddw 128(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm8 +vpsubw %ymm1, %ymm0, %ymm12 +vmovdqa 288(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm5, %ymm1 +vpaddw 160(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm9 +vpsubw %ymm1, %ymm0, %ymm13 +vmovdqa 320(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm6, %ymm1 +vpaddw 192(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm10 +vpsubw %ymm1, %ymm0, %ymm14 +vmovdqa 352(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm7, %ymm1 +vpaddw 224(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm11 +vpsubw %ymm1, %ymm0, %ymm15 +vmovdqa %ymm8, 2624(%r11) +vmovdqa %ymm9, 2720(%r11) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 2816(%r11) +vmovdqa %ymm10, 2912(%r11) +vmovdqa %ymm11, 3008(%r11) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 3104(%r11) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 3200(%r11) +vpaddw %ymm9, 
%ymm11, %ymm1 +vmovdqa %ymm1, 3296(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 3392(%r11) +vmovdqa %ymm12, 3488(%r11) +vmovdqa %ymm13, 3584(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 3680(%r11) +vmovdqa %ymm14, 3776(%r11) +vmovdqa %ymm15, 3872(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 3968(%r11) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4064(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 4160(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 4256(%r11) +vpmullw %ymm3, %ymm4, %ymm0 +vpaddw 256(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 128(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm12 +vpmullw %ymm3, %ymm5, %ymm0 +vpaddw 288(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 160(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm13 +vpmullw %ymm3, %ymm6, %ymm0 +vpaddw 320(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 192(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm14 +vpmullw %ymm3, %ymm7, %ymm0 +vpaddw 352(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 224(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm15 +vmovdqa %ymm12, 4352(%r11) +vmovdqa %ymm13, 4448(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 4544(%r11) +vmovdqa %ymm14, 4640(%r11) +vmovdqa %ymm15, 4736(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 4832(%r11) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4928(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 5024(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 5120(%r11) +vmovdqu 64(%rdx), %ymm0 +vmovdqu 152(%rdx), %ymm1 +vmovdqu 240(%rdx), %ymm2 +vmovdqu 328(%rdx), %ymm12 +vmovdqu 1120(%rdx), %ymm4 +vmovdqu 1208(%rdx), %ymm5 +vmovdqu 1296(%rdx), %ymm6 +vmovdqu 1384(%rdx), %ymm7 +vpand mask_low9words(%rip), %ymm7, %ymm7 +vmovdqu 416(%rdx), %ymm8 +vmovdqu 504(%rdx), %ymm9 +vmovdqu 592(%rdx), %ymm10 +vmovdqu 680(%rdx), %ymm11 +vmovdqa %ymm0, 
64(%r11) +vmovdqa %ymm1, 160(%r11) +vpaddw %ymm0, %ymm1, %ymm14 +vmovdqa %ymm14, 256(%r11) +vmovdqa %ymm2, 352(%r11) +vmovdqa %ymm12, 448(%r11) +vpaddw %ymm2, %ymm12, %ymm14 +vmovdqa %ymm14, 544(%r11) +vpaddw %ymm0, %ymm2, %ymm14 +vmovdqa %ymm14, 640(%r11) +vpaddw %ymm1, %ymm12, %ymm15 +vmovdqa %ymm15, 736(%r11) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 832(%r11) +vmovdqa %ymm4, 5248(%r11) +vmovdqa %ymm5, 5344(%r11) +vpaddw %ymm4, %ymm5, %ymm14 +vmovdqa %ymm14, 5440(%r11) +vmovdqa %ymm6, 5536(%r11) +vmovdqa %ymm7, 5632(%r11) +vpaddw %ymm6, %ymm7, %ymm14 +vmovdqa %ymm14, 5728(%r11) +vpaddw %ymm4, %ymm6, %ymm14 +vmovdqa %ymm14, 5824(%r11) +vpaddw %ymm5, %ymm7, %ymm15 +vmovdqa %ymm15, 5920(%r11) +vpaddw %ymm14, %ymm15, %ymm14 +vmovdqa %ymm14, 6016(%r11) +vmovdqa %ymm0, 0(%rsp) +vmovdqa %ymm1, 32(%rsp) +vmovdqa %ymm2, 64(%rsp) +vmovdqa %ymm12, 96(%rsp) +vmovdqa %ymm8, 128(%rsp) +vmovdqa %ymm9, 160(%rsp) +vmovdqa %ymm10, 192(%rsp) +vmovdqa %ymm11, 224(%rsp) +vmovdqu 768(%rdx), %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm1 +vpaddw 128(%rsp), %ymm4, %ymm2 +vpaddw %ymm2, %ymm1, %ymm8 +vpsubw %ymm2, %ymm1, %ymm12 +vmovdqa %ymm0, 256(%rsp) +vmovdqu 856(%rdx), %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm1 +vpaddw 160(%rsp), %ymm5, %ymm2 +vpaddw %ymm2, %ymm1, %ymm9 +vpsubw %ymm2, %ymm1, %ymm13 +vmovdqa %ymm0, 288(%rsp) +vmovdqu 944(%rdx), %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm1 +vpaddw 192(%rsp), %ymm6, %ymm2 +vpaddw %ymm2, %ymm1, %ymm10 +vpsubw %ymm2, %ymm1, %ymm14 +vmovdqa %ymm0, 320(%rsp) +vmovdqu 1032(%rdx), %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm1 +vpaddw 224(%rsp), %ymm7, %ymm2 +vpaddw %ymm2, %ymm1, %ymm11 +vpsubw %ymm2, %ymm1, %ymm15 +vmovdqa %ymm0, 352(%rsp) +vmovdqa %ymm8, 928(%r11) +vmovdqa %ymm9, 1024(%r11) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 1120(%r11) +vmovdqa %ymm10, 1216(%r11) +vmovdqa %ymm11, 1312(%r11) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 1408(%r11) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 1504(%r11) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 1600(%r11) 
+vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 1696(%r11) +vmovdqa %ymm12, 1792(%r11) +vmovdqa %ymm13, 1888(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 1984(%r11) +vmovdqa %ymm14, 2080(%r11) +vmovdqa %ymm15, 2176(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 2272(%r11) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 2368(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 2464(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 2560(%r11) +vmovdqa 256(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm4, %ymm1 +vpaddw 128(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm8 +vpsubw %ymm1, %ymm0, %ymm12 +vmovdqa 288(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm5, %ymm1 +vpaddw 160(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm9 +vpsubw %ymm1, %ymm0, %ymm13 +vmovdqa 320(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm6, %ymm1 +vpaddw 192(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm10 +vpsubw %ymm1, %ymm0, %ymm14 +vmovdqa 352(%rsp), %ymm0 +vpsllw $2, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm0 +vpsllw $2, %ymm7, %ymm1 +vpaddw 224(%rsp), %ymm1, %ymm1 +vpsllw $1, %ymm1, %ymm1 +vpaddw %ymm1, %ymm0, %ymm11 +vpsubw %ymm1, %ymm0, %ymm15 +vmovdqa %ymm8, 2656(%r11) +vmovdqa %ymm9, 2752(%r11) +vpaddw %ymm8, %ymm9, %ymm0 +vmovdqa %ymm0, 2848(%r11) +vmovdqa %ymm10, 2944(%r11) +vmovdqa %ymm11, 3040(%r11) +vpaddw %ymm10, %ymm11, %ymm0 +vmovdqa %ymm0, 3136(%r11) +vpaddw %ymm8, %ymm10, %ymm0 +vmovdqa %ymm0, 3232(%r11) +vpaddw %ymm9, %ymm11, %ymm1 +vmovdqa %ymm1, 3328(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 3424(%r11) +vmovdqa %ymm12, 3520(%r11) +vmovdqa %ymm13, 3616(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 3712(%r11) +vmovdqa %ymm14, 3808(%r11) +vmovdqa %ymm15, 3904(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 4000(%r11) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 
4096(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 4192(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 4288(%r11) +vpmullw %ymm3, %ymm4, %ymm0 +vpaddw 256(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 128(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 0(%rsp), %ymm0, %ymm12 +vpmullw %ymm3, %ymm5, %ymm0 +vpaddw 288(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 160(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 32(%rsp), %ymm0, %ymm13 +vpmullw %ymm3, %ymm6, %ymm0 +vpaddw 320(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 192(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 64(%rsp), %ymm0, %ymm14 +vpmullw %ymm3, %ymm7, %ymm0 +vpaddw 352(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 224(%rsp), %ymm0, %ymm0 +vpmullw %ymm3, %ymm0, %ymm0 +vpaddw 96(%rsp), %ymm0, %ymm15 +vmovdqa %ymm12, 4384(%r11) +vmovdqa %ymm13, 4480(%r11) +vpaddw %ymm12, %ymm13, %ymm0 +vmovdqa %ymm0, 4576(%r11) +vmovdqa %ymm14, 4672(%r11) +vmovdqa %ymm15, 4768(%r11) +vpaddw %ymm14, %ymm15, %ymm0 +vmovdqa %ymm0, 4864(%r11) +vpaddw %ymm12, %ymm14, %ymm0 +vmovdqa %ymm0, 4960(%r11) +vpaddw %ymm13, %ymm15, %ymm1 +vmovdqa %ymm1, 5056(%r11) +vpaddw %ymm0, %ymm1, %ymm0 +vmovdqa %ymm0, 5152(%r11) +subq $9408, %rsp +mov $4, %ecx +karatsuba_loop_4eced63f144beffcb0247f9c6f67d165: +mov %rsp, %r9 +mov %rsp, %r10 +subq $32, %rsp +vmovdqa 0(%rax), %ymm0 +vmovdqa 192(%rax), %ymm1 +vmovdqa 384(%rax), %ymm2 +vmovdqa 576(%rax), %ymm3 +vpunpcklwd 96(%rax), %ymm0, %ymm4 +vpunpckhwd 96(%rax), %ymm0, %ymm5 +vpunpcklwd 288(%rax), %ymm1, %ymm6 +vpunpckhwd 288(%rax), %ymm1, %ymm7 +vpunpcklwd 480(%rax), %ymm2, %ymm8 +vpunpckhwd 480(%rax), %ymm2, %ymm9 +vpunpcklwd 672(%rax), %ymm3, %ymm10 +vpunpckhwd 672(%rax), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 
+vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 768(%rax), %ymm0 +vmovdqa 960(%rax), %ymm1 +vmovdqa 1152(%rax), %ymm2 +vmovdqa 1344(%rax), %ymm3 +vpunpcklwd 864(%rax), %ymm0, %ymm12 +vpunpckhwd 864(%rax), %ymm0, %ymm13 +vpunpcklwd 1056(%rax), %ymm1, %ymm14 +vpunpckhwd 1056(%rax), %ymm1, %ymm15 +vpunpcklwd 1248(%rax), %ymm2, %ymm0 +vpunpckhwd 1248(%rax), %ymm2, %ymm1 +vpunpcklwd 1440(%rax), %ymm3, %ymm2 +vpunpckhwd 1440(%rax), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 0(%r9) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 32(%r9) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 64(%r9) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 96(%r9) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 128(%r9) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 160(%r9) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 192(%r9) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 256(%r9) +vinserti128 $0, 
%xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 288(%r9) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 320(%r9) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 352(%r9) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 384(%r9) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 416(%r9) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 448(%r9) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 224(%r9) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 480(%r9) +vmovdqa 32(%rax), %ymm0 +vmovdqa 224(%rax), %ymm1 +vmovdqa 416(%rax), %ymm2 +vmovdqa 608(%rax), %ymm3 +vpunpcklwd 128(%rax), %ymm0, %ymm4 +vpunpckhwd 128(%rax), %ymm0, %ymm5 +vpunpcklwd 320(%rax), %ymm1, %ymm6 +vpunpckhwd 320(%rax), %ymm1, %ymm7 +vpunpcklwd 512(%rax), %ymm2, %ymm8 +vpunpckhwd 512(%rax), %ymm2, %ymm9 +vpunpcklwd 704(%rax), %ymm3, %ymm10 +vpunpckhwd 704(%rax), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 800(%rax), %ymm0 +vmovdqa 992(%rax), %ymm1 +vmovdqa 1184(%rax), %ymm2 +vmovdqa 1376(%rax), %ymm3 +vpunpcklwd 896(%rax), %ymm0, %ymm12 +vpunpckhwd 896(%rax), %ymm0, %ymm13 +vpunpcklwd 1088(%rax), %ymm1, %ymm14 +vpunpckhwd 1088(%rax), %ymm1, %ymm15 +vpunpcklwd 1280(%rax), %ymm2, %ymm0 +vpunpckhwd 1280(%rax), %ymm2, %ymm1 +vpunpcklwd 1472(%rax), %ymm3, %ymm2 +vpunpckhwd 1472(%rax), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq 
%ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 512(%r9) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 544(%r9) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 576(%r9) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 608(%r9) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 640(%r9) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 672(%r9) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 704(%r9) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 768(%r9) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 800(%r9) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 832(%r9) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 864(%r9) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 896(%r9) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 928(%r9) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 960(%r9) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 736(%r9) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 992(%r9) +vmovdqa 64(%rax), %ymm0 +vmovdqa 256(%rax), %ymm1 +vmovdqa 448(%rax), %ymm2 +vmovdqa 640(%rax), %ymm3 +vpunpcklwd 160(%rax), %ymm0, %ymm4 +vpunpckhwd 160(%rax), %ymm0, %ymm5 +vpunpcklwd 352(%rax), %ymm1, %ymm6 +vpunpckhwd 352(%rax), %ymm1, %ymm7 +vpunpcklwd 
544(%rax), %ymm2, %ymm8 +vpunpckhwd 544(%rax), %ymm2, %ymm9 +vpunpcklwd 736(%rax), %ymm3, %ymm10 +vpunpckhwd 736(%rax), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 832(%rax), %ymm0 +vmovdqa 1024(%rax), %ymm1 +vmovdqa 1216(%rax), %ymm2 +vmovdqa 1408(%rax), %ymm3 +vpunpcklwd 928(%rax), %ymm0, %ymm12 +vpunpckhwd 928(%rax), %ymm0, %ymm13 +vpunpcklwd 1120(%rax), %ymm1, %ymm14 +vpunpckhwd 1120(%rax), %ymm1, %ymm15 +vpunpcklwd 1312(%rax), %ymm2, %ymm0 +vpunpckhwd 1312(%rax), %ymm2, %ymm1 +vpunpcklwd 1504(%rax), %ymm3, %ymm2 +vpunpckhwd 1504(%rax), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 1024(%r9) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 1056(%r9) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 1088(%r9) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 1120(%r9) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 1152(%r9) +vinserti128 $1, 
%xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 1184(%r9) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 1216(%r9) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 1280(%r9) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 1312(%r9) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 1344(%r9) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 1376(%r9) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 1248(%r9) +addq $32, %rsp +subq $32, %rsp +vmovdqa 0(%r11), %ymm0 +vmovdqa 192(%r11), %ymm1 +vmovdqa 384(%r11), %ymm2 +vmovdqa 576(%r11), %ymm3 +vpunpcklwd 96(%r11), %ymm0, %ymm4 +vpunpckhwd 96(%r11), %ymm0, %ymm5 +vpunpcklwd 288(%r11), %ymm1, %ymm6 +vpunpckhwd 288(%r11), %ymm1, %ymm7 +vpunpcklwd 480(%r11), %ymm2, %ymm8 +vpunpckhwd 480(%r11), %ymm2, %ymm9 +vpunpcklwd 672(%r11), %ymm3, %ymm10 +vpunpckhwd 672(%r11), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 768(%r11), %ymm0 +vmovdqa 960(%r11), %ymm1 +vmovdqa 1152(%r11), %ymm2 +vmovdqa 1344(%r11), %ymm3 +vpunpcklwd 864(%r11), %ymm0, %ymm12 +vpunpckhwd 864(%r11), %ymm0, %ymm13 +vpunpcklwd 1056(%r11), %ymm1, %ymm14 +vpunpckhwd 1056(%r11), %ymm1, %ymm15 +vpunpcklwd 1248(%r11), %ymm2, %ymm0 +vpunpckhwd 1248(%r11), %ymm2, %ymm1 +vpunpcklwd 1440(%r11), %ymm3, %ymm2 +vpunpckhwd 
1440(%r11), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 1408(%r9) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 1440(%r9) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 1472(%r9) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 1504(%r9) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 1536(%r9) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 1568(%r9) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 1600(%r9) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 1664(%r9) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 1696(%r9) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 1728(%r9) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 1760(%r9) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 1792(%r9) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 1824(%r9) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 1856(%r9) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 1632(%r9) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 1888(%r9) +vmovdqa 32(%r11), %ymm0 +vmovdqa 224(%r11), %ymm1 +vmovdqa 416(%r11), %ymm2 +vmovdqa 608(%r11), %ymm3 +vpunpcklwd 
128(%r11), %ymm0, %ymm4 +vpunpckhwd 128(%r11), %ymm0, %ymm5 +vpunpcklwd 320(%r11), %ymm1, %ymm6 +vpunpckhwd 320(%r11), %ymm1, %ymm7 +vpunpcklwd 512(%r11), %ymm2, %ymm8 +vpunpckhwd 512(%r11), %ymm2, %ymm9 +vpunpcklwd 704(%r11), %ymm3, %ymm10 +vpunpckhwd 704(%r11), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 800(%r11), %ymm0 +vmovdqa 992(%r11), %ymm1 +vmovdqa 1184(%r11), %ymm2 +vmovdqa 1376(%r11), %ymm3 +vpunpcklwd 896(%r11), %ymm0, %ymm12 +vpunpckhwd 896(%r11), %ymm0, %ymm13 +vpunpcklwd 1088(%r11), %ymm1, %ymm14 +vpunpckhwd 1088(%r11), %ymm1, %ymm15 +vpunpcklwd 1280(%r11), %ymm2, %ymm0 +vpunpckhwd 1280(%r11), %ymm2, %ymm1 +vpunpcklwd 1472(%r11), %ymm3, %ymm2 +vpunpckhwd 1472(%r11), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 1920(%r9) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 1952(%r9) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 1984(%r9) 
+vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 2016(%r9) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 2048(%r9) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 2080(%r9) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 2112(%r9) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 2176(%r9) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 2208(%r9) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 2240(%r9) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 2272(%r9) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 2304(%r9) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 2336(%r9) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 2368(%r9) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 2144(%r9) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 2400(%r9) +vmovdqa 64(%r11), %ymm0 +vmovdqa 256(%r11), %ymm1 +vmovdqa 448(%r11), %ymm2 +vmovdqa 640(%r11), %ymm3 +vpunpcklwd 160(%r11), %ymm0, %ymm4 +vpunpckhwd 160(%r11), %ymm0, %ymm5 +vpunpcklwd 352(%r11), %ymm1, %ymm6 +vpunpckhwd 352(%r11), %ymm1, %ymm7 +vpunpcklwd 544(%r11), %ymm2, %ymm8 +vpunpckhwd 544(%r11), %ymm2, %ymm9 +vpunpcklwd 736(%r11), %ymm3, %ymm10 +vpunpckhwd 736(%r11), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq 
%ymm15, %ymm3, %ymm11 +vmovdqa 832(%r11), %ymm0 +vmovdqa 1024(%r11), %ymm1 +vmovdqa 1216(%r11), %ymm2 +vmovdqa 1408(%r11), %ymm3 +vpunpcklwd 928(%r11), %ymm0, %ymm12 +vpunpckhwd 928(%r11), %ymm0, %ymm13 +vpunpcklwd 1120(%r11), %ymm1, %ymm14 +vpunpckhwd 1120(%r11), %ymm1, %ymm15 +vpunpcklwd 1312(%r11), %ymm2, %ymm0 +vpunpckhwd 1312(%r11), %ymm2, %ymm1 +vpunpcklwd 1504(%r11), %ymm3, %ymm2 +vpunpckhwd 1504(%r11), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 2432(%r9) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 2464(%r9) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 2496(%r9) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 2528(%r9) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 2560(%r9) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 2592(%r9) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 2624(%r9) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 2688(%r9) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 2720(%r9) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 2752(%r9) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 2784(%r9) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 2656(%r9) 
+addq $32, %rsp +innerloop_4eced63f144beffcb0247f9c6f67d165: +vmovdqa 0(%r9), %ymm0 +vmovdqa 1408(%r9), %ymm6 +vmovdqa 32(%r9), %ymm1 +vmovdqa 1440(%r9), %ymm7 +vmovdqa 64(%r9), %ymm2 +vmovdqa 1472(%r9), %ymm8 +vmovdqa 96(%r9), %ymm3 +vmovdqa 1504(%r9), %ymm9 +vmovdqa 128(%r9), %ymm4 +vmovdqa 1536(%r9), %ymm10 +vmovdqa 160(%r9), %ymm5 +vmovdqa 1568(%r9), %ymm11 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 2816(%r10) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 2848(%r10) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 2880(%r10) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 2912(%r10) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 2944(%r10) +vpmullw %ymm0, %ymm11, %ymm13 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 2976(%r10) +vpmullw %ymm1, %ymm11, %ymm12 +vpmullw %ymm2, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3008(%r10) +vpmullw %ymm2, %ymm11, %ymm13 +vpmullw %ymm3, %ymm10, %ymm15 
+vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3040(%r10) +vpmullw %ymm3, %ymm11, %ymm12 +vpmullw %ymm4, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3072(%r10) +vpmullw %ymm4, %ymm11, %ymm13 +vpmullw %ymm5, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3104(%r10) +vpmullw %ymm5, %ymm11, %ymm12 +vmovdqa %ymm12, 3136(%r10) +vmovdqa 192(%r9), %ymm0 +vmovdqa 1600(%r9), %ymm6 +vmovdqa 224(%r9), %ymm1 +vmovdqa 1632(%r9), %ymm7 +vmovdqa 256(%r9), %ymm2 +vmovdqa 1664(%r9), %ymm8 +vmovdqa 288(%r9), %ymm3 +vmovdqa 1696(%r9), %ymm9 +vmovdqa 320(%r9), %ymm4 +vmovdqa 1728(%r9), %ymm10 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 3200(%r10) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3232(%r10) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3264(%r10) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3296(%r10) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3328(%r10) +vpmullw %ymm1, %ymm10, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3360(%r10) +vpmullw %ymm2, %ymm10, %ymm12 +vpmullw %ymm3, %ymm9, 
%ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3392(%r10) +vpmullw %ymm3, %ymm10, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3424(%r10) +vpmullw %ymm4, %ymm10, %ymm12 +vmovdqa %ymm12, 3456(%r10) +vpaddw 0(%r9), %ymm0, %ymm0 +vpaddw 1408(%r9), %ymm6, %ymm6 +vpaddw 32(%r9), %ymm1, %ymm1 +vpaddw 1440(%r9), %ymm7, %ymm7 +vpaddw 64(%r9), %ymm2, %ymm2 +vpaddw 1472(%r9), %ymm8, %ymm8 +vpaddw 96(%r9), %ymm3, %ymm3 +vpaddw 1504(%r9), %ymm9, %ymm9 +vpaddw 128(%r9), %ymm4, %ymm4 +vpaddw 1536(%r9), %ymm10, %ymm10 +vpmullw %ymm0, %ymm11, %ymm12 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpsubw 2976(%r10), %ymm12, %ymm12 +vpsubw 3360(%r10), %ymm12, %ymm12 +vmovdqa %ymm12, 3168(%r10) +vpmullw %ymm5, %ymm7, %ymm12 +vpmullw %ymm5, %ymm8, %ymm13 +vpmullw %ymm5, %ymm9, %ymm14 +vpmullw %ymm5, %ymm10, %ymm15 +vpmullw %ymm1, %ymm11, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm10, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm3, %ymm9, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm4, %ymm8, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm11, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm10, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm4, %ymm9, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm11, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm10, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm11, %ymm5 +vpaddw %ymm5, %ymm15, %ymm15 +vpmullw %ymm0, %ymm10, %ymm11 +vpmullw %ymm1, %ymm9, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm2, %ymm8, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm3, %ymm7, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm4, 
%ymm6, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm0, %ymm9, %ymm10 +vpmullw %ymm1, %ymm8, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm2, %ymm7, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm3, %ymm6, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm0, %ymm8, %ymm9 +vpmullw %ymm1, %ymm7, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm2, %ymm6, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm0, %ymm7, %ymm8 +vpmullw %ymm1, %ymm6, %ymm5 +vpaddw %ymm5, %ymm8, %ymm8 +vpmullw %ymm0, %ymm6, %ymm7 +vmovdqa 3008(%r10), %ymm0 +vpsubw 3200(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm12, %ymm6 +vpsubw 3392(%r10), %ymm6, %ymm6 +vmovdqa %ymm6, 3200(%r10) +vpaddw %ymm7, %ymm0, %ymm0 +vpsubw 2816(%r10), %ymm0, %ymm0 +vmovdqa %ymm0, 3008(%r10) +vmovdqa 3040(%r10), %ymm1 +vpsubw 3232(%r10), %ymm1, %ymm1 +vpsubw %ymm1, %ymm13, %ymm7 +vpsubw 3424(%r10), %ymm7, %ymm7 +vmovdqa %ymm7, 3232(%r10) +vpaddw %ymm8, %ymm1, %ymm1 +vpsubw 2848(%r10), %ymm1, %ymm1 +vmovdqa %ymm1, 3040(%r10) +vmovdqa 3072(%r10), %ymm2 +vpsubw 3264(%r10), %ymm2, %ymm2 +vpsubw %ymm2, %ymm14, %ymm8 +vpsubw 3456(%r10), %ymm8, %ymm8 +vmovdqa %ymm8, 3264(%r10) +vpaddw %ymm9, %ymm2, %ymm2 +vpsubw 2880(%r10), %ymm2, %ymm2 +vmovdqa %ymm2, 3072(%r10) +vmovdqa 3104(%r10), %ymm3 +vpsubw 3296(%r10), %ymm3, %ymm3 +vpsubw %ymm3, %ymm15, %ymm9 +vmovdqa %ymm9, 3296(%r10) +vpaddw %ymm10, %ymm3, %ymm3 +vpsubw 2912(%r10), %ymm3, %ymm3 +vmovdqa %ymm3, 3104(%r10) +vmovdqa 3136(%r10), %ymm4 +vpsubw 3328(%r10), %ymm4, %ymm4 +vpaddw %ymm11, %ymm4, %ymm4 +vpsubw 2944(%r10), %ymm4, %ymm4 +vmovdqa %ymm4, 3136(%r10) +vmovdqa 352(%r9), %ymm0 +vmovdqa 1760(%r9), %ymm6 +vmovdqa 384(%r9), %ymm1 +vmovdqa 1792(%r9), %ymm7 +vmovdqa 416(%r9), %ymm2 +vmovdqa 1824(%r9), %ymm8 +vmovdqa 448(%r9), %ymm3 +vmovdqa 1856(%r9), %ymm9 +vmovdqa 480(%r9), %ymm4 +vmovdqa 1888(%r9), %ymm10 +vmovdqa 512(%r9), %ymm5 +vmovdqa 1920(%r9), %ymm11 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 3520(%r10) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 
+vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3552(%r10) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3584(%r10) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3616(%r10) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3648(%r10) +vpmullw %ymm0, %ymm11, %ymm13 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3680(%r10) +vpmullw %ymm1, %ymm11, %ymm12 +vpmullw %ymm2, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3712(%r10) +vpmullw %ymm2, %ymm11, %ymm13 +vpmullw %ymm3, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3744(%r10) +vpmullw %ymm3, %ymm11, %ymm12 +vpmullw %ymm4, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3776(%r10) +vpmullw %ymm4, %ymm11, %ymm13 +vpmullw %ymm5, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3808(%r10) 
+vpmullw %ymm5, %ymm11, %ymm12 +vmovdqa %ymm12, 3840(%r10) +vmovdqa 544(%r9), %ymm0 +vmovdqa 1952(%r9), %ymm6 +vmovdqa 576(%r9), %ymm1 +vmovdqa 1984(%r9), %ymm7 +vmovdqa 608(%r9), %ymm2 +vmovdqa 2016(%r9), %ymm8 +vmovdqa 640(%r9), %ymm3 +vmovdqa 2048(%r9), %ymm9 +vmovdqa 672(%r9), %ymm4 +vmovdqa 2080(%r9), %ymm10 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 3904(%r10) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 3936(%r10) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 3968(%r10) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 4000(%r10) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 4032(%r10) +vpmullw %ymm1, %ymm10, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 4064(%r10) +vpmullw %ymm2, %ymm10, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 4096(%r10) +vpmullw %ymm3, %ymm10, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 4128(%r10) +vpmullw %ymm4, %ymm10, %ymm12 +vmovdqa %ymm12, 4160(%r10) +vpaddw 352(%r9), %ymm0, %ymm0 +vpaddw 1760(%r9), %ymm6, %ymm6 +vpaddw 384(%r9), %ymm1, %ymm1 +vpaddw 1792(%r9), %ymm7, %ymm7 +vpaddw 416(%r9), %ymm2, %ymm2 +vpaddw 1824(%r9), 
%ymm8, %ymm8 +vpaddw 448(%r9), %ymm3, %ymm3 +vpaddw 1856(%r9), %ymm9, %ymm9 +vpaddw 480(%r9), %ymm4, %ymm4 +vpaddw 1888(%r9), %ymm10, %ymm10 +vpmullw %ymm0, %ymm11, %ymm12 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpsubw 3680(%r10), %ymm12, %ymm12 +vpsubw 4064(%r10), %ymm12, %ymm12 +vmovdqa %ymm12, 3872(%r10) +vpmullw %ymm5, %ymm7, %ymm12 +vpmullw %ymm5, %ymm8, %ymm13 +vpmullw %ymm5, %ymm9, %ymm14 +vpmullw %ymm5, %ymm10, %ymm15 +vpmullw %ymm1, %ymm11, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm10, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm3, %ymm9, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm4, %ymm8, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm11, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm10, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm4, %ymm9, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm11, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm10, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm11, %ymm5 +vpaddw %ymm5, %ymm15, %ymm15 +vpmullw %ymm0, %ymm10, %ymm11 +vpmullw %ymm1, %ymm9, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm2, %ymm8, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm3, %ymm7, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm4, %ymm6, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm0, %ymm9, %ymm10 +vpmullw %ymm1, %ymm8, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm2, %ymm7, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm3, %ymm6, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm0, %ymm8, %ymm9 +vpmullw %ymm1, %ymm7, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm2, %ymm6, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm0, %ymm7, %ymm8 +vpmullw %ymm1, %ymm6, %ymm5 +vpaddw %ymm5, %ymm8, %ymm8 
+vpmullw %ymm0, %ymm6, %ymm7 +vmovdqa 3712(%r10), %ymm0 +vpsubw 3904(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm12, %ymm6 +vpsubw 4096(%r10), %ymm6, %ymm6 +vmovdqa %ymm6, 3904(%r10) +vpaddw %ymm7, %ymm0, %ymm0 +vpsubw 3520(%r10), %ymm0, %ymm0 +vmovdqa %ymm0, 3712(%r10) +vmovdqa 3744(%r10), %ymm1 +vpsubw 3936(%r10), %ymm1, %ymm1 +vpsubw %ymm1, %ymm13, %ymm7 +vpsubw 4128(%r10), %ymm7, %ymm7 +vmovdqa %ymm7, 3936(%r10) +vpaddw %ymm8, %ymm1, %ymm1 +vpsubw 3552(%r10), %ymm1, %ymm1 +vmovdqa %ymm1, 3744(%r10) +vmovdqa 3776(%r10), %ymm2 +vpsubw 3968(%r10), %ymm2, %ymm2 +vpsubw %ymm2, %ymm14, %ymm8 +vpsubw 4160(%r10), %ymm8, %ymm8 +vmovdqa %ymm8, 3968(%r10) +vpaddw %ymm9, %ymm2, %ymm2 +vpsubw 3584(%r10), %ymm2, %ymm2 +vmovdqa %ymm2, 3776(%r10) +vmovdqa 3808(%r10), %ymm3 +vpsubw 4000(%r10), %ymm3, %ymm3 +vpsubw %ymm3, %ymm15, %ymm9 +vmovdqa %ymm9, 4000(%r10) +vpaddw %ymm10, %ymm3, %ymm3 +vpsubw 3616(%r10), %ymm3, %ymm3 +vmovdqa %ymm3, 3808(%r10) +vmovdqa 3840(%r10), %ymm4 +vpsubw 4032(%r10), %ymm4, %ymm4 +vpaddw %ymm11, %ymm4, %ymm4 +vpsubw 3648(%r10), %ymm4, %ymm4 +vmovdqa %ymm4, 3840(%r10) +vmovdqa 0(%r9), %ymm0 +vmovdqa 1408(%r9), %ymm6 +vpaddw 352(%r9), %ymm0, %ymm0 +vpaddw 1760(%r9), %ymm6, %ymm6 +vmovdqa 32(%r9), %ymm1 +vmovdqa 1440(%r9), %ymm7 +vpaddw 384(%r9), %ymm1, %ymm1 +vpaddw 1792(%r9), %ymm7, %ymm7 +vmovdqa 64(%r9), %ymm2 +vmovdqa 1472(%r9), %ymm8 +vpaddw 416(%r9), %ymm2, %ymm2 +vpaddw 1824(%r9), %ymm8, %ymm8 +vmovdqa 96(%r9), %ymm3 +vmovdqa 1504(%r9), %ymm9 +vpaddw 448(%r9), %ymm3, %ymm3 +vpaddw 1856(%r9), %ymm9, %ymm9 +vmovdqa 128(%r9), %ymm4 +vmovdqa 1536(%r9), %ymm10 +vpaddw 480(%r9), %ymm4, %ymm4 +vpaddw 1888(%r9), %ymm10, %ymm10 +vmovdqa 160(%r9), %ymm5 +vmovdqa 1568(%r9), %ymm11 +vpaddw 512(%r9), %ymm5, %ymm5 +vpaddw 1920(%r9), %ymm11, %ymm11 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 5888(%rsp) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 5920(%rsp) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw 
%ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 5952(%rsp) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 5984(%rsp) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6016(%rsp) +vpmullw %ymm0, %ymm11, %ymm13 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6048(%rsp) +vpmullw %ymm1, %ymm11, %ymm12 +vpmullw %ymm2, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6080(%rsp) +vpmullw %ymm2, %ymm11, %ymm13 +vpmullw %ymm3, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6112(%rsp) +vpmullw %ymm3, %ymm11, %ymm12 +vpmullw %ymm4, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6144(%rsp) +vpmullw %ymm4, %ymm11, %ymm13 +vpmullw %ymm5, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6176(%rsp) +vpmullw %ymm5, %ymm11, %ymm12 +vmovdqa %ymm12, 6208(%rsp) +vmovdqa 192(%r9), %ymm0 +vmovdqa 
1600(%r9), %ymm6 +vpaddw 544(%r9), %ymm0, %ymm0 +vpaddw 1952(%r9), %ymm6, %ymm6 +vmovdqa 224(%r9), %ymm1 +vmovdqa 1632(%r9), %ymm7 +vpaddw 576(%r9), %ymm1, %ymm1 +vpaddw 1984(%r9), %ymm7, %ymm7 +vmovdqa 256(%r9), %ymm2 +vmovdqa 1664(%r9), %ymm8 +vpaddw 608(%r9), %ymm2, %ymm2 +vpaddw 2016(%r9), %ymm8, %ymm8 +vmovdqa 288(%r9), %ymm3 +vmovdqa 1696(%r9), %ymm9 +vpaddw 640(%r9), %ymm3, %ymm3 +vpaddw 2048(%r9), %ymm9, %ymm9 +vmovdqa 320(%r9), %ymm4 +vmovdqa 1728(%r9), %ymm10 +vpaddw 672(%r9), %ymm4, %ymm4 +vpaddw 2080(%r9), %ymm10, %ymm10 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 6272(%rsp) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6304(%rsp) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6336(%rsp) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6368(%rsp) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6400(%rsp) +vpmullw %ymm1, %ymm10, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6432(%rsp) +vpmullw %ymm2, %ymm10, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6464(%rsp) +vpmullw %ymm3, %ymm10, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6496(%rsp) +vpmullw %ymm4, 
%ymm10, %ymm12 +vmovdqa %ymm12, 6528(%rsp) +vpaddw 0(%r9), %ymm0, %ymm0 +vpaddw 1408(%r9), %ymm6, %ymm6 +vpaddw 352(%r9), %ymm0, %ymm0 +vpaddw 1760(%r9), %ymm6, %ymm6 +vpaddw 32(%r9), %ymm1, %ymm1 +vpaddw 1440(%r9), %ymm7, %ymm7 +vpaddw 384(%r9), %ymm1, %ymm1 +vpaddw 1792(%r9), %ymm7, %ymm7 +vpaddw 64(%r9), %ymm2, %ymm2 +vpaddw 1472(%r9), %ymm8, %ymm8 +vpaddw 416(%r9), %ymm2, %ymm2 +vpaddw 1824(%r9), %ymm8, %ymm8 +vpaddw 96(%r9), %ymm3, %ymm3 +vpaddw 1504(%r9), %ymm9, %ymm9 +vpaddw 448(%r9), %ymm3, %ymm3 +vpaddw 1856(%r9), %ymm9, %ymm9 +vpaddw 128(%r9), %ymm4, %ymm4 +vpaddw 1536(%r9), %ymm10, %ymm10 +vpaddw 480(%r9), %ymm4, %ymm4 +vpaddw 1888(%r9), %ymm10, %ymm10 +vpmullw %ymm0, %ymm11, %ymm12 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpsubw 6048(%rsp), %ymm12, %ymm12 +vpsubw 6432(%rsp), %ymm12, %ymm12 +vmovdqa %ymm12, 6240(%rsp) +vpmullw %ymm5, %ymm7, %ymm12 +vpmullw %ymm5, %ymm8, %ymm13 +vpmullw %ymm5, %ymm9, %ymm14 +vpmullw %ymm5, %ymm10, %ymm15 +vpmullw %ymm1, %ymm11, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm10, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm3, %ymm9, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm4, %ymm8, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm11, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm10, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm4, %ymm9, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm11, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm10, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm11, %ymm5 +vpaddw %ymm5, %ymm15, %ymm15 +vpmullw %ymm0, %ymm10, %ymm11 +vpmullw %ymm1, %ymm9, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm2, %ymm8, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm3, %ymm7, 
%ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm4, %ymm6, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm0, %ymm9, %ymm10 +vpmullw %ymm1, %ymm8, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm2, %ymm7, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm3, %ymm6, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm0, %ymm8, %ymm9 +vpmullw %ymm1, %ymm7, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm2, %ymm6, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm0, %ymm7, %ymm8 +vpmullw %ymm1, %ymm6, %ymm5 +vpaddw %ymm5, %ymm8, %ymm8 +vpmullw %ymm0, %ymm6, %ymm7 +vmovdqa 6080(%rsp), %ymm0 +vpsubw 6272(%rsp), %ymm0, %ymm0 +vpsubw %ymm0, %ymm12, %ymm6 +vpsubw 6464(%rsp), %ymm6, %ymm6 +vmovdqa %ymm6, 6272(%rsp) +vpaddw %ymm7, %ymm0, %ymm0 +vpsubw 5888(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 6080(%rsp) +vmovdqa 6112(%rsp), %ymm1 +vpsubw 6304(%rsp), %ymm1, %ymm1 +vpsubw %ymm1, %ymm13, %ymm7 +vpsubw 6496(%rsp), %ymm7, %ymm7 +vmovdqa %ymm7, 6304(%rsp) +vpaddw %ymm8, %ymm1, %ymm1 +vpsubw 5920(%rsp), %ymm1, %ymm1 +vmovdqa %ymm1, 6112(%rsp) +vmovdqa 6144(%rsp), %ymm2 +vpsubw 6336(%rsp), %ymm2, %ymm2 +vpsubw %ymm2, %ymm14, %ymm8 +vpsubw 6528(%rsp), %ymm8, %ymm8 +vmovdqa %ymm8, 6336(%rsp) +vpaddw %ymm9, %ymm2, %ymm2 +vpsubw 5952(%rsp), %ymm2, %ymm2 +vmovdqa %ymm2, 6144(%rsp) +vmovdqa 6176(%rsp), %ymm3 +vpsubw 6368(%rsp), %ymm3, %ymm3 +vpsubw %ymm3, %ymm15, %ymm9 +vmovdqa %ymm9, 6368(%rsp) +vpaddw %ymm10, %ymm3, %ymm3 +vpsubw 5984(%rsp), %ymm3, %ymm3 +vmovdqa %ymm3, 6176(%rsp) +vmovdqa 6208(%rsp), %ymm4 +vpsubw 6400(%rsp), %ymm4, %ymm4 +vpaddw %ymm11, %ymm4, %ymm4 +vpsubw 6016(%rsp), %ymm4, %ymm4 +vmovdqa %ymm4, 6208(%rsp) +vmovdqa 6208(%rsp), %ymm0 +vpsubw 3136(%r10), %ymm0, %ymm0 +vpsubw 3840(%r10), %ymm0, %ymm0 +vmovdqa %ymm0, 3488(%r10) +vmovdqa 3168(%r10), %ymm0 +vpsubw 3520(%r10), %ymm0, %ymm0 +vmovdqa 6240(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 3872(%r10), %ymm1, %ymm1 +vpsubw 2816(%r10), %ymm0, %ymm0 +vpaddw 5888(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3168(%r10) +vmovdqa 
%ymm1, 3520(%r10) +vmovdqa 3200(%r10), %ymm0 +vpsubw 3552(%r10), %ymm0, %ymm0 +vmovdqa 6272(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 3904(%r10), %ymm1, %ymm1 +vpsubw 2848(%r10), %ymm0, %ymm0 +vpaddw 5920(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3200(%r10) +vmovdqa %ymm1, 3552(%r10) +vmovdqa 3232(%r10), %ymm0 +vpsubw 3584(%r10), %ymm0, %ymm0 +vmovdqa 6304(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 3936(%r10), %ymm1, %ymm1 +vpsubw 2880(%r10), %ymm0, %ymm0 +vpaddw 5952(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3232(%r10) +vmovdqa %ymm1, 3584(%r10) +vmovdqa 3264(%r10), %ymm0 +vpsubw 3616(%r10), %ymm0, %ymm0 +vmovdqa 6336(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 3968(%r10), %ymm1, %ymm1 +vpsubw 2912(%r10), %ymm0, %ymm0 +vpaddw 5984(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3264(%r10) +vmovdqa %ymm1, 3616(%r10) +vmovdqa 3296(%r10), %ymm0 +vpsubw 3648(%r10), %ymm0, %ymm0 +vmovdqa 6368(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 4000(%r10), %ymm1, %ymm1 +vpsubw 2944(%r10), %ymm0, %ymm0 +vpaddw 6016(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3296(%r10) +vmovdqa %ymm1, 3648(%r10) +vmovdqa 3328(%r10), %ymm0 +vpsubw 3680(%r10), %ymm0, %ymm0 +vmovdqa 6400(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 4032(%r10), %ymm1, %ymm1 +vpsubw 2976(%r10), %ymm0, %ymm0 +vpaddw 6048(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3328(%r10) +vmovdqa %ymm1, 3680(%r10) +vmovdqa 3360(%r10), %ymm0 +vpsubw 3712(%r10), %ymm0, %ymm0 +vmovdqa 6432(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 4064(%r10), %ymm1, %ymm1 +vpsubw 3008(%r10), %ymm0, %ymm0 +vpaddw 6080(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3360(%r10) +vmovdqa %ymm1, 3712(%r10) +vmovdqa 3392(%r10), %ymm0 +vpsubw 3744(%r10), %ymm0, %ymm0 +vmovdqa 6464(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 4096(%r10), %ymm1, %ymm1 +vpsubw 3040(%r10), %ymm0, %ymm0 +vpaddw 6112(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3392(%r10) +vmovdqa %ymm1, 3744(%r10) +vmovdqa 3424(%r10), %ymm0 +vpsubw 3776(%r10), %ymm0, %ymm0 +vmovdqa 6496(%rsp), %ymm1 +vpsubw %ymm0, 
%ymm1, %ymm1 +vpsubw 4128(%r10), %ymm1, %ymm1 +vpsubw 3072(%r10), %ymm0, %ymm0 +vpaddw 6144(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3424(%r10) +vmovdqa %ymm1, 3776(%r10) +vmovdqa 3456(%r10), %ymm0 +vpsubw 3808(%r10), %ymm0, %ymm0 +vmovdqa 6528(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 4160(%r10), %ymm1, %ymm1 +vpsubw 3104(%r10), %ymm0, %ymm0 +vpaddw 6176(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3456(%r10) +vmovdqa %ymm1, 3808(%r10) +neg %ecx +jns done_4eced63f144beffcb0247f9c6f67d165 +add $704, %r9 +add $1408, %r10 +jmp innerloop_4eced63f144beffcb0247f9c6f67d165 +done_4eced63f144beffcb0247f9c6f67d165: +sub $704, %r9 +sub $1408, %r10 +vmovdqa 0(%r9), %ymm0 +vpaddw 704(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6592(%rsp) +vmovdqa 1408(%r9), %ymm0 +vpaddw 2112(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7296(%rsp) +vmovdqa 32(%r9), %ymm0 +vpaddw 736(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6624(%rsp) +vmovdqa 1440(%r9), %ymm0 +vpaddw 2144(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7328(%rsp) +vmovdqa 64(%r9), %ymm0 +vpaddw 768(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6656(%rsp) +vmovdqa 1472(%r9), %ymm0 +vpaddw 2176(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7360(%rsp) +vmovdqa 96(%r9), %ymm0 +vpaddw 800(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6688(%rsp) +vmovdqa 1504(%r9), %ymm0 +vpaddw 2208(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7392(%rsp) +vmovdqa 128(%r9), %ymm0 +vpaddw 832(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6720(%rsp) +vmovdqa 1536(%r9), %ymm0 +vpaddw 2240(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7424(%rsp) +vmovdqa 160(%r9), %ymm0 +vpaddw 864(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6752(%rsp) +vmovdqa 1568(%r9), %ymm0 +vpaddw 2272(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7456(%rsp) +vmovdqa 192(%r9), %ymm0 +vpaddw 896(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6784(%rsp) +vmovdqa 1600(%r9), %ymm0 +vpaddw 2304(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7488(%rsp) +vmovdqa 224(%r9), %ymm0 +vpaddw 928(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6816(%rsp) +vmovdqa 1632(%r9), %ymm0 +vpaddw 2336(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7520(%rsp) +vmovdqa 256(%r9), 
%ymm0 +vpaddw 960(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6848(%rsp) +vmovdqa 1664(%r9), %ymm0 +vpaddw 2368(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7552(%rsp) +vmovdqa 288(%r9), %ymm0 +vpaddw 992(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6880(%rsp) +vmovdqa 1696(%r9), %ymm0 +vpaddw 2400(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7584(%rsp) +vmovdqa 320(%r9), %ymm0 +vpaddw 1024(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6912(%rsp) +vmovdqa 1728(%r9), %ymm0 +vpaddw 2432(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7616(%rsp) +vmovdqa 352(%r9), %ymm0 +vpaddw 1056(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6944(%rsp) +vmovdqa 1760(%r9), %ymm0 +vpaddw 2464(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7648(%rsp) +vmovdqa 384(%r9), %ymm0 +vpaddw 1088(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 6976(%rsp) +vmovdqa 1792(%r9), %ymm0 +vpaddw 2496(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7680(%rsp) +vmovdqa 416(%r9), %ymm0 +vpaddw 1120(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7008(%rsp) +vmovdqa 1824(%r9), %ymm0 +vpaddw 2528(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7712(%rsp) +vmovdqa 448(%r9), %ymm0 +vpaddw 1152(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7040(%rsp) +vmovdqa 1856(%r9), %ymm0 +vpaddw 2560(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7744(%rsp) +vmovdqa 480(%r9), %ymm0 +vpaddw 1184(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7072(%rsp) +vmovdqa 1888(%r9), %ymm0 +vpaddw 2592(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7776(%rsp) +vmovdqa 512(%r9), %ymm0 +vpaddw 1216(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7104(%rsp) +vmovdqa 1920(%r9), %ymm0 +vpaddw 2624(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7808(%rsp) +vmovdqa 544(%r9), %ymm0 +vpaddw 1248(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7136(%rsp) +vmovdqa 1952(%r9), %ymm0 +vpaddw 2656(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7840(%rsp) +vmovdqa 576(%r9), %ymm0 +vpaddw 1280(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7168(%rsp) +vmovdqa 1984(%r9), %ymm0 +vpaddw 2688(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7872(%rsp) +vmovdqa 608(%r9), %ymm0 +vpaddw 1312(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7200(%rsp) +vmovdqa 2016(%r9), %ymm0 +vpaddw 2720(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 
7904(%rsp) +vmovdqa 640(%r9), %ymm0 +vpaddw 1344(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7232(%rsp) +vmovdqa 2048(%r9), %ymm0 +vpaddw 2752(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7936(%rsp) +vmovdqa 672(%r9), %ymm0 +vpaddw 1376(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7264(%rsp) +vmovdqa 2080(%r9), %ymm0 +vpaddw 2784(%r9), %ymm0, %ymm0 +vmovdqa %ymm0, 7968(%rsp) +vmovdqa 6592(%rsp), %ymm0 +vmovdqa 7296(%rsp), %ymm6 +vmovdqa 6624(%rsp), %ymm1 +vmovdqa 7328(%rsp), %ymm7 +vmovdqa 6656(%rsp), %ymm2 +vmovdqa 7360(%rsp), %ymm8 +vmovdqa 6688(%rsp), %ymm3 +vmovdqa 7392(%rsp), %ymm9 +vmovdqa 6720(%rsp), %ymm4 +vmovdqa 7424(%rsp), %ymm10 +vmovdqa 6752(%rsp), %ymm5 +vmovdqa 7456(%rsp), %ymm11 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 8000(%rsp) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8032(%rsp) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8064(%rsp) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8096(%rsp) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8128(%rsp) +vpmullw %ymm0, %ymm11, %ymm13 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8160(%rsp) +vpmullw %ymm1, %ymm11, %ymm12 +vpmullw %ymm2, %ymm10, 
%ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8192(%rsp) +vpmullw %ymm2, %ymm11, %ymm13 +vpmullw %ymm3, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8224(%rsp) +vpmullw %ymm3, %ymm11, %ymm12 +vpmullw %ymm4, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8256(%rsp) +vpmullw %ymm4, %ymm11, %ymm13 +vpmullw %ymm5, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8288(%rsp) +vpmullw %ymm5, %ymm11, %ymm12 +vmovdqa %ymm12, 8320(%rsp) +vmovdqa 6784(%rsp), %ymm0 +vmovdqa 7488(%rsp), %ymm6 +vmovdqa 6816(%rsp), %ymm1 +vmovdqa 7520(%rsp), %ymm7 +vmovdqa 6848(%rsp), %ymm2 +vmovdqa 7552(%rsp), %ymm8 +vmovdqa 6880(%rsp), %ymm3 +vmovdqa 7584(%rsp), %ymm9 +vmovdqa 6912(%rsp), %ymm4 +vmovdqa 7616(%rsp), %ymm10 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 8384(%rsp) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8416(%rsp) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8448(%rsp) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8480(%rsp) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 
+vmovdqa %ymm12, 8512(%rsp) +vpmullw %ymm1, %ymm10, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8544(%rsp) +vpmullw %ymm2, %ymm10, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8576(%rsp) +vpmullw %ymm3, %ymm10, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8608(%rsp) +vpmullw %ymm4, %ymm10, %ymm12 +vmovdqa %ymm12, 8640(%rsp) +vpaddw 6592(%rsp), %ymm0, %ymm0 +vpaddw 7296(%rsp), %ymm6, %ymm6 +vpaddw 6624(%rsp), %ymm1, %ymm1 +vpaddw 7328(%rsp), %ymm7, %ymm7 +vpaddw 6656(%rsp), %ymm2, %ymm2 +vpaddw 7360(%rsp), %ymm8, %ymm8 +vpaddw 6688(%rsp), %ymm3, %ymm3 +vpaddw 7392(%rsp), %ymm9, %ymm9 +vpaddw 6720(%rsp), %ymm4, %ymm4 +vpaddw 7424(%rsp), %ymm10, %ymm10 +vpmullw %ymm0, %ymm11, %ymm12 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpsubw 8160(%rsp), %ymm12, %ymm12 +vpsubw 8544(%rsp), %ymm12, %ymm12 +vmovdqa %ymm12, 8352(%rsp) +vpmullw %ymm5, %ymm7, %ymm12 +vpmullw %ymm5, %ymm8, %ymm13 +vpmullw %ymm5, %ymm9, %ymm14 +vpmullw %ymm5, %ymm10, %ymm15 +vpmullw %ymm1, %ymm11, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm10, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm3, %ymm9, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm4, %ymm8, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm11, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm10, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm4, %ymm9, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm11, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 
+vpmullw %ymm4, %ymm10, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm11, %ymm5 +vpaddw %ymm5, %ymm15, %ymm15 +vpmullw %ymm0, %ymm10, %ymm11 +vpmullw %ymm1, %ymm9, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm2, %ymm8, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm3, %ymm7, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm4, %ymm6, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm0, %ymm9, %ymm10 +vpmullw %ymm1, %ymm8, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm2, %ymm7, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm3, %ymm6, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm0, %ymm8, %ymm9 +vpmullw %ymm1, %ymm7, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm2, %ymm6, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm0, %ymm7, %ymm8 +vpmullw %ymm1, %ymm6, %ymm5 +vpaddw %ymm5, %ymm8, %ymm8 +vpmullw %ymm0, %ymm6, %ymm7 +vmovdqa 8192(%rsp), %ymm0 +vpsubw 8384(%rsp), %ymm0, %ymm0 +vpsubw %ymm0, %ymm12, %ymm6 +vpsubw 8576(%rsp), %ymm6, %ymm6 +vmovdqa %ymm6, 8384(%rsp) +vpaddw %ymm7, %ymm0, %ymm0 +vpsubw 8000(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8192(%rsp) +vmovdqa 8224(%rsp), %ymm1 +vpsubw 8416(%rsp), %ymm1, %ymm1 +vpsubw %ymm1, %ymm13, %ymm7 +vpsubw 8608(%rsp), %ymm7, %ymm7 +vmovdqa %ymm7, 8416(%rsp) +vpaddw %ymm8, %ymm1, %ymm1 +vpsubw 8032(%rsp), %ymm1, %ymm1 +vmovdqa %ymm1, 8224(%rsp) +vmovdqa 8256(%rsp), %ymm2 +vpsubw 8448(%rsp), %ymm2, %ymm2 +vpsubw %ymm2, %ymm14, %ymm8 +vpsubw 8640(%rsp), %ymm8, %ymm8 +vmovdqa %ymm8, 8448(%rsp) +vpaddw %ymm9, %ymm2, %ymm2 +vpsubw 8064(%rsp), %ymm2, %ymm2 +vmovdqa %ymm2, 8256(%rsp) +vmovdqa 8288(%rsp), %ymm3 +vpsubw 8480(%rsp), %ymm3, %ymm3 +vpsubw %ymm3, %ymm15, %ymm9 +vmovdqa %ymm9, 8480(%rsp) +vpaddw %ymm10, %ymm3, %ymm3 +vpsubw 8096(%rsp), %ymm3, %ymm3 +vmovdqa %ymm3, 8288(%rsp) +vmovdqa 8320(%rsp), %ymm4 +vpsubw 8512(%rsp), %ymm4, %ymm4 +vpaddw %ymm11, %ymm4, %ymm4 +vpsubw 8128(%rsp), %ymm4, %ymm4 +vmovdqa %ymm4, 8320(%rsp) +vmovdqa 6944(%rsp), %ymm0 +vmovdqa 7648(%rsp), %ymm6 +vmovdqa 6976(%rsp), %ymm1 
+vmovdqa 7680(%rsp), %ymm7 +vmovdqa 7008(%rsp), %ymm2 +vmovdqa 7712(%rsp), %ymm8 +vmovdqa 7040(%rsp), %ymm3 +vmovdqa 7744(%rsp), %ymm9 +vmovdqa 7072(%rsp), %ymm4 +vmovdqa 7776(%rsp), %ymm10 +vmovdqa 7104(%rsp), %ymm5 +vmovdqa 7808(%rsp), %ymm11 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 8704(%rsp) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8736(%rsp) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8768(%rsp) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8800(%rsp) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8832(%rsp) +vpmullw %ymm0, %ymm11, %ymm13 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8864(%rsp) +vpmullw %ymm1, %ymm11, %ymm12 +vpmullw %ymm2, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8896(%rsp) +vpmullw %ymm2, %ymm11, %ymm13 +vpmullw %ymm3, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm8, %ymm15 
+vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8928(%rsp) +vpmullw %ymm3, %ymm11, %ymm12 +vpmullw %ymm4, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 8960(%rsp) +vpmullw %ymm4, %ymm11, %ymm13 +vpmullw %ymm5, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 8992(%rsp) +vpmullw %ymm5, %ymm11, %ymm12 +vmovdqa %ymm12, 9024(%rsp) +vmovdqa 7136(%rsp), %ymm0 +vmovdqa 7840(%rsp), %ymm6 +vmovdqa 7168(%rsp), %ymm1 +vmovdqa 7872(%rsp), %ymm7 +vmovdqa 7200(%rsp), %ymm2 +vmovdqa 7904(%rsp), %ymm8 +vmovdqa 7232(%rsp), %ymm3 +vmovdqa 7936(%rsp), %ymm9 +vmovdqa 7264(%rsp), %ymm4 +vmovdqa 7968(%rsp), %ymm10 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 9088(%rsp) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 9120(%rsp) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 9152(%rsp) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 9184(%rsp) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 9216(%rsp) +vpmullw %ymm1, %ymm10, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 9248(%rsp) +vpmullw %ymm2, %ymm10, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa 
%ymm12, 9280(%rsp) +vpmullw %ymm3, %ymm10, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 9312(%rsp) +vpmullw %ymm4, %ymm10, %ymm12 +vmovdqa %ymm12, 9344(%rsp) +vpaddw 6944(%rsp), %ymm0, %ymm0 +vpaddw 7648(%rsp), %ymm6, %ymm6 +vpaddw 6976(%rsp), %ymm1, %ymm1 +vpaddw 7680(%rsp), %ymm7, %ymm7 +vpaddw 7008(%rsp), %ymm2, %ymm2 +vpaddw 7712(%rsp), %ymm8, %ymm8 +vpaddw 7040(%rsp), %ymm3, %ymm3 +vpaddw 7744(%rsp), %ymm9, %ymm9 +vpaddw 7072(%rsp), %ymm4, %ymm4 +vpaddw 7776(%rsp), %ymm10, %ymm10 +vpmullw %ymm0, %ymm11, %ymm12 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpsubw 8864(%rsp), %ymm12, %ymm12 +vpsubw 9248(%rsp), %ymm12, %ymm12 +vmovdqa %ymm12, 9056(%rsp) +vpmullw %ymm5, %ymm7, %ymm12 +vpmullw %ymm5, %ymm8, %ymm13 +vpmullw %ymm5, %ymm9, %ymm14 +vpmullw %ymm5, %ymm10, %ymm15 +vpmullw %ymm1, %ymm11, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm10, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm3, %ymm9, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm4, %ymm8, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm11, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm10, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm4, %ymm9, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm11, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm10, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm11, %ymm5 +vpaddw %ymm5, %ymm15, %ymm15 +vpmullw %ymm0, %ymm10, %ymm11 +vpmullw %ymm1, %ymm9, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm2, %ymm8, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm3, %ymm7, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm4, %ymm6, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm0, %ymm9, %ymm10 +vpmullw %ymm1, 
%ymm8, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm2, %ymm7, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm3, %ymm6, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm0, %ymm8, %ymm9 +vpmullw %ymm1, %ymm7, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm2, %ymm6, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm0, %ymm7, %ymm8 +vpmullw %ymm1, %ymm6, %ymm5 +vpaddw %ymm5, %ymm8, %ymm8 +vpmullw %ymm0, %ymm6, %ymm7 +vmovdqa 8896(%rsp), %ymm0 +vpsubw 9088(%rsp), %ymm0, %ymm0 +vpsubw %ymm0, %ymm12, %ymm6 +vpsubw 9280(%rsp), %ymm6, %ymm6 +vmovdqa %ymm6, 9088(%rsp) +vpaddw %ymm7, %ymm0, %ymm0 +vpsubw 8704(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8896(%rsp) +vmovdqa 8928(%rsp), %ymm1 +vpsubw 9120(%rsp), %ymm1, %ymm1 +vpsubw %ymm1, %ymm13, %ymm7 +vpsubw 9312(%rsp), %ymm7, %ymm7 +vmovdqa %ymm7, 9120(%rsp) +vpaddw %ymm8, %ymm1, %ymm1 +vpsubw 8736(%rsp), %ymm1, %ymm1 +vmovdqa %ymm1, 8928(%rsp) +vmovdqa 8960(%rsp), %ymm2 +vpsubw 9152(%rsp), %ymm2, %ymm2 +vpsubw %ymm2, %ymm14, %ymm8 +vpsubw 9344(%rsp), %ymm8, %ymm8 +vmovdqa %ymm8, 9152(%rsp) +vpaddw %ymm9, %ymm2, %ymm2 +vpsubw 8768(%rsp), %ymm2, %ymm2 +vmovdqa %ymm2, 8960(%rsp) +vmovdqa 8992(%rsp), %ymm3 +vpsubw 9184(%rsp), %ymm3, %ymm3 +vpsubw %ymm3, %ymm15, %ymm9 +vmovdqa %ymm9, 9184(%rsp) +vpaddw %ymm10, %ymm3, %ymm3 +vpsubw 8800(%rsp), %ymm3, %ymm3 +vmovdqa %ymm3, 8992(%rsp) +vmovdqa 9024(%rsp), %ymm4 +vpsubw 9216(%rsp), %ymm4, %ymm4 +vpaddw %ymm11, %ymm4, %ymm4 +vpsubw 8832(%rsp), %ymm4, %ymm4 +vmovdqa %ymm4, 9024(%rsp) +vmovdqa 6592(%rsp), %ymm0 +vmovdqa 7296(%rsp), %ymm6 +vpaddw 6944(%rsp), %ymm0, %ymm0 +vpaddw 7648(%rsp), %ymm6, %ymm6 +vmovdqa 6624(%rsp), %ymm1 +vmovdqa 7328(%rsp), %ymm7 +vpaddw 6976(%rsp), %ymm1, %ymm1 +vpaddw 7680(%rsp), %ymm7, %ymm7 +vmovdqa 6656(%rsp), %ymm2 +vmovdqa 7360(%rsp), %ymm8 +vpaddw 7008(%rsp), %ymm2, %ymm2 +vpaddw 7712(%rsp), %ymm8, %ymm8 +vmovdqa 6688(%rsp), %ymm3 +vmovdqa 7392(%rsp), %ymm9 +vpaddw 7040(%rsp), %ymm3, %ymm3 +vpaddw 7744(%rsp), %ymm9, %ymm9 +vmovdqa 6720(%rsp), %ymm4 +vmovdqa 
7424(%rsp), %ymm10 +vpaddw 7072(%rsp), %ymm4, %ymm4 +vpaddw 7776(%rsp), %ymm10, %ymm10 +vmovdqa 6752(%rsp), %ymm5 +vmovdqa 7456(%rsp), %ymm11 +vpaddw 7104(%rsp), %ymm5, %ymm5 +vpaddw 7808(%rsp), %ymm11, %ymm11 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 5888(%rsp) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 5920(%rsp) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 5952(%rsp) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 5984(%rsp) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6016(%rsp) +vpmullw %ymm0, %ymm11, %ymm13 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6048(%rsp) +vpmullw %ymm1, %ymm11, %ymm12 +vpmullw %ymm2, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6080(%rsp) +vpmullw %ymm2, %ymm11, %ymm13 +vpmullw %ymm3, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm5, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 
+vmovdqa %ymm13, 6112(%rsp) +vpmullw %ymm3, %ymm11, %ymm12 +vpmullw %ymm4, %ymm10, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm5, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6144(%rsp) +vpmullw %ymm4, %ymm11, %ymm13 +vpmullw %ymm5, %ymm10, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6176(%rsp) +vpmullw %ymm5, %ymm11, %ymm12 +vmovdqa %ymm12, 6208(%rsp) +vmovdqa 6784(%rsp), %ymm0 +vmovdqa 7488(%rsp), %ymm6 +vpaddw 7136(%rsp), %ymm0, %ymm0 +vpaddw 7840(%rsp), %ymm6, %ymm6 +vmovdqa 6816(%rsp), %ymm1 +vmovdqa 7520(%rsp), %ymm7 +vpaddw 7168(%rsp), %ymm1, %ymm1 +vpaddw 7872(%rsp), %ymm7, %ymm7 +vmovdqa 6848(%rsp), %ymm2 +vmovdqa 7552(%rsp), %ymm8 +vpaddw 7200(%rsp), %ymm2, %ymm2 +vpaddw 7904(%rsp), %ymm8, %ymm8 +vmovdqa 6880(%rsp), %ymm3 +vmovdqa 7584(%rsp), %ymm9 +vpaddw 7232(%rsp), %ymm3, %ymm3 +vpaddw 7936(%rsp), %ymm9, %ymm9 +vmovdqa 6912(%rsp), %ymm4 +vmovdqa 7616(%rsp), %ymm10 +vpaddw 7264(%rsp), %ymm4, %ymm4 +vpaddw 7968(%rsp), %ymm10, %ymm10 +vpmullw %ymm0, %ymm6, %ymm12 +vmovdqa %ymm12, 6272(%rsp) +vpmullw %ymm0, %ymm7, %ymm13 +vpmullw %ymm1, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6304(%rsp) +vpmullw %ymm0, %ymm8, %ymm12 +vpmullw %ymm1, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6336(%rsp) +vpmullw %ymm0, %ymm9, %ymm13 +vpmullw %ymm1, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm2, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm3, %ymm6, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6368(%rsp) +vpmullw %ymm0, %ymm10, %ymm12 +vpmullw %ymm1, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm2, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm3, %ymm7, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm6, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6400(%rsp) +vpmullw %ymm1, %ymm10, %ymm13 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw 
%ymm3, %ymm8, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6432(%rsp) +vpmullw %ymm2, %ymm10, %ymm12 +vpmullw %ymm3, %ymm9, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vpmullw %ymm4, %ymm8, %ymm15 +vpaddw %ymm12, %ymm15, %ymm12 +vmovdqa %ymm12, 6464(%rsp) +vpmullw %ymm3, %ymm10, %ymm13 +vpmullw %ymm4, %ymm9, %ymm15 +vpaddw %ymm13, %ymm15, %ymm13 +vmovdqa %ymm13, 6496(%rsp) +vpmullw %ymm4, %ymm10, %ymm12 +vmovdqa %ymm12, 6528(%rsp) +vpaddw 6592(%rsp), %ymm0, %ymm0 +vpaddw 7296(%rsp), %ymm6, %ymm6 +vpaddw 6944(%rsp), %ymm0, %ymm0 +vpaddw 7648(%rsp), %ymm6, %ymm6 +vpaddw 6624(%rsp), %ymm1, %ymm1 +vpaddw 7328(%rsp), %ymm7, %ymm7 +vpaddw 6976(%rsp), %ymm1, %ymm1 +vpaddw 7680(%rsp), %ymm7, %ymm7 +vpaddw 6656(%rsp), %ymm2, %ymm2 +vpaddw 7360(%rsp), %ymm8, %ymm8 +vpaddw 7008(%rsp), %ymm2, %ymm2 +vpaddw 7712(%rsp), %ymm8, %ymm8 +vpaddw 6688(%rsp), %ymm3, %ymm3 +vpaddw 7392(%rsp), %ymm9, %ymm9 +vpaddw 7040(%rsp), %ymm3, %ymm3 +vpaddw 7744(%rsp), %ymm9, %ymm9 +vpaddw 6720(%rsp), %ymm4, %ymm4 +vpaddw 7424(%rsp), %ymm10, %ymm10 +vpaddw 7072(%rsp), %ymm4, %ymm4 +vpaddw 7776(%rsp), %ymm10, %ymm10 +vpmullw %ymm0, %ymm11, %ymm12 +vpmullw %ymm1, %ymm10, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm2, %ymm9, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm3, %ymm8, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm4, %ymm7, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpmullw %ymm5, %ymm6, %ymm15 +vpaddw %ymm15, %ymm12, %ymm12 +vpsubw 6048(%rsp), %ymm12, %ymm12 +vpsubw 6432(%rsp), %ymm12, %ymm12 +vmovdqa %ymm12, 6240(%rsp) +vpmullw %ymm5, %ymm7, %ymm12 +vpmullw %ymm5, %ymm8, %ymm13 +vpmullw %ymm5, %ymm9, %ymm14 +vpmullw %ymm5, %ymm10, %ymm15 +vpmullw %ymm1, %ymm11, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm10, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm3, %ymm9, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm4, %ymm8, %ymm5 +vpaddw %ymm5, %ymm12, %ymm12 +vpmullw %ymm2, %ymm11, %ymm5 +vpaddw 
%ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm10, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm4, %ymm9, %ymm5 +vpaddw %ymm5, %ymm13, %ymm13 +vpmullw %ymm3, %ymm11, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm10, %ymm5 +vpaddw %ymm5, %ymm14, %ymm14 +vpmullw %ymm4, %ymm11, %ymm5 +vpaddw %ymm5, %ymm15, %ymm15 +vpmullw %ymm0, %ymm10, %ymm11 +vpmullw %ymm1, %ymm9, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm2, %ymm8, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm3, %ymm7, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm4, %ymm6, %ymm5 +vpaddw %ymm5, %ymm11, %ymm11 +vpmullw %ymm0, %ymm9, %ymm10 +vpmullw %ymm1, %ymm8, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm2, %ymm7, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm3, %ymm6, %ymm5 +vpaddw %ymm5, %ymm10, %ymm10 +vpmullw %ymm0, %ymm8, %ymm9 +vpmullw %ymm1, %ymm7, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm2, %ymm6, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vpmullw %ymm0, %ymm7, %ymm8 +vpmullw %ymm1, %ymm6, %ymm5 +vpaddw %ymm5, %ymm8, %ymm8 +vpmullw %ymm0, %ymm6, %ymm7 +vmovdqa 6080(%rsp), %ymm0 +vpsubw 6272(%rsp), %ymm0, %ymm0 +vpsubw %ymm0, %ymm12, %ymm6 +vpsubw 6464(%rsp), %ymm6, %ymm6 +vmovdqa %ymm6, 6272(%rsp) +vpaddw %ymm7, %ymm0, %ymm0 +vpsubw 5888(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 6080(%rsp) +vmovdqa 6112(%rsp), %ymm1 +vpsubw 6304(%rsp), %ymm1, %ymm1 +vpsubw %ymm1, %ymm13, %ymm7 +vpsubw 6496(%rsp), %ymm7, %ymm7 +vmovdqa %ymm7, 6304(%rsp) +vpaddw %ymm8, %ymm1, %ymm1 +vpsubw 5920(%rsp), %ymm1, %ymm1 +vmovdqa %ymm1, 6112(%rsp) +vmovdqa 6144(%rsp), %ymm2 +vpsubw 6336(%rsp), %ymm2, %ymm2 +vpsubw %ymm2, %ymm14, %ymm8 +vpsubw 6528(%rsp), %ymm8, %ymm8 +vmovdqa %ymm8, 6336(%rsp) +vpaddw %ymm9, %ymm2, %ymm2 +vpsubw 5952(%rsp), %ymm2, %ymm2 +vmovdqa %ymm2, 6144(%rsp) +vmovdqa 6176(%rsp), %ymm3 +vpsubw 6368(%rsp), %ymm3, %ymm3 +vpsubw %ymm3, %ymm15, %ymm9 +vmovdqa %ymm9, 6368(%rsp) +vpaddw %ymm10, %ymm3, %ymm3 +vpsubw 5984(%rsp), %ymm3, %ymm3 +vmovdqa %ymm3, 6176(%rsp) +vmovdqa 6208(%rsp), %ymm4 
+vpsubw 6400(%rsp), %ymm4, %ymm4 +vpaddw %ymm11, %ymm4, %ymm4 +vpsubw 6016(%rsp), %ymm4, %ymm4 +vmovdqa %ymm4, 6208(%rsp) +vmovdqa 8352(%rsp), %ymm0 +vpsubw 8704(%rsp), %ymm0, %ymm0 +vmovdqa 6240(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9056(%rsp), %ymm1, %ymm6 +vpsubw 8000(%rsp), %ymm0, %ymm0 +vpaddw 5888(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8352(%rsp) +vmovdqa 8384(%rsp), %ymm0 +vpsubw 8736(%rsp), %ymm0, %ymm0 +vmovdqa 6272(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9088(%rsp), %ymm1, %ymm7 +vpsubw 8032(%rsp), %ymm0, %ymm0 +vpaddw 5920(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8384(%rsp) +vmovdqa 8416(%rsp), %ymm0 +vpsubw 8768(%rsp), %ymm0, %ymm0 +vmovdqa 6304(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9120(%rsp), %ymm1, %ymm8 +vpsubw 8064(%rsp), %ymm0, %ymm0 +vpaddw 5952(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8416(%rsp) +vmovdqa 8448(%rsp), %ymm0 +vpsubw 8800(%rsp), %ymm0, %ymm0 +vmovdqa 6336(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9152(%rsp), %ymm1, %ymm9 +vpsubw 8096(%rsp), %ymm0, %ymm0 +vpaddw 5984(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8448(%rsp) +vmovdqa 8480(%rsp), %ymm0 +vpsubw 8832(%rsp), %ymm0, %ymm0 +vmovdqa 6368(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9184(%rsp), %ymm1, %ymm10 +vpsubw 8128(%rsp), %ymm0, %ymm0 +vpaddw 6016(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8480(%rsp) +vmovdqa 8512(%rsp), %ymm0 +vpsubw 8864(%rsp), %ymm0, %ymm0 +vmovdqa 6400(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9216(%rsp), %ymm1, %ymm11 +vpsubw 8160(%rsp), %ymm0, %ymm0 +vpaddw 6048(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8512(%rsp) +vmovdqa 8544(%rsp), %ymm0 +vpsubw 8896(%rsp), %ymm0, %ymm0 +vmovdqa 6432(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9248(%rsp), %ymm1, %ymm12 +vpsubw 8192(%rsp), %ymm0, %ymm0 +vpaddw 6080(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8544(%rsp) +vmovdqa 8576(%rsp), %ymm0 +vpsubw 8928(%rsp), %ymm0, %ymm0 +vmovdqa 6464(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9280(%rsp), %ymm1, %ymm13 +vpsubw 8224(%rsp), %ymm0, %ymm0 
+vpaddw 6112(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8576(%rsp) +vmovdqa 8608(%rsp), %ymm0 +vpsubw 8960(%rsp), %ymm0, %ymm0 +vmovdqa 6496(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9312(%rsp), %ymm1, %ymm14 +vpsubw 8256(%rsp), %ymm0, %ymm0 +vpaddw 6144(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8608(%rsp) +vmovdqa 8640(%rsp), %ymm0 +vpsubw 8992(%rsp), %ymm0, %ymm0 +vmovdqa 6528(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 9344(%rsp), %ymm1, %ymm15 +vpsubw 8288(%rsp), %ymm0, %ymm0 +vpaddw 6176(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 8640(%rsp) +vmovdqa 6208(%rsp), %ymm0 +vpsubw 8320(%rsp), %ymm0, %ymm0 +vpsubw 9024(%rsp), %ymm0, %ymm0 +vpsubw 3488(%r10), %ymm0, %ymm0 +vpsubw 4896(%r10), %ymm0, %ymm0 +vmovdqa %ymm0, 4192(%r10) +vmovdqa 3520(%r10), %ymm0 +vpsubw 4224(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm6, %ymm6 +vpsubw 4928(%r10), %ymm6, %ymm6 +vpsubw 2816(%r10), %ymm0, %ymm0 +vpaddw 8000(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3520(%r10) +vmovdqa %ymm6, 4224(%r10) +vmovdqa 3552(%r10), %ymm0 +vpsubw 4256(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm7, %ymm7 +vpsubw 4960(%r10), %ymm7, %ymm7 +vpsubw 2848(%r10), %ymm0, %ymm0 +vpaddw 8032(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3552(%r10) +vmovdqa %ymm7, 4256(%r10) +vmovdqa 3584(%r10), %ymm0 +vpsubw 4288(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm8, %ymm8 +vpsubw 4992(%r10), %ymm8, %ymm8 +vpsubw 2880(%r10), %ymm0, %ymm0 +vpaddw 8064(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3584(%r10) +vmovdqa %ymm8, 4288(%r10) +vmovdqa 3616(%r10), %ymm0 +vpsubw 4320(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm9, %ymm9 +vpsubw 5024(%r10), %ymm9, %ymm9 +vpsubw 2912(%r10), %ymm0, %ymm0 +vpaddw 8096(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3616(%r10) +vmovdqa %ymm9, 4320(%r10) +vmovdqa 3648(%r10), %ymm0 +vpsubw 4352(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm10, %ymm10 +vpsubw 5056(%r10), %ymm10, %ymm10 +vpsubw 2944(%r10), %ymm0, %ymm0 +vpaddw 8128(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3648(%r10) +vmovdqa %ymm10, 4352(%r10) +vmovdqa 3680(%r10), %ymm0 +vpsubw 4384(%r10), %ymm0, %ymm0 
+vpsubw %ymm0, %ymm11, %ymm11 +vpsubw 5088(%r10), %ymm11, %ymm11 +vpsubw 2976(%r10), %ymm0, %ymm0 +vpaddw 8160(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3680(%r10) +vmovdqa %ymm11, 4384(%r10) +vmovdqa 3712(%r10), %ymm0 +vpsubw 4416(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm12, %ymm12 +vpsubw 5120(%r10), %ymm12, %ymm12 +vpsubw 3008(%r10), %ymm0, %ymm0 +vpaddw 8192(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3712(%r10) +vmovdqa %ymm12, 4416(%r10) +vmovdqa 3744(%r10), %ymm0 +vpsubw 4448(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm13, %ymm13 +vpsubw 5152(%r10), %ymm13, %ymm13 +vpsubw 3040(%r10), %ymm0, %ymm0 +vpaddw 8224(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3744(%r10) +vmovdqa %ymm13, 4448(%r10) +vmovdqa 3776(%r10), %ymm0 +vpsubw 4480(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm14, %ymm14 +vpsubw 5184(%r10), %ymm14, %ymm14 +vpsubw 3072(%r10), %ymm0, %ymm0 +vpaddw 8256(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3776(%r10) +vmovdqa %ymm14, 4480(%r10) +vmovdqa 3808(%r10), %ymm0 +vpsubw 4512(%r10), %ymm0, %ymm0 +vpsubw %ymm0, %ymm15, %ymm15 +vpsubw 5216(%r10), %ymm15, %ymm15 +vpsubw 3104(%r10), %ymm0, %ymm0 +vpaddw 8288(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3808(%r10) +vmovdqa %ymm15, 4512(%r10) +vmovdqa 3840(%r10), %ymm0 +vpsubw 4544(%r10), %ymm0, %ymm0 +vmovdqa 9024(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5248(%r10), %ymm1, %ymm1 +vpsubw 3136(%r10), %ymm0, %ymm0 +vpaddw 8320(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3840(%r10) +vmovdqa %ymm1, 4544(%r10) +vmovdqa 3872(%r10), %ymm0 +vpsubw 4576(%r10), %ymm0, %ymm0 +vmovdqa 9056(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5280(%r10), %ymm1, %ymm1 +vpsubw 3168(%r10), %ymm0, %ymm0 +vpaddw 8352(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3872(%r10) +vmovdqa %ymm1, 4576(%r10) +vmovdqa 3904(%r10), %ymm0 +vpsubw 4608(%r10), %ymm0, %ymm0 +vmovdqa 9088(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5312(%r10), %ymm1, %ymm1 +vpsubw 3200(%r10), %ymm0, %ymm0 +vpaddw 8384(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3904(%r10) +vmovdqa %ymm1, 4608(%r10) +vmovdqa 3936(%r10), 
%ymm0 +vpsubw 4640(%r10), %ymm0, %ymm0 +vmovdqa 9120(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5344(%r10), %ymm1, %ymm1 +vpsubw 3232(%r10), %ymm0, %ymm0 +vpaddw 8416(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3936(%r10) +vmovdqa %ymm1, 4640(%r10) +vmovdqa 3968(%r10), %ymm0 +vpsubw 4672(%r10), %ymm0, %ymm0 +vmovdqa 9152(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5376(%r10), %ymm1, %ymm1 +vpsubw 3264(%r10), %ymm0, %ymm0 +vpaddw 8448(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 3968(%r10) +vmovdqa %ymm1, 4672(%r10) +vmovdqa 4000(%r10), %ymm0 +vpsubw 4704(%r10), %ymm0, %ymm0 +vmovdqa 9184(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5408(%r10), %ymm1, %ymm1 +vpsubw 3296(%r10), %ymm0, %ymm0 +vpaddw 8480(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 4000(%r10) +vmovdqa %ymm1, 4704(%r10) +vmovdqa 4032(%r10), %ymm0 +vpsubw 4736(%r10), %ymm0, %ymm0 +vmovdqa 9216(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5440(%r10), %ymm1, %ymm1 +vpsubw 3328(%r10), %ymm0, %ymm0 +vpaddw 8512(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 4032(%r10) +vmovdqa %ymm1, 4736(%r10) +vmovdqa 4064(%r10), %ymm0 +vpsubw 4768(%r10), %ymm0, %ymm0 +vmovdqa 9248(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5472(%r10), %ymm1, %ymm1 +vpsubw 3360(%r10), %ymm0, %ymm0 +vpaddw 8544(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 4064(%r10) +vmovdqa %ymm1, 4768(%r10) +vmovdqa 4096(%r10), %ymm0 +vpsubw 4800(%r10), %ymm0, %ymm0 +vmovdqa 9280(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5504(%r10), %ymm1, %ymm1 +vpsubw 3392(%r10), %ymm0, %ymm0 +vpaddw 8576(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 4096(%r10) +vmovdqa %ymm1, 4800(%r10) +vmovdqa 4128(%r10), %ymm0 +vpsubw 4832(%r10), %ymm0, %ymm0 +vmovdqa 9312(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5536(%r10), %ymm1, %ymm1 +vpsubw 3424(%r10), %ymm0, %ymm0 +vpaddw 8608(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 4128(%r10) +vmovdqa %ymm1, 4832(%r10) +vmovdqa 4160(%r10), %ymm0 +vpsubw 4864(%r10), %ymm0, %ymm0 +vmovdqa 9344(%rsp), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5568(%r10), %ymm1, 
%ymm1 +vpsubw 3456(%r10), %ymm0, %ymm0 +vpaddw 8640(%rsp), %ymm0, %ymm0 +vmovdqa %ymm0, 4160(%r10) +vmovdqa %ymm1, 4864(%r10) +vpxor %ymm1, %ymm1, %ymm1 +vmovdqa %ymm1, 5600(%r10) +subq $32, %rsp +vmovdqa 2816(%r10), %ymm0 +vmovdqa 2880(%r10), %ymm1 +vmovdqa 2944(%r10), %ymm2 +vmovdqa 3008(%r10), %ymm3 +vpunpcklwd 2848(%r10), %ymm0, %ymm4 +vpunpckhwd 2848(%r10), %ymm0, %ymm5 +vpunpcklwd 2912(%r10), %ymm1, %ymm6 +vpunpckhwd 2912(%r10), %ymm1, %ymm7 +vpunpcklwd 2976(%r10), %ymm2, %ymm8 +vpunpckhwd 2976(%r10), %ymm2, %ymm9 +vpunpcklwd 3040(%r10), %ymm3, %ymm10 +vpunpckhwd 3040(%r10), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 3072(%r10), %ymm0 +vmovdqa 3136(%r10), %ymm1 +vmovdqa 3200(%r10), %ymm2 +vmovdqa 3264(%r10), %ymm3 +vpunpcklwd 3104(%r10), %ymm0, %ymm12 +vpunpckhwd 3104(%r10), %ymm0, %ymm13 +vpunpcklwd 3168(%r10), %ymm1, %ymm14 +vpunpckhwd 3168(%r10), %ymm1, %ymm15 +vpunpcklwd 3232(%r10), %ymm2, %ymm0 +vpunpckhwd 3232(%r10), %ymm2, %ymm1 +vpunpcklwd 3296(%r10), %ymm3, %ymm2 +vpunpckhwd 3296(%r10), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq 
%ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 0(%r12) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 192(%r12) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 384(%r12) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 576(%r12) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 768(%r12) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 960(%r12) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 1152(%r12) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 1536(%r12) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 1728(%r12) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 1920(%r12) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 2112(%r12) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 2304(%r12) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 2496(%r12) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 2688(%r12) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 1344(%r12) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 2880(%r12) +vmovdqa 3328(%r10), %ymm0 +vmovdqa 3392(%r10), %ymm1 +vmovdqa 3456(%r10), %ymm2 +vmovdqa 3520(%r10), %ymm3 +vpunpcklwd 3360(%r10), %ymm0, %ymm4 +vpunpckhwd 3360(%r10), %ymm0, %ymm5 +vpunpcklwd 3424(%r10), %ymm1, %ymm6 +vpunpckhwd 3424(%r10), %ymm1, %ymm7 +vpunpcklwd 3488(%r10), %ymm2, %ymm8 +vpunpckhwd 3488(%r10), %ymm2, %ymm9 +vpunpcklwd 3552(%r10), %ymm3, %ymm10 +vpunpckhwd 3552(%r10), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq 
%ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 3584(%r10), %ymm0 +vmovdqa 3648(%r10), %ymm1 +vmovdqa 3712(%r10), %ymm2 +vmovdqa 3776(%r10), %ymm3 +vpunpcklwd 3616(%r10), %ymm0, %ymm12 +vpunpckhwd 3616(%r10), %ymm0, %ymm13 +vpunpcklwd 3680(%r10), %ymm1, %ymm14 +vpunpckhwd 3680(%r10), %ymm1, %ymm15 +vpunpcklwd 3744(%r10), %ymm2, %ymm0 +vpunpckhwd 3744(%r10), %ymm2, %ymm1 +vpunpcklwd 3808(%r10), %ymm3, %ymm2 +vpunpckhwd 3808(%r10), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 32(%r12) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 224(%r12) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 416(%r12) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 608(%r12) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 800(%r12) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 992(%r12) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 1184(%r12) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 
+vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 1568(%r12) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 1760(%r12) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 1952(%r12) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 2144(%r12) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 2336(%r12) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 2528(%r12) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 2720(%r12) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 1376(%r12) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 2912(%r12) +vmovdqa 3840(%r10), %ymm0 +vmovdqa 3904(%r10), %ymm1 +vmovdqa 3968(%r10), %ymm2 +vmovdqa 4032(%r10), %ymm3 +vpunpcklwd 3872(%r10), %ymm0, %ymm4 +vpunpckhwd 3872(%r10), %ymm0, %ymm5 +vpunpcklwd 3936(%r10), %ymm1, %ymm6 +vpunpckhwd 3936(%r10), %ymm1, %ymm7 +vpunpcklwd 4000(%r10), %ymm2, %ymm8 +vpunpckhwd 4000(%r10), %ymm2, %ymm9 +vpunpcklwd 4064(%r10), %ymm3, %ymm10 +vpunpckhwd 4064(%r10), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 4096(%r10), %ymm0 +vmovdqa 4160(%r10), %ymm1 +vmovdqa 4224(%r10), %ymm2 +vmovdqa 4288(%r10), %ymm3 +vpunpcklwd 4128(%r10), %ymm0, %ymm12 +vpunpckhwd 4128(%r10), %ymm0, %ymm13 +vpunpcklwd 4192(%r10), %ymm1, %ymm14 +vpunpckhwd 4192(%r10), %ymm1, %ymm15 +vpunpcklwd 4256(%r10), %ymm2, %ymm0 +vpunpckhwd 4256(%r10), %ymm2, %ymm1 +vpunpcklwd 4320(%r10), %ymm3, %ymm2 +vpunpckhwd 4320(%r10), %ymm3, 
%ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 64(%r12) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 256(%r12) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 448(%r12) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 640(%r12) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 832(%r12) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 1024(%r12) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 1216(%r12) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 1600(%r12) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 1792(%r12) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 1984(%r12) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 2176(%r12) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 2368(%r12) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 2560(%r12) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 2752(%r12) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 1408(%r12) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 2944(%r12) +vmovdqa 4224(%r10), %ymm0 +vmovdqa 4288(%r10), %ymm1 +vmovdqa 4352(%r10), %ymm2 +vmovdqa 4416(%r10), %ymm3 +vpunpcklwd 
4256(%r10), %ymm0, %ymm4 +vpunpckhwd 4256(%r10), %ymm0, %ymm5 +vpunpcklwd 4320(%r10), %ymm1, %ymm6 +vpunpckhwd 4320(%r10), %ymm1, %ymm7 +vpunpcklwd 4384(%r10), %ymm2, %ymm8 +vpunpckhwd 4384(%r10), %ymm2, %ymm9 +vpunpcklwd 4448(%r10), %ymm3, %ymm10 +vpunpckhwd 4448(%r10), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 4480(%r10), %ymm0 +vmovdqa 4544(%r10), %ymm1 +vmovdqa 4608(%r10), %ymm2 +vmovdqa 4672(%r10), %ymm3 +vpunpcklwd 4512(%r10), %ymm0, %ymm12 +vpunpckhwd 4512(%r10), %ymm0, %ymm13 +vpunpcklwd 4576(%r10), %ymm1, %ymm14 +vpunpckhwd 4576(%r10), %ymm1, %ymm15 +vpunpcklwd 4640(%r10), %ymm2, %ymm0 +vpunpckhwd 4640(%r10), %ymm2, %ymm1 +vpunpcklwd 4704(%r10), %ymm3, %ymm2 +vpunpckhwd 4704(%r10), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 96(%r12) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 288(%r12) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 480(%r12) 
+vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 672(%r12) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 864(%r12) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 1056(%r12) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 1248(%r12) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 1632(%r12) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 1824(%r12) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 2016(%r12) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 2208(%r12) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 2400(%r12) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 2592(%r12) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 2784(%r12) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 1440(%r12) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 2976(%r12) +vmovdqa 4736(%r10), %ymm0 +vmovdqa 4800(%r10), %ymm1 +vmovdqa 4864(%r10), %ymm2 +vmovdqa 4928(%r10), %ymm3 +vpunpcklwd 4768(%r10), %ymm0, %ymm4 +vpunpckhwd 4768(%r10), %ymm0, %ymm5 +vpunpcklwd 4832(%r10), %ymm1, %ymm6 +vpunpckhwd 4832(%r10), %ymm1, %ymm7 +vpunpcklwd 4896(%r10), %ymm2, %ymm8 +vpunpckhwd 4896(%r10), %ymm2, %ymm9 +vpunpcklwd 4960(%r10), %ymm3, %ymm10 +vpunpckhwd 4960(%r10), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, 
%ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 4992(%r10), %ymm0 +vmovdqa 5056(%r10), %ymm1 +vmovdqa 5120(%r10), %ymm2 +vmovdqa 5184(%r10), %ymm3 +vpunpcklwd 5024(%r10), %ymm0, %ymm12 +vpunpckhwd 5024(%r10), %ymm0, %ymm13 +vpunpcklwd 5088(%r10), %ymm1, %ymm14 +vpunpckhwd 5088(%r10), %ymm1, %ymm15 +vpunpcklwd 5152(%r10), %ymm2, %ymm0 +vpunpckhwd 5152(%r10), %ymm2, %ymm1 +vpunpcklwd 5216(%r10), %ymm3, %ymm2 +vpunpckhwd 5216(%r10), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, %ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 128(%r12) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 320(%r12) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 512(%r12) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 704(%r12) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 896(%r12) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 1088(%r12) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 1280(%r12) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 1664(%r12) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 1856(%r12) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 2048(%r12) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 2240(%r12) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 
2432(%r12) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 2624(%r12) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 2816(%r12) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 1472(%r12) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 3008(%r12) +vmovdqa 5248(%r10), %ymm0 +vmovdqa 5312(%r10), %ymm1 +vmovdqa 5376(%r10), %ymm2 +vmovdqa 5440(%r10), %ymm3 +vpunpcklwd 5280(%r10), %ymm0, %ymm4 +vpunpckhwd 5280(%r10), %ymm0, %ymm5 +vpunpcklwd 5344(%r10), %ymm1, %ymm6 +vpunpckhwd 5344(%r10), %ymm1, %ymm7 +vpunpcklwd 5408(%r10), %ymm2, %ymm8 +vpunpckhwd 5408(%r10), %ymm2, %ymm9 +vpunpcklwd 5472(%r10), %ymm3, %ymm10 +vpunpckhwd 5472(%r10), %ymm3, %ymm11 +vpunpckldq %ymm6, %ymm4, %ymm0 +vpunpckhdq %ymm6, %ymm4, %ymm1 +vpunpckldq %ymm7, %ymm5, %ymm2 +vpunpckhdq %ymm7, %ymm5, %ymm3 +vpunpckldq %ymm10, %ymm8, %ymm12 +vpunpckhdq %ymm10, %ymm8, %ymm13 +vpunpckldq %ymm11, %ymm9, %ymm14 +vpunpckhdq %ymm11, %ymm9, %ymm15 +vpunpcklqdq %ymm12, %ymm0, %ymm4 +vpunpckhqdq %ymm12, %ymm0, %ymm5 +vpunpcklqdq %ymm13, %ymm1, %ymm6 +vpunpckhqdq %ymm13, %ymm1, %ymm7 +vpunpcklqdq %ymm14, %ymm2, %ymm8 +vpunpckhqdq %ymm14, %ymm2, %ymm9 +vpunpcklqdq %ymm15, %ymm3, %ymm10 +vpunpckhqdq %ymm15, %ymm3, %ymm11 +vmovdqa 5504(%r10), %ymm0 +vmovdqa 5568(%r10), %ymm1 +vmovdqa 5632(%r10), %ymm2 +vmovdqa 5696(%r10), %ymm3 +vpunpcklwd 5536(%r10), %ymm0, %ymm12 +vpunpckhwd 5536(%r10), %ymm0, %ymm13 +vpunpcklwd 5600(%r10), %ymm1, %ymm14 +vpunpckhwd 5600(%r10), %ymm1, %ymm15 +vpunpcklwd 5664(%r10), %ymm2, %ymm0 +vpunpckhwd 5664(%r10), %ymm2, %ymm1 +vpunpcklwd 5728(%r10), %ymm3, %ymm2 +vpunpckhwd 5728(%r10), %ymm3, %ymm3 +vmovdqa %ymm11, 0(%rsp) +vpunpckldq %ymm14, %ymm12, %ymm11 +vpunpckhdq %ymm14, %ymm12, %ymm12 +vpunpckldq %ymm15, %ymm13, %ymm14 +vpunpckhdq %ymm15, %ymm13, %ymm15 +vpunpckldq %ymm2, %ymm0, %ymm13 +vpunpckhdq %ymm2, %ymm0, %ymm0 +vpunpckldq %ymm3, %ymm1, %ymm2 +vpunpckhdq %ymm3, %ymm1, %ymm1 +vpunpcklqdq %ymm13, 
%ymm11, %ymm3 +vpunpckhqdq %ymm13, %ymm11, %ymm13 +vpunpcklqdq %ymm0, %ymm12, %ymm11 +vpunpckhqdq %ymm0, %ymm12, %ymm0 +vpunpcklqdq %ymm2, %ymm14, %ymm12 +vpunpckhqdq %ymm2, %ymm14, %ymm2 +vpunpcklqdq %ymm1, %ymm15, %ymm14 +vpunpckhqdq %ymm1, %ymm15, %ymm1 +vinserti128 $1, %xmm3, %ymm4, %ymm15 +vmovdqa %ymm15, 160(%r12) +vinserti128 $1, %xmm13, %ymm5, %ymm15 +vmovdqa %ymm15, 352(%r12) +vinserti128 $1, %xmm11, %ymm6, %ymm15 +vmovdqa %ymm15, 544(%r12) +vinserti128 $1, %xmm0, %ymm7, %ymm15 +vmovdqa %ymm15, 736(%r12) +vinserti128 $1, %xmm12, %ymm8, %ymm15 +vmovdqa %ymm15, 928(%r12) +vinserti128 $1, %xmm2, %ymm9, %ymm15 +vmovdqa %ymm15, 1120(%r12) +vinserti128 $1, %xmm14, %ymm10, %ymm15 +vmovdqa %ymm15, 1312(%r12) +vpermq $78, %ymm4, %ymm4 +vpermq $78, %ymm5, %ymm5 +vpermq $78, %ymm6, %ymm6 +vpermq $78, %ymm7, %ymm7 +vpermq $78, %ymm8, %ymm8 +vpermq $78, %ymm9, %ymm9 +vpermq $78, %ymm10, %ymm10 +vinserti128 $0, %xmm4, %ymm3, %ymm15 +vmovdqa %ymm15, 1696(%r12) +vinserti128 $0, %xmm5, %ymm13, %ymm15 +vmovdqa %ymm15, 1888(%r12) +vinserti128 $0, %xmm6, %ymm11, %ymm15 +vmovdqa %ymm15, 2080(%r12) +vinserti128 $0, %xmm7, %ymm0, %ymm15 +vmovdqa %ymm15, 2272(%r12) +vinserti128 $0, %xmm8, %ymm12, %ymm15 +vmovdqa %ymm15, 2464(%r12) +vinserti128 $0, %xmm9, %ymm2, %ymm15 +vmovdqa %ymm15, 2656(%r12) +vinserti128 $0, %xmm10, %ymm14, %ymm15 +vmovdqa %ymm15, 2848(%r12) +vmovdqa 0(%rsp), %ymm11 +vinserti128 $1, %xmm1, %ymm11, %ymm14 +vmovdqa %ymm14, 1504(%r12) +vpermq $78, %ymm11, %ymm11 +vinserti128 $0, %xmm11, %ymm1, %ymm1 +vmovdqa %ymm1, 3040(%r12) +addq $32, %rsp +add $1536, %rax +add $1536, %r11 +add $3072, %r12 +dec %ecx +jnz karatsuba_loop_4eced63f144beffcb0247f9c6f67d165 +sub $12288, %r12 +add $9408, %rsp +subq $2400, %rsp +vpxor %ymm0, %ymm0, %ymm0 +vmovdqa %ymm0, 1792(%rsp) +vmovdqa %ymm0, 1824(%rsp) +vmovdqa %ymm0, 1856(%rsp) +vmovdqa %ymm0, 1888(%rsp) +vmovdqa %ymm0, 1920(%rsp) +vmovdqa %ymm0, 1952(%rsp) +vmovdqa %ymm0, 1984(%rsp) +vmovdqa %ymm0, 2016(%rsp) +vmovdqa %ymm0, 
2048(%rsp) +vmovdqa %ymm0, 2080(%rsp) +vmovdqa %ymm0, 2112(%rsp) +vmovdqa %ymm0, 2144(%rsp) +vmovdqa %ymm0, 2176(%rsp) +vmovdqa %ymm0, 2208(%rsp) +vmovdqa %ymm0, 2240(%rsp) +vmovdqa %ymm0, 2272(%rsp) +vmovdqa %ymm0, 2304(%rsp) +vmovdqa %ymm0, 2336(%rsp) +vmovdqa %ymm0, 2368(%rsp) +vmovdqa %ymm0, 2400(%rsp) +vmovdqa %ymm0, 2432(%rsp) +vmovdqa %ymm0, 2464(%rsp) +vmovdqa %ymm0, 2496(%rsp) +vmovdqa %ymm0, 2528(%rsp) +vmovdqa %ymm0, 2560(%rsp) +vmovdqa %ymm0, 2592(%rsp) +vmovdqa %ymm0, 2624(%rsp) +vmovdqa %ymm0, 2656(%rsp) +vmovdqa %ymm0, 2688(%rsp) +vmovdqa %ymm0, 2720(%rsp) +vmovdqa %ymm0, 2752(%rsp) +vmovdqa %ymm0, 2784(%rsp) +vmovdqa const729(%rip), %ymm15 +vmovdqa const3_inv(%rip), %ymm14 +vmovdqa const5_inv(%rip), %ymm13 +vmovdqa const9(%rip), %ymm12 +vmovdqa 96(%r12), %ymm0 +vpsubw 192(%r12), %ymm0, %ymm0 +vmovdqa 480(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 288(%r12), %ymm1, %ymm1 +vpsubw 0(%r12), %ymm0, %ymm0 +vpaddw 384(%r12), %ymm0, %ymm0 +vmovdqa 672(%r12), %ymm2 +vpsubw 768(%r12), %ymm2, %ymm2 +vmovdqa 1056(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 864(%r12), %ymm3, %ymm3 +vpsubw 576(%r12), %ymm2, %ymm2 +vpaddw 960(%r12), %ymm2, %ymm2 +vmovdqa 1248(%r12), %ymm4 +vpsubw 1344(%r12), %ymm4, %ymm4 +vmovdqa 1632(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 1440(%r12), %ymm5, %ymm5 +vpsubw 1152(%r12), %ymm4, %ymm4 +vpaddw 1536(%r12), %ymm4, %ymm4 +vpsubw 576(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 0(%r12), %ymm1, %ymm1 +vpaddw 1152(%r12), %ymm1, %ymm1 +vmovdqa 288(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 1440(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 864(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 0(%r12), %ymm8 +vmovdqa 864(%r12), %ymm9 +vmovdqa %ymm8, 0(%rsp) +vmovdqa %ymm0, 32(%rsp) +vmovdqa %ymm1, 64(%rsp) +vmovdqa %ymm7, 96(%rsp) +vmovdqa %ymm5, 128(%rsp) +vmovdqa %ymm2, 160(%rsp) +vmovdqa %ymm3, 192(%rsp) +vmovdqa %ymm9, 224(%rsp) +vmovdqa 
1824(%r12), %ymm0 +vpsubw 1920(%r12), %ymm0, %ymm0 +vmovdqa 2208(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 2016(%r12), %ymm1, %ymm1 +vpsubw 1728(%r12), %ymm0, %ymm0 +vpaddw 2112(%r12), %ymm0, %ymm0 +vmovdqa 2400(%r12), %ymm2 +vpsubw 2496(%r12), %ymm2, %ymm2 +vmovdqa 2784(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 2592(%r12), %ymm3, %ymm3 +vpsubw 2304(%r12), %ymm2, %ymm2 +vpaddw 2688(%r12), %ymm2, %ymm2 +vmovdqa 2976(%r12), %ymm4 +vpsubw 3072(%r12), %ymm4, %ymm4 +vmovdqa 3360(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 3168(%r12), %ymm5, %ymm5 +vpsubw 2880(%r12), %ymm4, %ymm4 +vpaddw 3264(%r12), %ymm4, %ymm4 +vpsubw 2304(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 1728(%r12), %ymm1, %ymm1 +vpaddw 2880(%r12), %ymm1, %ymm1 +vmovdqa 2016(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 3168(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 2592(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 1728(%r12), %ymm8 +vmovdqa 2592(%r12), %ymm9 +vmovdqa %ymm8, 256(%rsp) +vmovdqa %ymm0, 288(%rsp) +vmovdqa %ymm1, 320(%rsp) +vmovdqa %ymm7, 352(%rsp) +vmovdqa %ymm5, 384(%rsp) +vmovdqa %ymm2, 416(%rsp) +vmovdqa %ymm3, 448(%rsp) +vmovdqa %ymm9, 480(%rsp) +vmovdqa 3552(%r12), %ymm0 +vpsubw 3648(%r12), %ymm0, %ymm0 +vmovdqa 3936(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 3744(%r12), %ymm1, %ymm1 +vpsubw 3456(%r12), %ymm0, %ymm0 +vpaddw 3840(%r12), %ymm0, %ymm0 +vmovdqa 4128(%r12), %ymm2 +vpsubw 4224(%r12), %ymm2, %ymm2 +vmovdqa 4512(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 4320(%r12), %ymm3, %ymm3 +vpsubw 4032(%r12), %ymm2, %ymm2 +vpaddw 4416(%r12), %ymm2, %ymm2 +vmovdqa 4704(%r12), %ymm4 +vpsubw 4800(%r12), %ymm4, %ymm4 +vmovdqa 5088(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 4896(%r12), %ymm5, %ymm5 +vpsubw 4608(%r12), %ymm4, %ymm4 +vpaddw 4992(%r12), %ymm4, %ymm4 +vpsubw 4032(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 3456(%r12), 
%ymm1, %ymm1 +vpaddw 4608(%r12), %ymm1, %ymm1 +vmovdqa 3744(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 4896(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 4320(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 3456(%r12), %ymm8 +vmovdqa 4320(%r12), %ymm9 +vmovdqa %ymm8, 512(%rsp) +vmovdqa %ymm0, 544(%rsp) +vmovdqa %ymm1, 576(%rsp) +vmovdqa %ymm7, 608(%rsp) +vmovdqa %ymm5, 640(%rsp) +vmovdqa %ymm2, 672(%rsp) +vmovdqa %ymm3, 704(%rsp) +vmovdqa %ymm9, 736(%rsp) +vmovdqa 5280(%r12), %ymm0 +vpsubw 5376(%r12), %ymm0, %ymm0 +vmovdqa 5664(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5472(%r12), %ymm1, %ymm1 +vpsubw 5184(%r12), %ymm0, %ymm0 +vpaddw 5568(%r12), %ymm0, %ymm0 +vmovdqa 5856(%r12), %ymm2 +vpsubw 5952(%r12), %ymm2, %ymm2 +vmovdqa 6240(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 6048(%r12), %ymm3, %ymm3 +vpsubw 5760(%r12), %ymm2, %ymm2 +vpaddw 6144(%r12), %ymm2, %ymm2 +vmovdqa 6432(%r12), %ymm4 +vpsubw 6528(%r12), %ymm4, %ymm4 +vmovdqa 6816(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 6624(%r12), %ymm5, %ymm5 +vpsubw 6336(%r12), %ymm4, %ymm4 +vpaddw 6720(%r12), %ymm4, %ymm4 +vpsubw 5760(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 5184(%r12), %ymm1, %ymm1 +vpaddw 6336(%r12), %ymm1, %ymm1 +vmovdqa 5472(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 6624(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 6048(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 5184(%r12), %ymm8 +vmovdqa 6048(%r12), %ymm9 +vmovdqa %ymm8, 768(%rsp) +vmovdqa %ymm0, 800(%rsp) +vmovdqa %ymm1, 832(%rsp) +vmovdqa %ymm7, 864(%rsp) +vmovdqa %ymm5, 896(%rsp) +vmovdqa %ymm2, 928(%rsp) +vmovdqa %ymm3, 960(%rsp) +vmovdqa %ymm9, 992(%rsp) +vmovdqa 7008(%r12), %ymm0 +vpsubw 7104(%r12), %ymm0, %ymm0 +vmovdqa 7392(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 7200(%r12), %ymm1, %ymm1 +vpsubw 6912(%r12), %ymm0, %ymm0 +vpaddw 7296(%r12), %ymm0, %ymm0 +vmovdqa 7584(%r12), 
%ymm2 +vpsubw 7680(%r12), %ymm2, %ymm2 +vmovdqa 7968(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 7776(%r12), %ymm3, %ymm3 +vpsubw 7488(%r12), %ymm2, %ymm2 +vpaddw 7872(%r12), %ymm2, %ymm2 +vmovdqa 8160(%r12), %ymm4 +vpsubw 8256(%r12), %ymm4, %ymm4 +vmovdqa 8544(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 8352(%r12), %ymm5, %ymm5 +vpsubw 8064(%r12), %ymm4, %ymm4 +vpaddw 8448(%r12), %ymm4, %ymm4 +vpsubw 7488(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 6912(%r12), %ymm1, %ymm1 +vpaddw 8064(%r12), %ymm1, %ymm1 +vmovdqa 7200(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 8352(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 7776(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 6912(%r12), %ymm8 +vmovdqa 7776(%r12), %ymm9 +vmovdqa %ymm8, 1024(%rsp) +vmovdqa %ymm0, 1056(%rsp) +vmovdqa %ymm1, 1088(%rsp) +vmovdqa %ymm7, 1120(%rsp) +vmovdqa %ymm5, 1152(%rsp) +vmovdqa %ymm2, 1184(%rsp) +vmovdqa %ymm3, 1216(%rsp) +vmovdqa %ymm9, 1248(%rsp) +vmovdqa 8736(%r12), %ymm0 +vpsubw 8832(%r12), %ymm0, %ymm0 +vmovdqa 9120(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 8928(%r12), %ymm1, %ymm1 +vpsubw 8640(%r12), %ymm0, %ymm0 +vpaddw 9024(%r12), %ymm0, %ymm0 +vmovdqa 9312(%r12), %ymm2 +vpsubw 9408(%r12), %ymm2, %ymm2 +vmovdqa 9696(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 9504(%r12), %ymm3, %ymm3 +vpsubw 9216(%r12), %ymm2, %ymm2 +vpaddw 9600(%r12), %ymm2, %ymm2 +vmovdqa 9888(%r12), %ymm4 +vpsubw 9984(%r12), %ymm4, %ymm4 +vmovdqa 10272(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 10080(%r12), %ymm5, %ymm5 +vpsubw 9792(%r12), %ymm4, %ymm4 +vpaddw 10176(%r12), %ymm4, %ymm4 +vpsubw 9216(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 8640(%r12), %ymm1, %ymm1 +vpaddw 9792(%r12), %ymm1, %ymm1 +vmovdqa 8928(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 10080(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 9504(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, 
%ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 8640(%r12), %ymm8 +vmovdqa 9504(%r12), %ymm9 +vmovdqa %ymm8, 1280(%rsp) +vmovdqa %ymm0, 1312(%rsp) +vmovdqa %ymm1, 1344(%rsp) +vmovdqa %ymm7, 1376(%rsp) +vmovdqa %ymm5, 1408(%rsp) +vmovdqa %ymm2, 1440(%rsp) +vmovdqa %ymm3, 1472(%rsp) +vmovdqa %ymm9, 1504(%rsp) +vmovdqa 10464(%r12), %ymm0 +vpsubw 10560(%r12), %ymm0, %ymm0 +vmovdqa 10848(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 10656(%r12), %ymm1, %ymm1 +vpsubw 10368(%r12), %ymm0, %ymm0 +vpaddw 10752(%r12), %ymm0, %ymm0 +vmovdqa 11040(%r12), %ymm2 +vpsubw 11136(%r12), %ymm2, %ymm2 +vmovdqa 11424(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 11232(%r12), %ymm3, %ymm3 +vpsubw 10944(%r12), %ymm2, %ymm2 +vpaddw 11328(%r12), %ymm2, %ymm2 +vmovdqa 11616(%r12), %ymm4 +vpsubw 11712(%r12), %ymm4, %ymm4 +vmovdqa 12000(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 11808(%r12), %ymm5, %ymm5 +vpsubw 11520(%r12), %ymm4, %ymm4 +vpaddw 11904(%r12), %ymm4, %ymm4 +vpsubw 10944(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 10368(%r12), %ymm1, %ymm1 +vpaddw 11520(%r12), %ymm1, %ymm1 +vmovdqa 10656(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 11808(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 11232(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 10368(%r12), %ymm8 +vmovdqa 11232(%r12), %ymm9 +vmovdqa %ymm8, 1536(%rsp) +vmovdqa %ymm0, 1568(%rsp) +vmovdqa %ymm1, 1600(%rsp) +vmovdqa %ymm7, 1632(%rsp) +vmovdqa %ymm5, 1664(%rsp) +vmovdqa %ymm2, 1696(%rsp) +vmovdqa %ymm3, 1728(%rsp) +vmovdqa %ymm9, 1760(%rsp) +vmovdqa 0(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm10 +vpunpckhwd const0(%rip), %ymm11, %ymm9 +vpslld $1, %ymm10, %ymm10 +vpslld $1, %ymm9, %ymm9 +vmovdqa 256(%rsp), %ymm8 +vpunpcklwd const0(%rip), %ymm8, %ymm7 +vpunpckhwd const0(%rip), %ymm8, %ymm8 +vmovdqa 512(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm7, %ymm4 +vpaddd 
%ymm6, %ymm8, %ymm3 +vpsubd %ymm10, %ymm4, %ymm4 +vpsubd %ymm9, %ymm3, %ymm3 +vpsubd %ymm5, %ymm7, %ymm5 +vpsubd %ymm6, %ymm8, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1536(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm8 +vpunpckhwd const0(%rip), %ymm5, %ymm7 +vpslld $1, %ymm8, %ymm8 +vpslld $1, %ymm7, %ymm7 +vpsubd %ymm8, %ymm4, %ymm4 +vpsubd %ymm7, %ymm3, %ymm3 +vpsrld $1, %ymm4, %ymm4 +vpsrld $1, %ymm3, %ymm3 +vpand mask32_to_16(%rip), %ymm4, %ymm4 +vpand mask32_to_16(%rip), %ymm3, %ymm3 +vpackusdw %ymm3, %ymm4, %ymm3 +vmovdqa 768(%rsp), %ymm4 +vpaddw 1024(%rsp), %ymm4, %ymm7 +vpsubw 1024(%rsp), %ymm4, %ymm4 +vpsrlw $2, %ymm4, %ymm4 +vpsubw %ymm6, %ymm4, %ymm4 +vpmullw %ymm14, %ymm4, %ymm4 +vpsllw $1, %ymm11, %ymm8 +vpsubw %ymm8, %ymm7, %ymm8 +vpsllw $7, %ymm5, %ymm7 +vpsubw %ymm7, %ymm8, %ymm7 +vpsrlw $3, %ymm7, %ymm7 +vpsubw %ymm3, %ymm7, %ymm7 +vmovdqa 1280(%rsp), %ymm8 +vpsubw %ymm11, %ymm8, %ymm8 +vpmullw %ymm15, %ymm5, %ymm9 +vpsubw %ymm9, %ymm8, %ymm9 +vpmullw %ymm14, %ymm7, %ymm7 +vpsubw %ymm7, %ymm3, %ymm3 +vpmullw %ymm12, %ymm7, %ymm8 +vpaddw %ymm8, %ymm3, %ymm8 +vpmullw %ymm12, %ymm8, %ymm8 +vpsubw %ymm8, %ymm9, %ymm8 +vpmullw %ymm14, %ymm8, %ymm8 +vpsubw %ymm6, %ymm8, %ymm8 +vpsrlw $3, %ymm8, %ymm8 +vpsubw %ymm4, %ymm8, %ymm8 +vpsubw %ymm8, %ymm4, %ymm4 +vpsubw %ymm4, %ymm6, %ymm6 +vpmullw %ymm13, %ymm8, %ymm8 +vpsubw %ymm8, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm7, %ymm7 +vpand mask3_5_3_5(%rip), %ymm7, %ymm9 +vpand mask5_3_5_3(%rip), %ymm7, %ymm7 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm10 +vpor %ymm10, %ymm7, %ymm7 +vpaddw %ymm7, %ymm11, %ymm11 +vmovdqa %xmm9, 2048(%rsp) +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_3_5(%rip), %ymm8, %ymm9 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm10 +vpor %ymm10, %ymm8, %ymm8 
+vpaddw %ymm8, %ymm6, %ymm6 +vmovdqa %xmm9, 2304(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_3_5(%rip), %ymm5, %ymm9 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm10 +vpor %ymm10, %ymm5, %ymm5 +vpaddw %ymm5, %ymm3, %ymm3 +vmovdqa %xmm9, 2560(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 0(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 352(%rdi) +vpand mask_mod8192(%rip), %ymm3, %ymm3 +vmovdqu %ymm3, 704(%rdi) +vpand mask_mod8192(%rip), %ymm4, %ymm4 +vmovdqu %ymm4, 1056(%rdi) +vmovdqa 32(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm8 +vpunpckhwd const0(%rip), %ymm5, %ymm7 +vpslld $1, %ymm8, %ymm8 +vpslld $1, %ymm7, %ymm7 +vmovdqa 288(%rsp), %ymm4 +vpunpcklwd const0(%rip), %ymm4, %ymm3 +vpunpckhwd const0(%rip), %ymm4, %ymm4 +vmovdqa 544(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm3, %ymm9 +vpaddd %ymm6, %ymm4, %ymm10 +vpsubd %ymm8, %ymm9, %ymm9 +vpsubd %ymm7, %ymm10, %ymm10 +vpsubd %ymm11, %ymm3, %ymm11 +vpsubd %ymm6, %ymm4, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1568(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm4 +vpunpckhwd const0(%rip), %ymm11, %ymm3 +vpslld $1, %ymm4, %ymm4 +vpslld $1, %ymm3, %ymm3 +vpsubd %ymm4, %ymm9, %ymm9 +vpsubd %ymm3, %ymm10, %ymm10 +vpsrld $1, %ymm9, %ymm9 +vpsrld $1, %ymm10, %ymm10 +vpand mask32_to_16(%rip), %ymm9, %ymm9 +vpand mask32_to_16(%rip), %ymm10, %ymm10 +vpackusdw %ymm10, %ymm9, %ymm10 +vmovdqa 800(%rsp), %ymm9 +vpaddw 1056(%rsp), %ymm9, %ymm3 +vpsubw 1056(%rsp), %ymm9, %ymm9 +vpsrlw $2, %ymm9, %ymm9 +vpsubw %ymm6, %ymm9, %ymm9 +vpmullw %ymm14, %ymm9, %ymm9 +vpsllw $1, %ymm5, %ymm4 +vpsubw %ymm4, %ymm3, %ymm4 +vpsllw $7, %ymm11, %ymm3 +vpsubw %ymm3, %ymm4, %ymm3 +vpsrlw $3, %ymm3, %ymm3 +vpsubw %ymm10, 
%ymm3, %ymm3 +vmovdqa 1312(%rsp), %ymm4 +vpsubw %ymm5, %ymm4, %ymm4 +vpmullw %ymm15, %ymm11, %ymm7 +vpsubw %ymm7, %ymm4, %ymm7 +vpmullw %ymm14, %ymm3, %ymm3 +vpsubw %ymm3, %ymm10, %ymm10 +vpmullw %ymm12, %ymm3, %ymm4 +vpaddw %ymm4, %ymm10, %ymm4 +vpmullw %ymm12, %ymm4, %ymm4 +vpsubw %ymm4, %ymm7, %ymm4 +vpmullw %ymm14, %ymm4, %ymm4 +vpsubw %ymm6, %ymm4, %ymm4 +vpsrlw $3, %ymm4, %ymm4 +vpsubw %ymm9, %ymm4, %ymm4 +vpsubw %ymm4, %ymm9, %ymm9 +vpsubw %ymm9, %ymm6, %ymm6 +vpmullw %ymm13, %ymm4, %ymm4 +vpsubw %ymm4, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm3, %ymm3 +vpand mask3_5_3_5(%rip), %ymm3, %ymm7 +vpand mask5_3_5_3(%rip), %ymm3, %ymm3 +vpermq $206, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm8 +vpor %ymm8, %ymm3, %ymm3 +vpaddw %ymm3, %ymm5, %ymm5 +vmovdqa %xmm7, 2080(%rsp) +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_3_5(%rip), %ymm4, %ymm7 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $206, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm8 +vpor %ymm8, %ymm4, %ymm4 +vpaddw %ymm4, %ymm6, %ymm6 +vmovdqa %xmm7, 2336(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_3_5(%rip), %ymm11, %ymm7 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $206, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm8 +vpor %ymm8, %ymm11, %ymm11 +vpaddw %ymm11, %ymm10, %ymm10 +vmovdqa %xmm7, 2592(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %ymm5, 88(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 440(%rdi) +vpand mask_mod8192(%rip), %ymm10, %ymm10 +vmovdqu %ymm10, 792(%rdi) +vpand mask_mod8192(%rip), %ymm9, %ymm9 +vmovdqu %ymm9, 1144(%rdi) +vmovdqa 64(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm4 +vpunpckhwd const0(%rip), %ymm11, %ymm3 +vpslld $1, %ymm4, %ymm4 +vpslld $1, %ymm3, %ymm3 +vmovdqa 320(%rsp), %ymm9 +vpunpcklwd const0(%rip), %ymm9, %ymm10 +vpunpckhwd const0(%rip), %ymm9, %ymm9 +vmovdqa 576(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm10, %ymm7 
+vpaddd %ymm6, %ymm9, %ymm8 +vpsubd %ymm4, %ymm7, %ymm7 +vpsubd %ymm3, %ymm8, %ymm8 +vpsubd %ymm5, %ymm10, %ymm5 +vpsubd %ymm6, %ymm9, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1600(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm9 +vpunpckhwd const0(%rip), %ymm5, %ymm10 +vpslld $1, %ymm9, %ymm9 +vpslld $1, %ymm10, %ymm10 +vpsubd %ymm9, %ymm7, %ymm7 +vpsubd %ymm10, %ymm8, %ymm8 +vpsrld $1, %ymm7, %ymm7 +vpsrld $1, %ymm8, %ymm8 +vpand mask32_to_16(%rip), %ymm7, %ymm7 +vpand mask32_to_16(%rip), %ymm8, %ymm8 +vpackusdw %ymm8, %ymm7, %ymm8 +vmovdqa 832(%rsp), %ymm7 +vpaddw 1088(%rsp), %ymm7, %ymm10 +vpsubw 1088(%rsp), %ymm7, %ymm7 +vpsrlw $2, %ymm7, %ymm7 +vpsubw %ymm6, %ymm7, %ymm7 +vpmullw %ymm14, %ymm7, %ymm7 +vpsllw $1, %ymm11, %ymm9 +vpsubw %ymm9, %ymm10, %ymm9 +vpsllw $7, %ymm5, %ymm10 +vpsubw %ymm10, %ymm9, %ymm10 +vpsrlw $3, %ymm10, %ymm10 +vpsubw %ymm8, %ymm10, %ymm10 +vmovdqa 1344(%rsp), %ymm9 +vpsubw %ymm11, %ymm9, %ymm9 +vpmullw %ymm15, %ymm5, %ymm3 +vpsubw %ymm3, %ymm9, %ymm3 +vpmullw %ymm14, %ymm10, %ymm10 +vpsubw %ymm10, %ymm8, %ymm8 +vpmullw %ymm12, %ymm10, %ymm9 +vpaddw %ymm9, %ymm8, %ymm9 +vpmullw %ymm12, %ymm9, %ymm9 +vpsubw %ymm9, %ymm3, %ymm9 +vpmullw %ymm14, %ymm9, %ymm9 +vpsubw %ymm6, %ymm9, %ymm9 +vpsrlw $3, %ymm9, %ymm9 +vpsubw %ymm7, %ymm9, %ymm9 +vpsubw %ymm9, %ymm7, %ymm7 +vpsubw %ymm7, %ymm6, %ymm6 +vpmullw %ymm13, %ymm9, %ymm9 +vpsubw %ymm9, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm10, %ymm10 +vpand mask3_5_3_5(%rip), %ymm10, %ymm3 +vpand mask5_3_5_3(%rip), %ymm10, %ymm10 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm4 +vpor %ymm4, %ymm10, %ymm10 +vpaddw %ymm10, %ymm11, %ymm11 +vmovdqa %xmm3, 2112(%rsp) +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_3_5(%rip), %ymm9, %ymm3 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm4 
+vpor %ymm4, %ymm9, %ymm9 +vpaddw %ymm9, %ymm6, %ymm6 +vmovdqa %xmm3, 2368(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_3_5(%rip), %ymm5, %ymm3 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm4 +vpor %ymm4, %ymm5, %ymm5 +vpaddw %ymm5, %ymm8, %ymm8 +vmovdqa %xmm3, 2624(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 176(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 528(%rdi) +vpand mask_mod8192(%rip), %ymm8, %ymm8 +vmovdqu %ymm8, 880(%rdi) +vpand mask_mod8192(%rip), %ymm7, %ymm7 +vmovdqu %ymm7, 1232(%rdi) +vmovdqa 96(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm9 +vpunpckhwd const0(%rip), %ymm5, %ymm10 +vpslld $1, %ymm9, %ymm9 +vpslld $1, %ymm10, %ymm10 +vmovdqa 352(%rsp), %ymm7 +vpunpcklwd const0(%rip), %ymm7, %ymm8 +vpunpckhwd const0(%rip), %ymm7, %ymm7 +vmovdqa 608(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm8, %ymm3 +vpaddd %ymm6, %ymm7, %ymm4 +vpsubd %ymm9, %ymm3, %ymm3 +vpsubd %ymm10, %ymm4, %ymm4 +vpsubd %ymm11, %ymm8, %ymm11 +vpsubd %ymm6, %ymm7, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1632(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm7 +vpunpckhwd const0(%rip), %ymm11, %ymm8 +vpslld $1, %ymm7, %ymm7 +vpslld $1, %ymm8, %ymm8 +vpsubd %ymm7, %ymm3, %ymm3 +vpsubd %ymm8, %ymm4, %ymm4 +vpsrld $1, %ymm3, %ymm3 +vpsrld $1, %ymm4, %ymm4 +vpand mask32_to_16(%rip), %ymm3, %ymm3 +vpand mask32_to_16(%rip), %ymm4, %ymm4 +vpackusdw %ymm4, %ymm3, %ymm4 +vmovdqa 864(%rsp), %ymm3 +vpaddw 1120(%rsp), %ymm3, %ymm8 +vpsubw 1120(%rsp), %ymm3, %ymm3 +vpsrlw $2, %ymm3, %ymm3 +vpsubw %ymm6, %ymm3, %ymm3 +vpmullw %ymm14, %ymm3, %ymm3 +vpsllw $1, %ymm5, %ymm7 +vpsubw %ymm7, %ymm8, %ymm7 +vpsllw $7, %ymm11, %ymm8 +vpsubw %ymm8, %ymm7, %ymm8 +vpsrlw $3, %ymm8, %ymm8 
+vpsubw %ymm4, %ymm8, %ymm8 +vmovdqa 1376(%rsp), %ymm7 +vpsubw %ymm5, %ymm7, %ymm7 +vpmullw %ymm15, %ymm11, %ymm10 +vpsubw %ymm10, %ymm7, %ymm10 +vpmullw %ymm14, %ymm8, %ymm8 +vpsubw %ymm8, %ymm4, %ymm4 +vpmullw %ymm12, %ymm8, %ymm7 +vpaddw %ymm7, %ymm4, %ymm7 +vpmullw %ymm12, %ymm7, %ymm7 +vpsubw %ymm7, %ymm10, %ymm7 +vpmullw %ymm14, %ymm7, %ymm7 +vpsubw %ymm6, %ymm7, %ymm7 +vpsrlw $3, %ymm7, %ymm7 +vpsubw %ymm3, %ymm7, %ymm7 +vpsubw %ymm7, %ymm3, %ymm3 +vpsubw %ymm3, %ymm6, %ymm6 +vpmullw %ymm13, %ymm7, %ymm7 +vpsubw %ymm7, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_3_5(%rip), %ymm8, %ymm10 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $206, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm9 +vpor %ymm9, %ymm8, %ymm8 +vpaddw %ymm8, %ymm5, %ymm5 +vmovdqa %xmm10, 2144(%rsp) +vpshufb shuf48_16(%rip), %ymm7, %ymm7 +vpand mask3_5_3_5(%rip), %ymm7, %ymm10 +vpand mask5_3_5_3(%rip), %ymm7, %ymm7 +vpermq $206, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm9 +vpor %ymm9, %ymm7, %ymm7 +vpaddw %ymm7, %ymm6, %ymm6 +vmovdqa %xmm10, 2400(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_3_5(%rip), %ymm11, %ymm10 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $206, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm9 +vpor %ymm9, %ymm11, %ymm11 +vpaddw %ymm11, %ymm4, %ymm4 +vmovdqa %xmm10, 2656(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %ymm5, 264(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 616(%rdi) +vpand mask_mod8192(%rip), %ymm4, %ymm4 +vmovdqu %ymm4, 968(%rdi) +vpand mask_mod8192(%rip), %ymm3, %ymm3 +vmovdqu %ymm3, 1320(%rdi) +vmovdqa 128(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm7 +vpunpckhwd const0(%rip), %ymm11, %ymm8 +vpslld $1, %ymm7, %ymm7 +vpslld $1, %ymm8, %ymm8 +vmovdqa 384(%rsp), %ymm3 +vpunpcklwd const0(%rip), %ymm3, %ymm4 +vpunpckhwd const0(%rip), %ymm3, %ymm3 +vmovdqa 640(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 
+vpaddd %ymm5, %ymm4, %ymm10 +vpaddd %ymm6, %ymm3, %ymm9 +vpsubd %ymm7, %ymm10, %ymm10 +vpsubd %ymm8, %ymm9, %ymm9 +vpsubd %ymm5, %ymm4, %ymm5 +vpsubd %ymm6, %ymm3, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1664(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm3 +vpunpckhwd const0(%rip), %ymm5, %ymm4 +vpslld $1, %ymm3, %ymm3 +vpslld $1, %ymm4, %ymm4 +vpsubd %ymm3, %ymm10, %ymm10 +vpsubd %ymm4, %ymm9, %ymm9 +vpsrld $1, %ymm10, %ymm10 +vpsrld $1, %ymm9, %ymm9 +vpand mask32_to_16(%rip), %ymm10, %ymm10 +vpand mask32_to_16(%rip), %ymm9, %ymm9 +vpackusdw %ymm9, %ymm10, %ymm9 +vmovdqa 896(%rsp), %ymm10 +vpaddw 1152(%rsp), %ymm10, %ymm4 +vpsubw 1152(%rsp), %ymm10, %ymm10 +vpsrlw $2, %ymm10, %ymm10 +vpsubw %ymm6, %ymm10, %ymm10 +vpmullw %ymm14, %ymm10, %ymm10 +vpsllw $1, %ymm11, %ymm3 +vpsubw %ymm3, %ymm4, %ymm3 +vpsllw $7, %ymm5, %ymm4 +vpsubw %ymm4, %ymm3, %ymm4 +vpsrlw $3, %ymm4, %ymm4 +vpsubw %ymm9, %ymm4, %ymm4 +vmovdqa 1408(%rsp), %ymm3 +vpsubw %ymm11, %ymm3, %ymm3 +vpmullw %ymm15, %ymm5, %ymm8 +vpsubw %ymm8, %ymm3, %ymm8 +vpmullw %ymm14, %ymm4, %ymm4 +vpsubw %ymm4, %ymm9, %ymm9 +vpmullw %ymm12, %ymm4, %ymm3 +vpaddw %ymm3, %ymm9, %ymm3 +vpmullw %ymm12, %ymm3, %ymm3 +vpsubw %ymm3, %ymm8, %ymm3 +vpmullw %ymm14, %ymm3, %ymm3 +vpsubw %ymm6, %ymm3, %ymm3 +vpsrlw $3, %ymm3, %ymm3 +vpsubw %ymm10, %ymm3, %ymm3 +vpsubw %ymm3, %ymm10, %ymm10 +vpsubw %ymm10, %ymm6, %ymm6 +vpmullw %ymm13, %ymm3, %ymm3 +vpsubw %ymm3, %ymm6, %ymm6 +vmovdqu 352(%rdi), %ymm8 +vmovdqu 704(%rdi), %ymm7 +vmovdqu 1056(%rdi), %ymm2 +vpaddw %ymm11, %ymm8, %ymm11 +vpaddw %ymm6, %ymm7, %ymm6 +vpaddw %ymm9, %ymm2, %ymm9 +vpshufb shuf48_16(%rip), %ymm10, %ymm10 +vpand mask3_5_3_5(%rip), %ymm10, %ymm2 +vpand mask5_3_5_3(%rip), %ymm10, %ymm10 +vpermq $206, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm7 +vpor %ymm7, %ymm10, %ymm10 +vmovdqu 0(%rdi), %ymm7 +vpaddw %ymm10, 
%ymm7, %ymm7 +vpand mask_mod8192(%rip), %ymm7, %ymm7 +vmovdqu %ymm7, 0(%rdi) +vmovdqa %xmm2, 1920(%rsp) +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_3_5(%rip), %ymm4, %ymm2 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $206, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm7 +vpor %ymm7, %ymm4, %ymm4 +vpaddw %ymm4, %ymm11, %ymm11 +vmovdqa %xmm2, 2176(%rsp) +vpshufb shuf48_16(%rip), %ymm3, %ymm3 +vpand mask3_5_3_5(%rip), %ymm3, %ymm2 +vpand mask5_3_5_3(%rip), %ymm3, %ymm3 +vpermq $206, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm7 +vpor %ymm7, %ymm3, %ymm3 +vpaddw %ymm3, %ymm6, %ymm6 +vmovdqa %xmm2, 2432(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_3_5(%rip), %ymm5, %ymm2 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $206, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm7 +vpor %ymm7, %ymm5, %ymm5 +vpaddw %ymm5, %ymm9, %ymm9 +vmovdqa %xmm2, 2688(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 352(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 704(%rdi) +vpand mask_mod8192(%rip), %ymm9, %ymm9 +vmovdqu %ymm9, 1056(%rdi) +vmovdqa 160(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm3 +vpunpckhwd const0(%rip), %ymm5, %ymm4 +vpslld $1, %ymm3, %ymm3 +vpslld $1, %ymm4, %ymm4 +vmovdqa 416(%rsp), %ymm10 +vpunpcklwd const0(%rip), %ymm10, %ymm9 +vpunpckhwd const0(%rip), %ymm10, %ymm10 +vmovdqa 672(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm9, %ymm2 +vpaddd %ymm6, %ymm10, %ymm7 +vpsubd %ymm3, %ymm2, %ymm2 +vpsubd %ymm4, %ymm7, %ymm7 +vpsubd %ymm11, %ymm9, %ymm11 +vpsubd %ymm6, %ymm10, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1696(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm10 +vpunpckhwd const0(%rip), %ymm11, %ymm9 +vpslld $1, %ymm10, %ymm10 +vpslld $1, %ymm9, %ymm9 +vpsubd %ymm10, %ymm2, 
%ymm2 +vpsubd %ymm9, %ymm7, %ymm7 +vpsrld $1, %ymm2, %ymm2 +vpsrld $1, %ymm7, %ymm7 +vpand mask32_to_16(%rip), %ymm2, %ymm2 +vpand mask32_to_16(%rip), %ymm7, %ymm7 +vpackusdw %ymm7, %ymm2, %ymm7 +vmovdqa 928(%rsp), %ymm2 +vpaddw 1184(%rsp), %ymm2, %ymm9 +vpsubw 1184(%rsp), %ymm2, %ymm2 +vpsrlw $2, %ymm2, %ymm2 +vpsubw %ymm6, %ymm2, %ymm2 +vpmullw %ymm14, %ymm2, %ymm2 +vpsllw $1, %ymm5, %ymm10 +vpsubw %ymm10, %ymm9, %ymm10 +vpsllw $7, %ymm11, %ymm9 +vpsubw %ymm9, %ymm10, %ymm9 +vpsrlw $3, %ymm9, %ymm9 +vpsubw %ymm7, %ymm9, %ymm9 +vmovdqa 1440(%rsp), %ymm10 +vpsubw %ymm5, %ymm10, %ymm10 +vpmullw %ymm15, %ymm11, %ymm4 +vpsubw %ymm4, %ymm10, %ymm4 +vpmullw %ymm14, %ymm9, %ymm9 +vpsubw %ymm9, %ymm7, %ymm7 +vpmullw %ymm12, %ymm9, %ymm10 +vpaddw %ymm10, %ymm7, %ymm10 +vpmullw %ymm12, %ymm10, %ymm10 +vpsubw %ymm10, %ymm4, %ymm10 +vpmullw %ymm14, %ymm10, %ymm10 +vpsubw %ymm6, %ymm10, %ymm10 +vpsrlw $3, %ymm10, %ymm10 +vpsubw %ymm2, %ymm10, %ymm10 +vpsubw %ymm10, %ymm2, %ymm2 +vpsubw %ymm2, %ymm6, %ymm6 +vpmullw %ymm13, %ymm10, %ymm10 +vpsubw %ymm10, %ymm6, %ymm6 +vmovdqu 440(%rdi), %ymm4 +vmovdqu 792(%rdi), %ymm3 +vmovdqu 1144(%rdi), %ymm8 +vpaddw %ymm5, %ymm4, %ymm5 +vpaddw %ymm6, %ymm3, %ymm6 +vpaddw %ymm7, %ymm8, %ymm7 +vpshufb shuf48_16(%rip), %ymm2, %ymm2 +vpand mask3_5_3_5(%rip), %ymm2, %ymm8 +vpand mask5_3_5_3(%rip), %ymm2, %ymm2 +vpermq $206, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm3 +vpor %ymm3, %ymm2, %ymm2 +vmovdqu 88(%rdi), %ymm3 +vpaddw %ymm2, %ymm3, %ymm3 +vpand mask_mod8192(%rip), %ymm3, %ymm3 +vmovdqu %ymm3, 88(%rdi) +vmovdqa %xmm8, 1952(%rsp) +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_3_5(%rip), %ymm9, %ymm8 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $206, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm3 +vpor %ymm3, %ymm9, %ymm9 +vpaddw %ymm9, %ymm5, %ymm5 +vmovdqa %xmm8, 2208(%rsp) +vpshufb shuf48_16(%rip), %ymm10, %ymm10 +vpand mask3_5_3_5(%rip), %ymm10, %ymm8 +vpand mask5_3_5_3(%rip), %ymm10, %ymm10 +vpermq $206, %ymm8, 
%ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm3 +vpor %ymm3, %ymm10, %ymm10 +vpaddw %ymm10, %ymm6, %ymm6 +vmovdqa %xmm8, 2464(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_3_5(%rip), %ymm11, %ymm8 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $206, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm3 +vpor %ymm3, %ymm11, %ymm11 +vpaddw %ymm11, %ymm7, %ymm7 +vmovdqa %xmm8, 2720(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %ymm5, 440(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 792(%rdi) +vpand mask_mod8192(%rip), %ymm7, %ymm7 +vmovdqu %ymm7, 1144(%rdi) +vmovdqa 192(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm10 +vpunpckhwd const0(%rip), %ymm11, %ymm9 +vpslld $1, %ymm10, %ymm10 +vpslld $1, %ymm9, %ymm9 +vmovdqa 448(%rsp), %ymm2 +vpunpcklwd const0(%rip), %ymm2, %ymm7 +vpunpckhwd const0(%rip), %ymm2, %ymm2 +vmovdqa 704(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm7, %ymm8 +vpaddd %ymm6, %ymm2, %ymm3 +vpsubd %ymm10, %ymm8, %ymm8 +vpsubd %ymm9, %ymm3, %ymm3 +vpsubd %ymm5, %ymm7, %ymm5 +vpsubd %ymm6, %ymm2, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1728(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm2 +vpunpckhwd const0(%rip), %ymm5, %ymm7 +vpslld $1, %ymm2, %ymm2 +vpslld $1, %ymm7, %ymm7 +vpsubd %ymm2, %ymm8, %ymm8 +vpsubd %ymm7, %ymm3, %ymm3 +vpsrld $1, %ymm8, %ymm8 +vpsrld $1, %ymm3, %ymm3 +vpand mask32_to_16(%rip), %ymm8, %ymm8 +vpand mask32_to_16(%rip), %ymm3, %ymm3 +vpackusdw %ymm3, %ymm8, %ymm3 +vmovdqa 960(%rsp), %ymm8 +vpaddw 1216(%rsp), %ymm8, %ymm7 +vpsubw 1216(%rsp), %ymm8, %ymm8 +vpsrlw $2, %ymm8, %ymm8 +vpsubw %ymm6, %ymm8, %ymm8 +vpmullw %ymm14, %ymm8, %ymm8 +vpsllw $1, %ymm11, %ymm2 +vpsubw %ymm2, %ymm7, %ymm2 +vpsllw $7, %ymm5, %ymm7 +vpsubw %ymm7, %ymm2, %ymm7 +vpsrlw $3, %ymm7, %ymm7 +vpsubw %ymm3, 
%ymm7, %ymm7 +vmovdqa 1472(%rsp), %ymm2 +vpsubw %ymm11, %ymm2, %ymm2 +vpmullw %ymm15, %ymm5, %ymm9 +vpsubw %ymm9, %ymm2, %ymm9 +vpmullw %ymm14, %ymm7, %ymm7 +vpsubw %ymm7, %ymm3, %ymm3 +vpmullw %ymm12, %ymm7, %ymm2 +vpaddw %ymm2, %ymm3, %ymm2 +vpmullw %ymm12, %ymm2, %ymm2 +vpsubw %ymm2, %ymm9, %ymm2 +vpmullw %ymm14, %ymm2, %ymm2 +vpsubw %ymm6, %ymm2, %ymm2 +vpsrlw $3, %ymm2, %ymm2 +vpsubw %ymm8, %ymm2, %ymm2 +vpsubw %ymm2, %ymm8, %ymm8 +vpsubw %ymm8, %ymm6, %ymm6 +vpmullw %ymm13, %ymm2, %ymm2 +vpsubw %ymm2, %ymm6, %ymm6 +vmovdqu 528(%rdi), %ymm9 +vmovdqu 880(%rdi), %ymm10 +vmovdqu 1232(%rdi), %ymm4 +vpaddw %ymm11, %ymm9, %ymm11 +vpaddw %ymm6, %ymm10, %ymm6 +vpaddw %ymm3, %ymm4, %ymm3 +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_3_5(%rip), %ymm8, %ymm4 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $206, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm10 +vpor %ymm10, %ymm8, %ymm8 +vmovdqu 176(%rdi), %ymm10 +vpaddw %ymm8, %ymm10, %ymm10 +vpand mask_mod8192(%rip), %ymm10, %ymm10 +vmovdqu %ymm10, 176(%rdi) +vmovdqa %xmm4, 1984(%rsp) +vpshufb shuf48_16(%rip), %ymm7, %ymm7 +vpand mask3_5_3_5(%rip), %ymm7, %ymm4 +vpand mask5_3_5_3(%rip), %ymm7, %ymm7 +vpermq $206, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm10 +vpor %ymm10, %ymm7, %ymm7 +vpaddw %ymm7, %ymm11, %ymm11 +vmovdqa %xmm4, 2240(%rsp) +vpshufb shuf48_16(%rip), %ymm2, %ymm2 +vpand mask3_5_3_5(%rip), %ymm2, %ymm4 +vpand mask5_3_5_3(%rip), %ymm2, %ymm2 +vpermq $206, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm10 +vpor %ymm10, %ymm2, %ymm2 +vpaddw %ymm2, %ymm6, %ymm6 +vmovdqa %xmm4, 2496(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_3_5(%rip), %ymm5, %ymm4 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $206, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm10 +vpor %ymm10, %ymm5, %ymm5 +vpaddw %ymm5, %ymm3, %ymm3 +vmovdqa %xmm4, 2752(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 528(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 880(%rdi) 
+vpand mask_mod8192(%rip), %ymm3, %ymm3 +vmovdqu %ymm3, 1232(%rdi) +vmovdqa 224(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm2 +vpunpckhwd const0(%rip), %ymm5, %ymm7 +vpslld $1, %ymm2, %ymm2 +vpslld $1, %ymm7, %ymm7 +vmovdqa 480(%rsp), %ymm8 +vpunpcklwd const0(%rip), %ymm8, %ymm3 +vpunpckhwd const0(%rip), %ymm8, %ymm8 +vmovdqa 736(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm3, %ymm4 +vpaddd %ymm6, %ymm8, %ymm10 +vpsubd %ymm2, %ymm4, %ymm4 +vpsubd %ymm7, %ymm10, %ymm10 +vpsubd %ymm11, %ymm3, %ymm11 +vpsubd %ymm6, %ymm8, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1760(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm8 +vpunpckhwd const0(%rip), %ymm11, %ymm3 +vpslld $1, %ymm8, %ymm8 +vpslld $1, %ymm3, %ymm3 +vpsubd %ymm8, %ymm4, %ymm4 +vpsubd %ymm3, %ymm10, %ymm10 +vpsrld $1, %ymm4, %ymm4 +vpsrld $1, %ymm10, %ymm10 +vpand mask32_to_16(%rip), %ymm4, %ymm4 +vpand mask32_to_16(%rip), %ymm10, %ymm10 +vpackusdw %ymm10, %ymm4, %ymm10 +vmovdqa 992(%rsp), %ymm4 +vpaddw 1248(%rsp), %ymm4, %ymm3 +vpsubw 1248(%rsp), %ymm4, %ymm4 +vpsrlw $2, %ymm4, %ymm4 +vpsubw %ymm6, %ymm4, %ymm4 +vpmullw %ymm14, %ymm4, %ymm4 +vpsllw $1, %ymm5, %ymm8 +vpsubw %ymm8, %ymm3, %ymm8 +vpsllw $7, %ymm11, %ymm3 +vpsubw %ymm3, %ymm8, %ymm3 +vpsrlw $3, %ymm3, %ymm3 +vpsubw %ymm10, %ymm3, %ymm3 +vmovdqa 1504(%rsp), %ymm8 +vpsubw %ymm5, %ymm8, %ymm8 +vpmullw %ymm15, %ymm11, %ymm7 +vpsubw %ymm7, %ymm8, %ymm7 +vpmullw %ymm14, %ymm3, %ymm3 +vpsubw %ymm3, %ymm10, %ymm10 +vpmullw %ymm12, %ymm3, %ymm8 +vpaddw %ymm8, %ymm10, %ymm8 +vpmullw %ymm12, %ymm8, %ymm8 +vpsubw %ymm8, %ymm7, %ymm8 +vpmullw %ymm14, %ymm8, %ymm8 +vpsubw %ymm6, %ymm8, %ymm8 +vpsrlw $3, %ymm8, %ymm8 +vpsubw %ymm4, %ymm8, %ymm8 +vpsubw %ymm8, %ymm4, %ymm4 +vpsubw %ymm4, %ymm6, %ymm6 +vpmullw %ymm13, %ymm8, %ymm8 +vpsubw %ymm8, %ymm6, 
%ymm6 +vmovdqu 616(%rdi), %ymm7 +vmovdqu 968(%rdi), %ymm2 +vmovdqu 1320(%rdi), %ymm9 +vpaddw %ymm5, %ymm7, %ymm5 +vpaddw %ymm6, %ymm2, %ymm6 +vpaddw %ymm10, %ymm9, %ymm10 +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_3_5(%rip), %ymm4, %ymm9 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm2 +vpor %ymm2, %ymm4, %ymm4 +vmovdqu 264(%rdi), %ymm2 +vpaddw %ymm4, %ymm2, %ymm2 +vpand mask_mod8192(%rip), %ymm2, %ymm2 +vmovdqu %ymm2, 264(%rdi) +vmovdqa %xmm9, 2016(%rsp) +vpshufb shuf48_16(%rip), %ymm3, %ymm3 +vpand mask3_5_3_5(%rip), %ymm3, %ymm9 +vpand mask5_3_5_3(%rip), %ymm3, %ymm3 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm2 +vpor %ymm2, %ymm3, %ymm3 +vpaddw %ymm3, %ymm5, %ymm5 +vmovdqa %xmm9, 2272(%rsp) +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_3_5(%rip), %ymm8, %ymm9 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm2 +vpor %ymm2, %ymm8, %ymm8 +vpaddw %ymm8, %ymm6, %ymm6 +vmovdqa %xmm9, 2528(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_3_5(%rip), %ymm11, %ymm9 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm2 +vpor %ymm2, %ymm11, %ymm11 +vpaddw %ymm11, %ymm10, %ymm10 +vmovdqa %xmm9, 2784(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %ymm5, 616(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 968(%rdi) +vpand mask_mod8192(%rip), %ymm10, %ymm10 +vmovdqu %ymm10, 1320(%rdi) +vmovdqa 128(%r12), %ymm0 +vpsubw 224(%r12), %ymm0, %ymm0 +vmovdqa 512(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 320(%r12), %ymm1, %ymm1 +vpsubw 32(%r12), %ymm0, %ymm0 +vpaddw 416(%r12), %ymm0, %ymm0 +vmovdqa 704(%r12), %ymm2 +vpsubw 800(%r12), %ymm2, %ymm2 +vmovdqa 1088(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 896(%r12), %ymm3, %ymm3 +vpsubw 608(%r12), %ymm2, %ymm2 +vpaddw 992(%r12), %ymm2, %ymm2 +vmovdqa 1280(%r12), %ymm4 +vpsubw 
1376(%r12), %ymm4, %ymm4 +vmovdqa 1664(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 1472(%r12), %ymm5, %ymm5 +vpsubw 1184(%r12), %ymm4, %ymm4 +vpaddw 1568(%r12), %ymm4, %ymm4 +vpsubw 608(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 32(%r12), %ymm1, %ymm1 +vpaddw 1184(%r12), %ymm1, %ymm1 +vmovdqa 320(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 1472(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 896(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 32(%r12), %ymm8 +vmovdqa 896(%r12), %ymm9 +vmovdqa %ymm8, 0(%rsp) +vmovdqa %ymm0, 32(%rsp) +vmovdqa %ymm1, 64(%rsp) +vmovdqa %ymm7, 96(%rsp) +vmovdqa %ymm5, 128(%rsp) +vmovdqa %ymm2, 160(%rsp) +vmovdqa %ymm3, 192(%rsp) +vmovdqa %ymm9, 224(%rsp) +vmovdqa 1856(%r12), %ymm0 +vpsubw 1952(%r12), %ymm0, %ymm0 +vmovdqa 2240(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 2048(%r12), %ymm1, %ymm1 +vpsubw 1760(%r12), %ymm0, %ymm0 +vpaddw 2144(%r12), %ymm0, %ymm0 +vmovdqa 2432(%r12), %ymm2 +vpsubw 2528(%r12), %ymm2, %ymm2 +vmovdqa 2816(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 2624(%r12), %ymm3, %ymm3 +vpsubw 2336(%r12), %ymm2, %ymm2 +vpaddw 2720(%r12), %ymm2, %ymm2 +vmovdqa 3008(%r12), %ymm4 +vpsubw 3104(%r12), %ymm4, %ymm4 +vmovdqa 3392(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 3200(%r12), %ymm5, %ymm5 +vpsubw 2912(%r12), %ymm4, %ymm4 +vpaddw 3296(%r12), %ymm4, %ymm4 +vpsubw 2336(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 1760(%r12), %ymm1, %ymm1 +vpaddw 2912(%r12), %ymm1, %ymm1 +vmovdqa 2048(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 3200(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 2624(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 1760(%r12), %ymm8 +vmovdqa 2624(%r12), %ymm9 +vmovdqa %ymm8, 256(%rsp) +vmovdqa %ymm0, 288(%rsp) +vmovdqa %ymm1, 320(%rsp) +vmovdqa %ymm7, 352(%rsp) +vmovdqa %ymm5, 384(%rsp) +vmovdqa %ymm2, 416(%rsp) +vmovdqa 
%ymm3, 448(%rsp) +vmovdqa %ymm9, 480(%rsp) +vmovdqa 3584(%r12), %ymm0 +vpsubw 3680(%r12), %ymm0, %ymm0 +vmovdqa 3968(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 3776(%r12), %ymm1, %ymm1 +vpsubw 3488(%r12), %ymm0, %ymm0 +vpaddw 3872(%r12), %ymm0, %ymm0 +vmovdqa 4160(%r12), %ymm2 +vpsubw 4256(%r12), %ymm2, %ymm2 +vmovdqa 4544(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 4352(%r12), %ymm3, %ymm3 +vpsubw 4064(%r12), %ymm2, %ymm2 +vpaddw 4448(%r12), %ymm2, %ymm2 +vmovdqa 4736(%r12), %ymm4 +vpsubw 4832(%r12), %ymm4, %ymm4 +vmovdqa 5120(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 4928(%r12), %ymm5, %ymm5 +vpsubw 4640(%r12), %ymm4, %ymm4 +vpaddw 5024(%r12), %ymm4, %ymm4 +vpsubw 4064(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 3488(%r12), %ymm1, %ymm1 +vpaddw 4640(%r12), %ymm1, %ymm1 +vmovdqa 3776(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 4928(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 4352(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 3488(%r12), %ymm8 +vmovdqa 4352(%r12), %ymm9 +vmovdqa %ymm8, 512(%rsp) +vmovdqa %ymm0, 544(%rsp) +vmovdqa %ymm1, 576(%rsp) +vmovdqa %ymm7, 608(%rsp) +vmovdqa %ymm5, 640(%rsp) +vmovdqa %ymm2, 672(%rsp) +vmovdqa %ymm3, 704(%rsp) +vmovdqa %ymm9, 736(%rsp) +vmovdqa 5312(%r12), %ymm0 +vpsubw 5408(%r12), %ymm0, %ymm0 +vmovdqa 5696(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5504(%r12), %ymm1, %ymm1 +vpsubw 5216(%r12), %ymm0, %ymm0 +vpaddw 5600(%r12), %ymm0, %ymm0 +vmovdqa 5888(%r12), %ymm2 +vpsubw 5984(%r12), %ymm2, %ymm2 +vmovdqa 6272(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 6080(%r12), %ymm3, %ymm3 +vpsubw 5792(%r12), %ymm2, %ymm2 +vpaddw 6176(%r12), %ymm2, %ymm2 +vmovdqa 6464(%r12), %ymm4 +vpsubw 6560(%r12), %ymm4, %ymm4 +vmovdqa 6848(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 6656(%r12), %ymm5, %ymm5 +vpsubw 6368(%r12), %ymm4, %ymm4 +vpaddw 6752(%r12), %ymm4, %ymm4 +vpsubw 5792(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, 
%ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 5216(%r12), %ymm1, %ymm1 +vpaddw 6368(%r12), %ymm1, %ymm1 +vmovdqa 5504(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 6656(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 6080(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 5216(%r12), %ymm8 +vmovdqa 6080(%r12), %ymm9 +vmovdqa %ymm8, 768(%rsp) +vmovdqa %ymm0, 800(%rsp) +vmovdqa %ymm1, 832(%rsp) +vmovdqa %ymm7, 864(%rsp) +vmovdqa %ymm5, 896(%rsp) +vmovdqa %ymm2, 928(%rsp) +vmovdqa %ymm3, 960(%rsp) +vmovdqa %ymm9, 992(%rsp) +vmovdqa 7040(%r12), %ymm0 +vpsubw 7136(%r12), %ymm0, %ymm0 +vmovdqa 7424(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 7232(%r12), %ymm1, %ymm1 +vpsubw 6944(%r12), %ymm0, %ymm0 +vpaddw 7328(%r12), %ymm0, %ymm0 +vmovdqa 7616(%r12), %ymm2 +vpsubw 7712(%r12), %ymm2, %ymm2 +vmovdqa 8000(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 7808(%r12), %ymm3, %ymm3 +vpsubw 7520(%r12), %ymm2, %ymm2 +vpaddw 7904(%r12), %ymm2, %ymm2 +vmovdqa 8192(%r12), %ymm4 +vpsubw 8288(%r12), %ymm4, %ymm4 +vmovdqa 8576(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 8384(%r12), %ymm5, %ymm5 +vpsubw 8096(%r12), %ymm4, %ymm4 +vpaddw 8480(%r12), %ymm4, %ymm4 +vpsubw 7520(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 6944(%r12), %ymm1, %ymm1 +vpaddw 8096(%r12), %ymm1, %ymm1 +vmovdqa 7232(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 8384(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 7808(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 6944(%r12), %ymm8 +vmovdqa 7808(%r12), %ymm9 +vmovdqa %ymm8, 1024(%rsp) +vmovdqa %ymm0, 1056(%rsp) +vmovdqa %ymm1, 1088(%rsp) +vmovdqa %ymm7, 1120(%rsp) +vmovdqa %ymm5, 1152(%rsp) +vmovdqa %ymm2, 1184(%rsp) +vmovdqa %ymm3, 1216(%rsp) +vmovdqa %ymm9, 1248(%rsp) +vmovdqa 8768(%r12), %ymm0 +vpsubw 8864(%r12), %ymm0, %ymm0 +vmovdqa 9152(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 8960(%r12), %ymm1, %ymm1 +vpsubw 8672(%r12), 
%ymm0, %ymm0 +vpaddw 9056(%r12), %ymm0, %ymm0 +vmovdqa 9344(%r12), %ymm2 +vpsubw 9440(%r12), %ymm2, %ymm2 +vmovdqa 9728(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 9536(%r12), %ymm3, %ymm3 +vpsubw 9248(%r12), %ymm2, %ymm2 +vpaddw 9632(%r12), %ymm2, %ymm2 +vmovdqa 9920(%r12), %ymm4 +vpsubw 10016(%r12), %ymm4, %ymm4 +vmovdqa 10304(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 10112(%r12), %ymm5, %ymm5 +vpsubw 9824(%r12), %ymm4, %ymm4 +vpaddw 10208(%r12), %ymm4, %ymm4 +vpsubw 9248(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 8672(%r12), %ymm1, %ymm1 +vpaddw 9824(%r12), %ymm1, %ymm1 +vmovdqa 8960(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 10112(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 9536(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 8672(%r12), %ymm8 +vmovdqa 9536(%r12), %ymm9 +vmovdqa %ymm8, 1280(%rsp) +vmovdqa %ymm0, 1312(%rsp) +vmovdqa %ymm1, 1344(%rsp) +vmovdqa %ymm7, 1376(%rsp) +vmovdqa %ymm5, 1408(%rsp) +vmovdqa %ymm2, 1440(%rsp) +vmovdqa %ymm3, 1472(%rsp) +vmovdqa %ymm9, 1504(%rsp) +vmovdqa 10496(%r12), %ymm0 +vpsubw 10592(%r12), %ymm0, %ymm0 +vmovdqa 10880(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 10688(%r12), %ymm1, %ymm1 +vpsubw 10400(%r12), %ymm0, %ymm0 +vpaddw 10784(%r12), %ymm0, %ymm0 +vmovdqa 11072(%r12), %ymm2 +vpsubw 11168(%r12), %ymm2, %ymm2 +vmovdqa 11456(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 11264(%r12), %ymm3, %ymm3 +vpsubw 10976(%r12), %ymm2, %ymm2 +vpaddw 11360(%r12), %ymm2, %ymm2 +vmovdqa 11648(%r12), %ymm4 +vpsubw 11744(%r12), %ymm4, %ymm4 +vmovdqa 12032(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 11840(%r12), %ymm5, %ymm5 +vpsubw 11552(%r12), %ymm4, %ymm4 +vpaddw 11936(%r12), %ymm4, %ymm4 +vpsubw 10976(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 10400(%r12), %ymm1, %ymm1 +vpaddw 11552(%r12), %ymm1, %ymm1 +vmovdqa 10688(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 11840(%r12), 
%ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 11264(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 10400(%r12), %ymm8 +vmovdqa 11264(%r12), %ymm9 +vmovdqa %ymm8, 1536(%rsp) +vmovdqa %ymm0, 1568(%rsp) +vmovdqa %ymm1, 1600(%rsp) +vmovdqa %ymm7, 1632(%rsp) +vmovdqa %ymm5, 1664(%rsp) +vmovdqa %ymm2, 1696(%rsp) +vmovdqa %ymm3, 1728(%rsp) +vmovdqa %ymm9, 1760(%rsp) +vmovdqa 0(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm8 +vpunpckhwd const0(%rip), %ymm11, %ymm3 +vpslld $1, %ymm8, %ymm8 +vpslld $1, %ymm3, %ymm3 +vmovdqa 256(%rsp), %ymm4 +vpunpcklwd const0(%rip), %ymm4, %ymm10 +vpunpckhwd const0(%rip), %ymm4, %ymm4 +vmovdqa 512(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm10, %ymm9 +vpaddd %ymm6, %ymm4, %ymm2 +vpsubd %ymm8, %ymm9, %ymm9 +vpsubd %ymm3, %ymm2, %ymm2 +vpsubd %ymm5, %ymm10, %ymm5 +vpsubd %ymm6, %ymm4, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1536(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm4 +vpunpckhwd const0(%rip), %ymm5, %ymm10 +vpslld $1, %ymm4, %ymm4 +vpslld $1, %ymm10, %ymm10 +vpsubd %ymm4, %ymm9, %ymm9 +vpsubd %ymm10, %ymm2, %ymm2 +vpsrld $1, %ymm9, %ymm9 +vpsrld $1, %ymm2, %ymm2 +vpand mask32_to_16(%rip), %ymm9, %ymm9 +vpand mask32_to_16(%rip), %ymm2, %ymm2 +vpackusdw %ymm2, %ymm9, %ymm2 +vmovdqa 768(%rsp), %ymm9 +vpaddw 1024(%rsp), %ymm9, %ymm10 +vpsubw 1024(%rsp), %ymm9, %ymm9 +vpsrlw $2, %ymm9, %ymm9 +vpsubw %ymm6, %ymm9, %ymm9 +vpmullw %ymm14, %ymm9, %ymm9 +vpsllw $1, %ymm11, %ymm4 +vpsubw %ymm4, %ymm10, %ymm4 +vpsllw $7, %ymm5, %ymm10 +vpsubw %ymm10, %ymm4, %ymm10 +vpsrlw $3, %ymm10, %ymm10 +vpsubw %ymm2, %ymm10, %ymm10 +vmovdqa 1280(%rsp), %ymm4 +vpsubw %ymm11, %ymm4, %ymm4 +vpmullw %ymm15, %ymm5, %ymm3 +vpsubw %ymm3, %ymm4, %ymm3 +vpmullw %ymm14, %ymm10, %ymm10 +vpsubw %ymm10, %ymm2, %ymm2 +vpmullw 
%ymm12, %ymm10, %ymm4 +vpaddw %ymm4, %ymm2, %ymm4 +vpmullw %ymm12, %ymm4, %ymm4 +vpsubw %ymm4, %ymm3, %ymm4 +vpmullw %ymm14, %ymm4, %ymm4 +vpsubw %ymm6, %ymm4, %ymm4 +vpsrlw $3, %ymm4, %ymm4 +vpsubw %ymm9, %ymm4, %ymm4 +vpsubw %ymm4, %ymm9, %ymm9 +vpsubw %ymm9, %ymm6, %ymm6 +vpmullw %ymm13, %ymm4, %ymm4 +vpsubw %ymm4, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm10, %ymm10 +vpand mask3_5_3_5(%rip), %ymm10, %ymm3 +vpand mask5_3_5_3(%rip), %ymm10, %ymm10 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm8 +vpor %ymm8, %ymm10, %ymm10 +vpaddw 2048(%rsp), %ymm11, %ymm11 +vpaddw %ymm10, %ymm11, %ymm11 +vmovdqa %xmm3, 2048(%rsp) +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_3_5(%rip), %ymm4, %ymm3 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm8 +vpor %ymm8, %ymm4, %ymm4 +vpaddw 2304(%rsp), %ymm6, %ymm6 +vpaddw %ymm4, %ymm6, %ymm6 +vmovdqa %xmm3, 2304(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_3_5(%rip), %ymm5, %ymm3 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm8 +vpor %ymm8, %ymm5, %ymm5 +vpaddw 2560(%rsp), %ymm2, %ymm2 +vpaddw %ymm5, %ymm2, %ymm2 +vmovdqa %xmm3, 2560(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 32(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 384(%rdi) +vpand mask_mod8192(%rip), %ymm2, %ymm2 +vmovdqu %ymm2, 736(%rdi) +vpand mask_mod8192(%rip), %ymm9, %ymm9 +vmovdqu %ymm9, 1088(%rdi) +vmovdqa 32(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm4 +vpunpckhwd const0(%rip), %ymm5, %ymm10 +vpslld $1, %ymm4, %ymm4 +vpslld $1, %ymm10, %ymm10 +vmovdqa 288(%rsp), %ymm9 +vpunpcklwd const0(%rip), %ymm9, %ymm2 +vpunpckhwd const0(%rip), %ymm9, %ymm9 +vmovdqa 544(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm2, %ymm3 +vpaddd %ymm6, %ymm9, %ymm8 +vpsubd %ymm4, %ymm3, %ymm3 +vpsubd %ymm10, %ymm8, %ymm8 +vpsubd 
%ymm11, %ymm2, %ymm11 +vpsubd %ymm6, %ymm9, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1568(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm9 +vpunpckhwd const0(%rip), %ymm11, %ymm2 +vpslld $1, %ymm9, %ymm9 +vpslld $1, %ymm2, %ymm2 +vpsubd %ymm9, %ymm3, %ymm3 +vpsubd %ymm2, %ymm8, %ymm8 +vpsrld $1, %ymm3, %ymm3 +vpsrld $1, %ymm8, %ymm8 +vpand mask32_to_16(%rip), %ymm3, %ymm3 +vpand mask32_to_16(%rip), %ymm8, %ymm8 +vpackusdw %ymm8, %ymm3, %ymm8 +vmovdqa 800(%rsp), %ymm3 +vpaddw 1056(%rsp), %ymm3, %ymm2 +vpsubw 1056(%rsp), %ymm3, %ymm3 +vpsrlw $2, %ymm3, %ymm3 +vpsubw %ymm6, %ymm3, %ymm3 +vpmullw %ymm14, %ymm3, %ymm3 +vpsllw $1, %ymm5, %ymm9 +vpsubw %ymm9, %ymm2, %ymm9 +vpsllw $7, %ymm11, %ymm2 +vpsubw %ymm2, %ymm9, %ymm2 +vpsrlw $3, %ymm2, %ymm2 +vpsubw %ymm8, %ymm2, %ymm2 +vmovdqa 1312(%rsp), %ymm9 +vpsubw %ymm5, %ymm9, %ymm9 +vpmullw %ymm15, %ymm11, %ymm10 +vpsubw %ymm10, %ymm9, %ymm10 +vpmullw %ymm14, %ymm2, %ymm2 +vpsubw %ymm2, %ymm8, %ymm8 +vpmullw %ymm12, %ymm2, %ymm9 +vpaddw %ymm9, %ymm8, %ymm9 +vpmullw %ymm12, %ymm9, %ymm9 +vpsubw %ymm9, %ymm10, %ymm9 +vpmullw %ymm14, %ymm9, %ymm9 +vpsubw %ymm6, %ymm9, %ymm9 +vpsrlw $3, %ymm9, %ymm9 +vpsubw %ymm3, %ymm9, %ymm9 +vpsubw %ymm9, %ymm3, %ymm3 +vpsubw %ymm3, %ymm6, %ymm6 +vpmullw %ymm13, %ymm9, %ymm9 +vpsubw %ymm9, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm2, %ymm2 +vpand mask3_5_3_5(%rip), %ymm2, %ymm10 +vpand mask5_3_5_3(%rip), %ymm2, %ymm2 +vpermq $206, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm4 +vpor %ymm4, %ymm2, %ymm2 +vpaddw 2080(%rsp), %ymm5, %ymm5 +vpaddw %ymm2, %ymm5, %ymm5 +vmovdqa %xmm10, 2080(%rsp) +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_3_5(%rip), %ymm9, %ymm10 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $206, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm4 +vpor %ymm4, %ymm9, %ymm9 +vpaddw 2336(%rsp), %ymm6, %ymm6 
+vpaddw %ymm9, %ymm6, %ymm6 +vmovdqa %xmm10, 2336(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_3_5(%rip), %ymm11, %ymm10 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $206, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm4 +vpor %ymm4, %ymm11, %ymm11 +vpaddw 2592(%rsp), %ymm8, %ymm8 +vpaddw %ymm11, %ymm8, %ymm8 +vmovdqa %xmm10, 2592(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %ymm5, 120(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 472(%rdi) +vpand mask_mod8192(%rip), %ymm8, %ymm8 +vmovdqu %ymm8, 824(%rdi) +vpand mask_mod8192(%rip), %ymm3, %ymm3 +vmovdqu %ymm3, 1176(%rdi) +vmovdqa 64(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm9 +vpunpckhwd const0(%rip), %ymm11, %ymm2 +vpslld $1, %ymm9, %ymm9 +vpslld $1, %ymm2, %ymm2 +vmovdqa 320(%rsp), %ymm3 +vpunpcklwd const0(%rip), %ymm3, %ymm8 +vpunpckhwd const0(%rip), %ymm3, %ymm3 +vmovdqa 576(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm8, %ymm10 +vpaddd %ymm6, %ymm3, %ymm4 +vpsubd %ymm9, %ymm10, %ymm10 +vpsubd %ymm2, %ymm4, %ymm4 +vpsubd %ymm5, %ymm8, %ymm5 +vpsubd %ymm6, %ymm3, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1600(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm3 +vpunpckhwd const0(%rip), %ymm5, %ymm8 +vpslld $1, %ymm3, %ymm3 +vpslld $1, %ymm8, %ymm8 +vpsubd %ymm3, %ymm10, %ymm10 +vpsubd %ymm8, %ymm4, %ymm4 +vpsrld $1, %ymm10, %ymm10 +vpsrld $1, %ymm4, %ymm4 +vpand mask32_to_16(%rip), %ymm10, %ymm10 +vpand mask32_to_16(%rip), %ymm4, %ymm4 +vpackusdw %ymm4, %ymm10, %ymm4 +vmovdqa 832(%rsp), %ymm10 +vpaddw 1088(%rsp), %ymm10, %ymm8 +vpsubw 1088(%rsp), %ymm10, %ymm10 +vpsrlw $2, %ymm10, %ymm10 +vpsubw %ymm6, %ymm10, %ymm10 +vpmullw %ymm14, %ymm10, %ymm10 +vpsllw $1, %ymm11, %ymm3 +vpsubw %ymm3, %ymm8, %ymm3 +vpsllw $7, %ymm5, %ymm8 +vpsubw %ymm8, %ymm3, %ymm8 
+vpsrlw $3, %ymm8, %ymm8 +vpsubw %ymm4, %ymm8, %ymm8 +vmovdqa 1344(%rsp), %ymm3 +vpsubw %ymm11, %ymm3, %ymm3 +vpmullw %ymm15, %ymm5, %ymm2 +vpsubw %ymm2, %ymm3, %ymm2 +vpmullw %ymm14, %ymm8, %ymm8 +vpsubw %ymm8, %ymm4, %ymm4 +vpmullw %ymm12, %ymm8, %ymm3 +vpaddw %ymm3, %ymm4, %ymm3 +vpmullw %ymm12, %ymm3, %ymm3 +vpsubw %ymm3, %ymm2, %ymm3 +vpmullw %ymm14, %ymm3, %ymm3 +vpsubw %ymm6, %ymm3, %ymm3 +vpsrlw $3, %ymm3, %ymm3 +vpsubw %ymm10, %ymm3, %ymm3 +vpsubw %ymm3, %ymm10, %ymm10 +vpsubw %ymm10, %ymm6, %ymm6 +vpmullw %ymm13, %ymm3, %ymm3 +vpsubw %ymm3, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_3_5(%rip), %ymm8, %ymm2 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $206, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm9 +vpor %ymm9, %ymm8, %ymm8 +vpaddw 2112(%rsp), %ymm11, %ymm11 +vpaddw %ymm8, %ymm11, %ymm11 +vmovdqa %xmm2, 2112(%rsp) +vpshufb shuf48_16(%rip), %ymm3, %ymm3 +vpand mask3_5_3_5(%rip), %ymm3, %ymm2 +vpand mask5_3_5_3(%rip), %ymm3, %ymm3 +vpermq $206, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm9 +vpor %ymm9, %ymm3, %ymm3 +vpaddw 2368(%rsp), %ymm6, %ymm6 +vpaddw %ymm3, %ymm6, %ymm6 +vmovdqa %xmm2, 2368(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_3_5(%rip), %ymm5, %ymm2 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $206, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm9 +vpor %ymm9, %ymm5, %ymm5 +vpaddw 2624(%rsp), %ymm4, %ymm4 +vpaddw %ymm5, %ymm4, %ymm4 +vmovdqa %xmm2, 2624(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 208(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 560(%rdi) +vpand mask_mod8192(%rip), %ymm4, %ymm4 +vmovdqu %ymm4, 912(%rdi) +vpand mask_mod8192(%rip), %ymm10, %ymm10 +vmovdqu %ymm10, 1264(%rdi) +vmovdqa 96(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm3 +vpunpckhwd const0(%rip), %ymm5, %ymm8 +vpslld $1, %ymm3, %ymm3 +vpslld $1, %ymm8, %ymm8 +vmovdqa 352(%rsp), %ymm10 +vpunpcklwd const0(%rip), %ymm10, %ymm4 +vpunpckhwd const0(%rip), %ymm10, 
%ymm10 +vmovdqa 608(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm4, %ymm2 +vpaddd %ymm6, %ymm10, %ymm9 +vpsubd %ymm3, %ymm2, %ymm2 +vpsubd %ymm8, %ymm9, %ymm9 +vpsubd %ymm11, %ymm4, %ymm11 +vpsubd %ymm6, %ymm10, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1632(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm10 +vpunpckhwd const0(%rip), %ymm11, %ymm4 +vpslld $1, %ymm10, %ymm10 +vpslld $1, %ymm4, %ymm4 +vpsubd %ymm10, %ymm2, %ymm2 +vpsubd %ymm4, %ymm9, %ymm9 +vpsrld $1, %ymm2, %ymm2 +vpsrld $1, %ymm9, %ymm9 +vpand mask32_to_16(%rip), %ymm2, %ymm2 +vpand mask32_to_16(%rip), %ymm9, %ymm9 +vpackusdw %ymm9, %ymm2, %ymm9 +vmovdqa 864(%rsp), %ymm2 +vpaddw 1120(%rsp), %ymm2, %ymm4 +vpsubw 1120(%rsp), %ymm2, %ymm2 +vpsrlw $2, %ymm2, %ymm2 +vpsubw %ymm6, %ymm2, %ymm2 +vpmullw %ymm14, %ymm2, %ymm2 +vpsllw $1, %ymm5, %ymm10 +vpsubw %ymm10, %ymm4, %ymm10 +vpsllw $7, %ymm11, %ymm4 +vpsubw %ymm4, %ymm10, %ymm4 +vpsrlw $3, %ymm4, %ymm4 +vpsubw %ymm9, %ymm4, %ymm4 +vmovdqa 1376(%rsp), %ymm10 +vpsubw %ymm5, %ymm10, %ymm10 +vpmullw %ymm15, %ymm11, %ymm8 +vpsubw %ymm8, %ymm10, %ymm8 +vpmullw %ymm14, %ymm4, %ymm4 +vpsubw %ymm4, %ymm9, %ymm9 +vpmullw %ymm12, %ymm4, %ymm10 +vpaddw %ymm10, %ymm9, %ymm10 +vpmullw %ymm12, %ymm10, %ymm10 +vpsubw %ymm10, %ymm8, %ymm10 +vpmullw %ymm14, %ymm10, %ymm10 +vpsubw %ymm6, %ymm10, %ymm10 +vpsrlw $3, %ymm10, %ymm10 +vpsubw %ymm2, %ymm10, %ymm10 +vpsubw %ymm10, %ymm2, %ymm2 +vpsubw %ymm2, %ymm6, %ymm6 +vpmullw %ymm13, %ymm10, %ymm10 +vpsubw %ymm10, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_3_5(%rip), %ymm4, %ymm8 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $206, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm3 +vpor %ymm3, %ymm4, %ymm4 +vpaddw 2144(%rsp), %ymm5, %ymm5 +vpaddw %ymm4, %ymm5, %ymm5 +vmovdqa %xmm8, 
2144(%rsp) +vpshufb shuf48_16(%rip), %ymm10, %ymm10 +vpand mask3_5_3_5(%rip), %ymm10, %ymm8 +vpand mask5_3_5_3(%rip), %ymm10, %ymm10 +vpermq $206, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm3 +vpor %ymm3, %ymm10, %ymm10 +vpaddw 2400(%rsp), %ymm6, %ymm6 +vpaddw %ymm10, %ymm6, %ymm6 +vmovdqa %xmm8, 2400(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_3_5(%rip), %ymm11, %ymm8 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $206, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm3 +vpor %ymm3, %ymm11, %ymm11 +vpaddw 2656(%rsp), %ymm9, %ymm9 +vpaddw %ymm11, %ymm9, %ymm9 +vmovdqa %xmm8, 2656(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %ymm5, 296(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 648(%rdi) +vpand mask_mod8192(%rip), %ymm9, %ymm9 +vmovdqu %ymm9, 1000(%rdi) +vpand mask_mod8192(%rip), %ymm2, %ymm2 +vmovdqu %ymm2, 1352(%rdi) +vmovdqa 128(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm10 +vpunpckhwd const0(%rip), %ymm11, %ymm4 +vpslld $1, %ymm10, %ymm10 +vpslld $1, %ymm4, %ymm4 +vmovdqa 384(%rsp), %ymm2 +vpunpcklwd const0(%rip), %ymm2, %ymm9 +vpunpckhwd const0(%rip), %ymm2, %ymm2 +vmovdqa 640(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm9, %ymm8 +vpaddd %ymm6, %ymm2, %ymm3 +vpsubd %ymm10, %ymm8, %ymm8 +vpsubd %ymm4, %ymm3, %ymm3 +vpsubd %ymm5, %ymm9, %ymm5 +vpsubd %ymm6, %ymm2, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1664(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm2 +vpunpckhwd const0(%rip), %ymm5, %ymm9 +vpslld $1, %ymm2, %ymm2 +vpslld $1, %ymm9, %ymm9 +vpsubd %ymm2, %ymm8, %ymm8 +vpsubd %ymm9, %ymm3, %ymm3 +vpsrld $1, %ymm8, %ymm8 +vpsrld $1, %ymm3, %ymm3 +vpand mask32_to_16(%rip), %ymm8, %ymm8 +vpand mask32_to_16(%rip), %ymm3, %ymm3 +vpackusdw %ymm3, %ymm8, %ymm3 +vmovdqa 896(%rsp), %ymm8 +vpaddw 
1152(%rsp), %ymm8, %ymm9 +vpsubw 1152(%rsp), %ymm8, %ymm8 +vpsrlw $2, %ymm8, %ymm8 +vpsubw %ymm6, %ymm8, %ymm8 +vpmullw %ymm14, %ymm8, %ymm8 +vpsllw $1, %ymm11, %ymm2 +vpsubw %ymm2, %ymm9, %ymm2 +vpsllw $7, %ymm5, %ymm9 +vpsubw %ymm9, %ymm2, %ymm9 +vpsrlw $3, %ymm9, %ymm9 +vpsubw %ymm3, %ymm9, %ymm9 +vmovdqa 1408(%rsp), %ymm2 +vpsubw %ymm11, %ymm2, %ymm2 +vpmullw %ymm15, %ymm5, %ymm4 +vpsubw %ymm4, %ymm2, %ymm4 +vpmullw %ymm14, %ymm9, %ymm9 +vpsubw %ymm9, %ymm3, %ymm3 +vpmullw %ymm12, %ymm9, %ymm2 +vpaddw %ymm2, %ymm3, %ymm2 +vpmullw %ymm12, %ymm2, %ymm2 +vpsubw %ymm2, %ymm4, %ymm2 +vpmullw %ymm14, %ymm2, %ymm2 +vpsubw %ymm6, %ymm2, %ymm2 +vpsrlw $3, %ymm2, %ymm2 +vpsubw %ymm8, %ymm2, %ymm2 +vpsubw %ymm2, %ymm8, %ymm8 +vpsubw %ymm8, %ymm6, %ymm6 +vpmullw %ymm13, %ymm2, %ymm2 +vpsubw %ymm2, %ymm6, %ymm6 +vmovdqu 384(%rdi), %ymm4 +vmovdqu 736(%rdi), %ymm10 +vmovdqu 1088(%rdi), %ymm7 +vpaddw %ymm11, %ymm4, %ymm11 +vpaddw %ymm6, %ymm10, %ymm6 +vpaddw %ymm3, %ymm7, %ymm3 +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_3_5(%rip), %ymm8, %ymm7 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $206, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm10 +vpor %ymm10, %ymm8, %ymm8 +vmovdqu 32(%rdi), %ymm10 +vpaddw 1920(%rsp), %ymm10, %ymm10 +vpaddw %ymm8, %ymm10, %ymm10 +vpand mask_mod8192(%rip), %ymm10, %ymm10 +vmovdqu %ymm10, 32(%rdi) +vmovdqa %xmm7, 1920(%rsp) +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_3_5(%rip), %ymm9, %ymm7 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $206, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm10 +vpor %ymm10, %ymm9, %ymm9 +vpaddw 2176(%rsp), %ymm11, %ymm11 +vpaddw %ymm9, %ymm11, %ymm11 +vmovdqa %xmm7, 2176(%rsp) +vpshufb shuf48_16(%rip), %ymm2, %ymm2 +vpand mask3_5_3_5(%rip), %ymm2, %ymm7 +vpand mask5_3_5_3(%rip), %ymm2, %ymm2 +vpermq $206, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm10 +vpor %ymm10, %ymm2, %ymm2 +vpaddw 2432(%rsp), %ymm6, %ymm6 +vpaddw %ymm2, %ymm6, %ymm6 +vmovdqa %xmm7, 2432(%rsp) +vpshufb 
shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_3_5(%rip), %ymm5, %ymm7 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $206, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm10 +vpor %ymm10, %ymm5, %ymm5 +vpaddw 2688(%rsp), %ymm3, %ymm3 +vpaddw %ymm5, %ymm3, %ymm3 +vmovdqa %xmm7, 2688(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 384(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 736(%rdi) +vpand mask_mod8192(%rip), %ymm3, %ymm3 +vmovdqu %ymm3, 1088(%rdi) +vmovdqa 160(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm2 +vpunpckhwd const0(%rip), %ymm5, %ymm9 +vpslld $1, %ymm2, %ymm2 +vpslld $1, %ymm9, %ymm9 +vmovdqa 416(%rsp), %ymm8 +vpunpcklwd const0(%rip), %ymm8, %ymm3 +vpunpckhwd const0(%rip), %ymm8, %ymm8 +vmovdqa 672(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm3, %ymm7 +vpaddd %ymm6, %ymm8, %ymm10 +vpsubd %ymm2, %ymm7, %ymm7 +vpsubd %ymm9, %ymm10, %ymm10 +vpsubd %ymm11, %ymm3, %ymm11 +vpsubd %ymm6, %ymm8, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1696(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm8 +vpunpckhwd const0(%rip), %ymm11, %ymm3 +vpslld $1, %ymm8, %ymm8 +vpslld $1, %ymm3, %ymm3 +vpsubd %ymm8, %ymm7, %ymm7 +vpsubd %ymm3, %ymm10, %ymm10 +vpsrld $1, %ymm7, %ymm7 +vpsrld $1, %ymm10, %ymm10 +vpand mask32_to_16(%rip), %ymm7, %ymm7 +vpand mask32_to_16(%rip), %ymm10, %ymm10 +vpackusdw %ymm10, %ymm7, %ymm10 +vmovdqa 928(%rsp), %ymm7 +vpaddw 1184(%rsp), %ymm7, %ymm3 +vpsubw 1184(%rsp), %ymm7, %ymm7 +vpsrlw $2, %ymm7, %ymm7 +vpsubw %ymm6, %ymm7, %ymm7 +vpmullw %ymm14, %ymm7, %ymm7 +vpsllw $1, %ymm5, %ymm8 +vpsubw %ymm8, %ymm3, %ymm8 +vpsllw $7, %ymm11, %ymm3 +vpsubw %ymm3, %ymm8, %ymm3 +vpsrlw $3, %ymm3, %ymm3 +vpsubw %ymm10, %ymm3, %ymm3 +vmovdqa 1440(%rsp), %ymm8 +vpsubw %ymm5, %ymm8, %ymm8 +vpmullw %ymm15, %ymm11, 
%ymm9 +vpsubw %ymm9, %ymm8, %ymm9 +vpmullw %ymm14, %ymm3, %ymm3 +vpsubw %ymm3, %ymm10, %ymm10 +vpmullw %ymm12, %ymm3, %ymm8 +vpaddw %ymm8, %ymm10, %ymm8 +vpmullw %ymm12, %ymm8, %ymm8 +vpsubw %ymm8, %ymm9, %ymm8 +vpmullw %ymm14, %ymm8, %ymm8 +vpsubw %ymm6, %ymm8, %ymm8 +vpsrlw $3, %ymm8, %ymm8 +vpsubw %ymm7, %ymm8, %ymm8 +vpsubw %ymm8, %ymm7, %ymm7 +vpsubw %ymm7, %ymm6, %ymm6 +vpmullw %ymm13, %ymm8, %ymm8 +vpsubw %ymm8, %ymm6, %ymm6 +vmovdqu 472(%rdi), %ymm9 +vmovdqu 824(%rdi), %ymm2 +vmovdqu 1176(%rdi), %ymm4 +vpaddw %ymm5, %ymm9, %ymm5 +vpaddw %ymm6, %ymm2, %ymm6 +vpaddw %ymm10, %ymm4, %ymm10 +vpshufb shuf48_16(%rip), %ymm7, %ymm7 +vpand mask3_5_3_5(%rip), %ymm7, %ymm4 +vpand mask5_3_5_3(%rip), %ymm7, %ymm7 +vpermq $206, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm2 +vpor %ymm2, %ymm7, %ymm7 +vmovdqu 120(%rdi), %ymm2 +vpaddw 1952(%rsp), %ymm2, %ymm2 +vpaddw %ymm7, %ymm2, %ymm2 +vpand mask_mod8192(%rip), %ymm2, %ymm2 +vmovdqu %ymm2, 120(%rdi) +vmovdqa %xmm4, 1952(%rsp) +vpshufb shuf48_16(%rip), %ymm3, %ymm3 +vpand mask3_5_3_5(%rip), %ymm3, %ymm4 +vpand mask5_3_5_3(%rip), %ymm3, %ymm3 +vpermq $206, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm2 +vpor %ymm2, %ymm3, %ymm3 +vpaddw 2208(%rsp), %ymm5, %ymm5 +vpaddw %ymm3, %ymm5, %ymm5 +vmovdqa %xmm4, 2208(%rsp) +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_3_5(%rip), %ymm8, %ymm4 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $206, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm2 +vpor %ymm2, %ymm8, %ymm8 +vpaddw 2464(%rsp), %ymm6, %ymm6 +vpaddw %ymm8, %ymm6, %ymm6 +vmovdqa %xmm4, 2464(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_3_5(%rip), %ymm11, %ymm4 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $206, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm2 +vpor %ymm2, %ymm11, %ymm11 +vpaddw 2720(%rsp), %ymm10, %ymm10 +vpaddw %ymm11, %ymm10, %ymm10 +vmovdqa %xmm4, 2720(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %ymm5, 472(%rdi) +vpand mask_mod8192(%rip), 
%ymm6, %ymm6 +vmovdqu %ymm6, 824(%rdi) +vpand mask_mod8192(%rip), %ymm10, %ymm10 +vmovdqu %ymm10, 1176(%rdi) +vmovdqa 192(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm8 +vpunpckhwd const0(%rip), %ymm11, %ymm3 +vpslld $1, %ymm8, %ymm8 +vpslld $1, %ymm3, %ymm3 +vmovdqa 448(%rsp), %ymm7 +vpunpcklwd const0(%rip), %ymm7, %ymm10 +vpunpckhwd const0(%rip), %ymm7, %ymm7 +vmovdqa 704(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm10, %ymm4 +vpaddd %ymm6, %ymm7, %ymm2 +vpsubd %ymm8, %ymm4, %ymm4 +vpsubd %ymm3, %ymm2, %ymm2 +vpsubd %ymm5, %ymm10, %ymm5 +vpsubd %ymm6, %ymm7, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1728(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm7 +vpunpckhwd const0(%rip), %ymm5, %ymm10 +vpslld $1, %ymm7, %ymm7 +vpslld $1, %ymm10, %ymm10 +vpsubd %ymm7, %ymm4, %ymm4 +vpsubd %ymm10, %ymm2, %ymm2 +vpsrld $1, %ymm4, %ymm4 +vpsrld $1, %ymm2, %ymm2 +vpand mask32_to_16(%rip), %ymm4, %ymm4 +vpand mask32_to_16(%rip), %ymm2, %ymm2 +vpackusdw %ymm2, %ymm4, %ymm2 +vmovdqa 960(%rsp), %ymm4 +vpaddw 1216(%rsp), %ymm4, %ymm10 +vpsubw 1216(%rsp), %ymm4, %ymm4 +vpsrlw $2, %ymm4, %ymm4 +vpsubw %ymm6, %ymm4, %ymm4 +vpmullw %ymm14, %ymm4, %ymm4 +vpsllw $1, %ymm11, %ymm7 +vpsubw %ymm7, %ymm10, %ymm7 +vpsllw $7, %ymm5, %ymm10 +vpsubw %ymm10, %ymm7, %ymm10 +vpsrlw $3, %ymm10, %ymm10 +vpsubw %ymm2, %ymm10, %ymm10 +vmovdqa 1472(%rsp), %ymm7 +vpsubw %ymm11, %ymm7, %ymm7 +vpmullw %ymm15, %ymm5, %ymm3 +vpsubw %ymm3, %ymm7, %ymm3 +vpmullw %ymm14, %ymm10, %ymm10 +vpsubw %ymm10, %ymm2, %ymm2 +vpmullw %ymm12, %ymm10, %ymm7 +vpaddw %ymm7, %ymm2, %ymm7 +vpmullw %ymm12, %ymm7, %ymm7 +vpsubw %ymm7, %ymm3, %ymm7 +vpmullw %ymm14, %ymm7, %ymm7 +vpsubw %ymm6, %ymm7, %ymm7 +vpsrlw $3, %ymm7, %ymm7 +vpsubw %ymm4, %ymm7, %ymm7 +vpsubw %ymm7, %ymm4, %ymm4 +vpsubw %ymm4, %ymm6, %ymm6 +vpmullw 
%ymm13, %ymm7, %ymm7 +vpsubw %ymm7, %ymm6, %ymm6 +vmovdqu 560(%rdi), %ymm3 +vmovdqu 912(%rdi), %ymm8 +vmovdqu 1264(%rdi), %ymm9 +vpaddw %ymm11, %ymm3, %ymm11 +vpaddw %ymm6, %ymm8, %ymm6 +vpaddw %ymm2, %ymm9, %ymm2 +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_3_5(%rip), %ymm4, %ymm9 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm8 +vpor %ymm8, %ymm4, %ymm4 +vmovdqu 208(%rdi), %ymm8 +vpaddw 1984(%rsp), %ymm8, %ymm8 +vpaddw %ymm4, %ymm8, %ymm8 +vpand mask_mod8192(%rip), %ymm8, %ymm8 +vmovdqu %ymm8, 208(%rdi) +vmovdqa %xmm9, 1984(%rsp) +vpshufb shuf48_16(%rip), %ymm10, %ymm10 +vpand mask3_5_3_5(%rip), %ymm10, %ymm9 +vpand mask5_3_5_3(%rip), %ymm10, %ymm10 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm8 +vpor %ymm8, %ymm10, %ymm10 +vpaddw 2240(%rsp), %ymm11, %ymm11 +vpaddw %ymm10, %ymm11, %ymm11 +vmovdqa %xmm9, 2240(%rsp) +vpshufb shuf48_16(%rip), %ymm7, %ymm7 +vpand mask3_5_3_5(%rip), %ymm7, %ymm9 +vpand mask5_3_5_3(%rip), %ymm7, %ymm7 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm8 +vpor %ymm8, %ymm7, %ymm7 +vpaddw 2496(%rsp), %ymm6, %ymm6 +vpaddw %ymm7, %ymm6, %ymm6 +vmovdqa %xmm9, 2496(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_3_5(%rip), %ymm5, %ymm9 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $206, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm8 +vpor %ymm8, %ymm5, %ymm5 +vpaddw 2752(%rsp), %ymm2, %ymm2 +vpaddw %ymm5, %ymm2, %ymm2 +vmovdqa %xmm9, 2752(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 560(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 912(%rdi) +vpand mask_mod8192(%rip), %ymm2, %ymm2 +vmovdqu %ymm2, 1264(%rdi) +vmovdqa 224(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm7 +vpunpckhwd const0(%rip), %ymm5, %ymm10 +vpslld $1, %ymm7, %ymm7 +vpslld $1, %ymm10, %ymm10 +vmovdqa 480(%rsp), %ymm4 +vpunpcklwd const0(%rip), %ymm4, %ymm2 +vpunpckhwd const0(%rip), %ymm4, %ymm4 +vmovdqa 736(%rsp), 
%ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm2, %ymm9 +vpaddd %ymm6, %ymm4, %ymm8 +vpsubd %ymm7, %ymm9, %ymm9 +vpsubd %ymm10, %ymm8, %ymm8 +vpsubd %ymm11, %ymm2, %ymm11 +vpsubd %ymm6, %ymm4, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1760(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm4 +vpunpckhwd const0(%rip), %ymm11, %ymm2 +vpslld $1, %ymm4, %ymm4 +vpslld $1, %ymm2, %ymm2 +vpsubd %ymm4, %ymm9, %ymm9 +vpsubd %ymm2, %ymm8, %ymm8 +vpsrld $1, %ymm9, %ymm9 +vpsrld $1, %ymm8, %ymm8 +vpand mask32_to_16(%rip), %ymm9, %ymm9 +vpand mask32_to_16(%rip), %ymm8, %ymm8 +vpackusdw %ymm8, %ymm9, %ymm8 +vmovdqa 992(%rsp), %ymm9 +vpaddw 1248(%rsp), %ymm9, %ymm2 +vpsubw 1248(%rsp), %ymm9, %ymm9 +vpsrlw $2, %ymm9, %ymm9 +vpsubw %ymm6, %ymm9, %ymm9 +vpmullw %ymm14, %ymm9, %ymm9 +vpsllw $1, %ymm5, %ymm4 +vpsubw %ymm4, %ymm2, %ymm4 +vpsllw $7, %ymm11, %ymm2 +vpsubw %ymm2, %ymm4, %ymm2 +vpsrlw $3, %ymm2, %ymm2 +vpsubw %ymm8, %ymm2, %ymm2 +vmovdqa 1504(%rsp), %ymm4 +vpsubw %ymm5, %ymm4, %ymm4 +vpmullw %ymm15, %ymm11, %ymm10 +vpsubw %ymm10, %ymm4, %ymm10 +vpmullw %ymm14, %ymm2, %ymm2 +vpsubw %ymm2, %ymm8, %ymm8 +vpmullw %ymm12, %ymm2, %ymm4 +vpaddw %ymm4, %ymm8, %ymm4 +vpmullw %ymm12, %ymm4, %ymm4 +vpsubw %ymm4, %ymm10, %ymm4 +vpmullw %ymm14, %ymm4, %ymm4 +vpsubw %ymm6, %ymm4, %ymm4 +vpsrlw $3, %ymm4, %ymm4 +vpsubw %ymm9, %ymm4, %ymm4 +vpsubw %ymm4, %ymm9, %ymm9 +vpsubw %ymm9, %ymm6, %ymm6 +vpmullw %ymm13, %ymm4, %ymm4 +vpsubw %ymm4, %ymm6, %ymm6 +vmovdqu 648(%rdi), %ymm10 +vmovdqu 1000(%rdi), %ymm7 +vmovdqu 1352(%rdi), %ymm3 +vpaddw %ymm5, %ymm10, %ymm5 +vpaddw %ymm6, %ymm7, %ymm6 +vpaddw %ymm8, %ymm3, %ymm8 +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_3_5(%rip), %ymm9, %ymm3 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, 
%ymm7 +vpor %ymm7, %ymm9, %ymm9 +vmovdqu 296(%rdi), %ymm7 +vpaddw 2016(%rsp), %ymm7, %ymm7 +vpaddw %ymm9, %ymm7, %ymm7 +vpand mask_mod8192(%rip), %ymm7, %ymm7 +vmovdqu %ymm7, 296(%rdi) +vmovdqa %xmm3, 2016(%rsp) +vpshufb shuf48_16(%rip), %ymm2, %ymm2 +vpand mask3_5_3_5(%rip), %ymm2, %ymm3 +vpand mask5_3_5_3(%rip), %ymm2, %ymm2 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm7 +vpor %ymm7, %ymm2, %ymm2 +vpaddw 2272(%rsp), %ymm5, %ymm5 +vpaddw %ymm2, %ymm5, %ymm5 +vmovdqa %xmm3, 2272(%rsp) +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_3_5(%rip), %ymm4, %ymm3 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm7 +vpor %ymm7, %ymm4, %ymm4 +vpaddw 2528(%rsp), %ymm6, %ymm6 +vpaddw %ymm4, %ymm6, %ymm6 +vmovdqa %xmm3, 2528(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_3_5(%rip), %ymm11, %ymm3 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $206, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm7 +vpor %ymm7, %ymm11, %ymm11 +vpaddw 2784(%rsp), %ymm8, %ymm8 +vpaddw %ymm11, %ymm8, %ymm8 +vmovdqa %xmm3, 2784(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %ymm5, 648(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %ymm6, 1000(%rdi) +vpand mask_mod8192(%rip), %ymm8, %ymm8 +vmovdqu %ymm8, 1352(%rdi) +vmovdqa 160(%r12), %ymm0 +vpsubw 256(%r12), %ymm0, %ymm0 +vmovdqa 544(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 352(%r12), %ymm1, %ymm1 +vpsubw 64(%r12), %ymm0, %ymm0 +vpaddw 448(%r12), %ymm0, %ymm0 +vmovdqa 736(%r12), %ymm2 +vpsubw 832(%r12), %ymm2, %ymm2 +vmovdqa 1120(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 928(%r12), %ymm3, %ymm3 +vpsubw 640(%r12), %ymm2, %ymm2 +vpaddw 1024(%r12), %ymm2, %ymm2 +vmovdqa 1312(%r12), %ymm4 +vpsubw 1408(%r12), %ymm4, %ymm4 +vmovdqa 1696(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 1504(%r12), %ymm5, %ymm5 +vpsubw 1216(%r12), %ymm4, %ymm4 +vpaddw 1600(%r12), %ymm4, %ymm4 +vpsubw 640(%r12), %ymm1, %ymm1 +vpsubw %ymm1, 
%ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 64(%r12), %ymm1, %ymm1 +vpaddw 1216(%r12), %ymm1, %ymm1 +vmovdqa 352(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 1504(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 928(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 64(%r12), %ymm8 +vmovdqa 928(%r12), %ymm9 +vmovdqa %ymm8, 0(%rsp) +vmovdqa %ymm0, 32(%rsp) +vmovdqa %ymm1, 64(%rsp) +vmovdqa %ymm7, 96(%rsp) +vmovdqa %ymm5, 128(%rsp) +vmovdqa %ymm2, 160(%rsp) +vmovdqa %ymm3, 192(%rsp) +vmovdqa %ymm9, 224(%rsp) +vmovdqa 1888(%r12), %ymm0 +vpsubw 1984(%r12), %ymm0, %ymm0 +vmovdqa 2272(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 2080(%r12), %ymm1, %ymm1 +vpsubw 1792(%r12), %ymm0, %ymm0 +vpaddw 2176(%r12), %ymm0, %ymm0 +vmovdqa 2464(%r12), %ymm2 +vpsubw 2560(%r12), %ymm2, %ymm2 +vmovdqa 2848(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 2656(%r12), %ymm3, %ymm3 +vpsubw 2368(%r12), %ymm2, %ymm2 +vpaddw 2752(%r12), %ymm2, %ymm2 +vmovdqa 3040(%r12), %ymm4 +vpsubw 3136(%r12), %ymm4, %ymm4 +vmovdqa 3424(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 3232(%r12), %ymm5, %ymm5 +vpsubw 2944(%r12), %ymm4, %ymm4 +vpaddw 3328(%r12), %ymm4, %ymm4 +vpsubw 2368(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 1792(%r12), %ymm1, %ymm1 +vpaddw 2944(%r12), %ymm1, %ymm1 +vmovdqa 2080(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 3232(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 2656(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 1792(%r12), %ymm8 +vmovdqa 2656(%r12), %ymm9 +vmovdqa %ymm8, 256(%rsp) +vmovdqa %ymm0, 288(%rsp) +vmovdqa %ymm1, 320(%rsp) +vmovdqa %ymm7, 352(%rsp) +vmovdqa %ymm5, 384(%rsp) +vmovdqa %ymm2, 416(%rsp) +vmovdqa %ymm3, 448(%rsp) +vmovdqa %ymm9, 480(%rsp) +vmovdqa 3616(%r12), %ymm0 +vpsubw 3712(%r12), %ymm0, %ymm0 +vmovdqa 4000(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 3808(%r12), %ymm1, %ymm1 +vpsubw 3520(%r12), %ymm0, %ymm0 
+vpaddw 3904(%r12), %ymm0, %ymm0 +vmovdqa 4192(%r12), %ymm2 +vpsubw 4288(%r12), %ymm2, %ymm2 +vmovdqa 4576(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 4384(%r12), %ymm3, %ymm3 +vpsubw 4096(%r12), %ymm2, %ymm2 +vpaddw 4480(%r12), %ymm2, %ymm2 +vmovdqa 4768(%r12), %ymm4 +vpsubw 4864(%r12), %ymm4, %ymm4 +vmovdqa 5152(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 4960(%r12), %ymm5, %ymm5 +vpsubw 4672(%r12), %ymm4, %ymm4 +vpaddw 5056(%r12), %ymm4, %ymm4 +vpsubw 4096(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 3520(%r12), %ymm1, %ymm1 +vpaddw 4672(%r12), %ymm1, %ymm1 +vmovdqa 3808(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 4960(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 4384(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 3520(%r12), %ymm8 +vmovdqa 4384(%r12), %ymm9 +vmovdqa %ymm8, 512(%rsp) +vmovdqa %ymm0, 544(%rsp) +vmovdqa %ymm1, 576(%rsp) +vmovdqa %ymm7, 608(%rsp) +vmovdqa %ymm5, 640(%rsp) +vmovdqa %ymm2, 672(%rsp) +vmovdqa %ymm3, 704(%rsp) +vmovdqa %ymm9, 736(%rsp) +vmovdqa 5344(%r12), %ymm0 +vpsubw 5440(%r12), %ymm0, %ymm0 +vmovdqa 5728(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 5536(%r12), %ymm1, %ymm1 +vpsubw 5248(%r12), %ymm0, %ymm0 +vpaddw 5632(%r12), %ymm0, %ymm0 +vmovdqa 5920(%r12), %ymm2 +vpsubw 6016(%r12), %ymm2, %ymm2 +vmovdqa 6304(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 6112(%r12), %ymm3, %ymm3 +vpsubw 5824(%r12), %ymm2, %ymm2 +vpaddw 6208(%r12), %ymm2, %ymm2 +vmovdqa 6496(%r12), %ymm4 +vpsubw 6592(%r12), %ymm4, %ymm4 +vmovdqa 6880(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 6688(%r12), %ymm5, %ymm5 +vpsubw 6400(%r12), %ymm4, %ymm4 +vpaddw 6784(%r12), %ymm4, %ymm4 +vpsubw 5824(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 5248(%r12), %ymm1, %ymm1 +vpaddw 6400(%r12), %ymm1, %ymm1 +vmovdqa 5536(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 6688(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 
6112(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 5248(%r12), %ymm8 +vmovdqa 6112(%r12), %ymm9 +vmovdqa %ymm8, 768(%rsp) +vmovdqa %ymm0, 800(%rsp) +vmovdqa %ymm1, 832(%rsp) +vmovdqa %ymm7, 864(%rsp) +vmovdqa %ymm5, 896(%rsp) +vmovdqa %ymm2, 928(%rsp) +vmovdqa %ymm3, 960(%rsp) +vmovdqa %ymm9, 992(%rsp) +vmovdqa 7072(%r12), %ymm0 +vpsubw 7168(%r12), %ymm0, %ymm0 +vmovdqa 7456(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 7264(%r12), %ymm1, %ymm1 +vpsubw 6976(%r12), %ymm0, %ymm0 +vpaddw 7360(%r12), %ymm0, %ymm0 +vmovdqa 7648(%r12), %ymm2 +vpsubw 7744(%r12), %ymm2, %ymm2 +vmovdqa 8032(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 7840(%r12), %ymm3, %ymm3 +vpsubw 7552(%r12), %ymm2, %ymm2 +vpaddw 7936(%r12), %ymm2, %ymm2 +vmovdqa 8224(%r12), %ymm4 +vpsubw 8320(%r12), %ymm4, %ymm4 +vmovdqa 8608(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 8416(%r12), %ymm5, %ymm5 +vpsubw 8128(%r12), %ymm4, %ymm4 +vpaddw 8512(%r12), %ymm4, %ymm4 +vpsubw 7552(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 6976(%r12), %ymm1, %ymm1 +vpaddw 8128(%r12), %ymm1, %ymm1 +vmovdqa 7264(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 8416(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 7840(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 6976(%r12), %ymm8 +vmovdqa 7840(%r12), %ymm9 +vmovdqa %ymm8, 1024(%rsp) +vmovdqa %ymm0, 1056(%rsp) +vmovdqa %ymm1, 1088(%rsp) +vmovdqa %ymm7, 1120(%rsp) +vmovdqa %ymm5, 1152(%rsp) +vmovdqa %ymm2, 1184(%rsp) +vmovdqa %ymm3, 1216(%rsp) +vmovdqa %ymm9, 1248(%rsp) +vmovdqa 8800(%r12), %ymm0 +vpsubw 8896(%r12), %ymm0, %ymm0 +vmovdqa 9184(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 8992(%r12), %ymm1, %ymm1 +vpsubw 8704(%r12), %ymm0, %ymm0 +vpaddw 9088(%r12), %ymm0, %ymm0 +vmovdqa 9376(%r12), %ymm2 +vpsubw 9472(%r12), %ymm2, %ymm2 +vmovdqa 9760(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 9568(%r12), %ymm3, %ymm3 +vpsubw 9280(%r12), %ymm2, 
%ymm2 +vpaddw 9664(%r12), %ymm2, %ymm2 +vmovdqa 9952(%r12), %ymm4 +vpsubw 10048(%r12), %ymm4, %ymm4 +vmovdqa 10336(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 10144(%r12), %ymm5, %ymm5 +vpsubw 9856(%r12), %ymm4, %ymm4 +vpaddw 10240(%r12), %ymm4, %ymm4 +vpsubw 9280(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 8704(%r12), %ymm1, %ymm1 +vpaddw 9856(%r12), %ymm1, %ymm1 +vmovdqa 8992(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 10144(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 9568(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 8704(%r12), %ymm8 +vmovdqa 9568(%r12), %ymm9 +vmovdqa %ymm8, 1280(%rsp) +vmovdqa %ymm0, 1312(%rsp) +vmovdqa %ymm1, 1344(%rsp) +vmovdqa %ymm7, 1376(%rsp) +vmovdqa %ymm5, 1408(%rsp) +vmovdqa %ymm2, 1440(%rsp) +vmovdqa %ymm3, 1472(%rsp) +vmovdqa %ymm9, 1504(%rsp) +vmovdqa 10528(%r12), %ymm0 +vpsubw 10624(%r12), %ymm0, %ymm0 +vmovdqa 10912(%r12), %ymm1 +vpsubw %ymm0, %ymm1, %ymm1 +vpsubw 10720(%r12), %ymm1, %ymm1 +vpsubw 10432(%r12), %ymm0, %ymm0 +vpaddw 10816(%r12), %ymm0, %ymm0 +vmovdqa 11104(%r12), %ymm2 +vpsubw 11200(%r12), %ymm2, %ymm2 +vmovdqa 11488(%r12), %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw 11296(%r12), %ymm3, %ymm3 +vpsubw 11008(%r12), %ymm2, %ymm2 +vpaddw 11392(%r12), %ymm2, %ymm2 +vmovdqa 11680(%r12), %ymm4 +vpsubw 11776(%r12), %ymm4, %ymm4 +vmovdqa 12064(%r12), %ymm5 +vpsubw %ymm4, %ymm5, %ymm5 +vpsubw 11872(%r12), %ymm5, %ymm5 +vpsubw 11584(%r12), %ymm4, %ymm4 +vpaddw 11968(%r12), %ymm4, %ymm4 +vpsubw 11008(%r12), %ymm1, %ymm1 +vpsubw %ymm1, %ymm5, %ymm5 +vpsubw %ymm3, %ymm5, %ymm5 +vpsubw 10432(%r12), %ymm1, %ymm1 +vpaddw 11584(%r12), %ymm1, %ymm1 +vmovdqa 10720(%r12), %ymm6 +vpsubw %ymm2, %ymm6, %ymm7 +vmovdqa 11872(%r12), %ymm2 +vpsubw %ymm7, %ymm2, %ymm2 +vpsubw 11296(%r12), %ymm2, %ymm2 +vpsubw %ymm0, %ymm7, %ymm7 +vpaddw %ymm4, %ymm7, %ymm7 +vmovdqa 10432(%r12), %ymm8 +vmovdqa 11296(%r12), %ymm9 +vmovdqa %ymm8, 1536(%rsp) +vmovdqa 
%ymm0, 1568(%rsp) +vmovdqa %ymm1, 1600(%rsp) +vmovdqa %ymm7, 1632(%rsp) +vmovdqa %ymm5, 1664(%rsp) +vmovdqa %ymm2, 1696(%rsp) +vmovdqa %ymm3, 1728(%rsp) +vmovdqa %ymm9, 1760(%rsp) +vmovdqa 0(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm4 +vpunpckhwd const0(%rip), %ymm11, %ymm2 +vpslld $1, %ymm4, %ymm4 +vpslld $1, %ymm2, %ymm2 +vmovdqa 256(%rsp), %ymm9 +vpunpcklwd const0(%rip), %ymm9, %ymm8 +vpunpckhwd const0(%rip), %ymm9, %ymm9 +vmovdqa 512(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm8, %ymm3 +vpaddd %ymm6, %ymm9, %ymm7 +vpsubd %ymm4, %ymm3, %ymm3 +vpsubd %ymm2, %ymm7, %ymm7 +vpsubd %ymm5, %ymm8, %ymm5 +vpsubd %ymm6, %ymm9, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1536(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm9 +vpunpckhwd const0(%rip), %ymm5, %ymm8 +vpslld $1, %ymm9, %ymm9 +vpslld $1, %ymm8, %ymm8 +vpsubd %ymm9, %ymm3, %ymm3 +vpsubd %ymm8, %ymm7, %ymm7 +vpsrld $1, %ymm3, %ymm3 +vpsrld $1, %ymm7, %ymm7 +vpand mask32_to_16(%rip), %ymm3, %ymm3 +vpand mask32_to_16(%rip), %ymm7, %ymm7 +vpackusdw %ymm7, %ymm3, %ymm7 +vmovdqa 768(%rsp), %ymm3 +vpaddw 1024(%rsp), %ymm3, %ymm8 +vpsubw 1024(%rsp), %ymm3, %ymm3 +vpsrlw $2, %ymm3, %ymm3 +vpsubw %ymm6, %ymm3, %ymm3 +vpmullw %ymm14, %ymm3, %ymm3 +vpsllw $1, %ymm11, %ymm9 +vpsubw %ymm9, %ymm8, %ymm9 +vpsllw $7, %ymm5, %ymm8 +vpsubw %ymm8, %ymm9, %ymm8 +vpsrlw $3, %ymm8, %ymm8 +vpsubw %ymm7, %ymm8, %ymm8 +vmovdqa 1280(%rsp), %ymm9 +vpsubw %ymm11, %ymm9, %ymm9 +vpmullw %ymm15, %ymm5, %ymm2 +vpsubw %ymm2, %ymm9, %ymm2 +vpmullw %ymm14, %ymm8, %ymm8 +vpsubw %ymm8, %ymm7, %ymm7 +vpmullw %ymm12, %ymm8, %ymm9 +vpaddw %ymm9, %ymm7, %ymm9 +vpmullw %ymm12, %ymm9, %ymm9 +vpsubw %ymm9, %ymm2, %ymm9 +vpmullw %ymm14, %ymm9, %ymm9 +vpsubw %ymm6, %ymm9, %ymm9 +vpsrlw $3, %ymm9, %ymm9 +vpsubw %ymm3, %ymm9, %ymm9 +vpsubw %ymm9, %ymm3, 
%ymm3 +vpsubw %ymm3, %ymm6, %ymm6 +vpmullw %ymm13, %ymm9, %ymm9 +vpsubw %ymm9, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_4_3_1(%rip), %ymm8, %ymm2 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $139, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm4 +vpor %ymm4, %ymm8, %ymm8 +vpaddw 2048(%rsp), %ymm11, %ymm11 +vpaddw %ymm8, %ymm11, %ymm11 +vmovdqa %xmm2, 2048(%rsp) +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_4_3_1(%rip), %ymm9, %ymm2 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $139, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm4 +vpor %ymm4, %ymm9, %ymm9 +vpaddw 2304(%rsp), %ymm6, %ymm6 +vpaddw %ymm9, %ymm6, %ymm6 +vmovdqa %xmm2, 2304(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_4_3_1(%rip), %ymm5, %ymm2 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $139, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm4 +vpor %ymm4, %ymm5, %ymm5 +vpaddw 2560(%rsp), %ymm7, %ymm7 +vpaddw %ymm5, %ymm7, %ymm7 +vmovdqa %xmm2, 2560(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %xmm11, 64(%rdi) +vextracti128 $1, %ymm11, %xmm11 +vmovq %xmm11, 80(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %xmm6, 416(%rdi) +vextracti128 $1, %ymm6, %xmm6 +vmovq %xmm6, 432(%rdi) +vpand mask_mod8192(%rip), %ymm7, %ymm7 +vmovdqu %xmm7, 768(%rdi) +vextracti128 $1, %ymm7, %xmm7 +vmovq %xmm7, 784(%rdi) +vpand mask_mod8192(%rip), %ymm3, %ymm3 +vmovdqu %xmm3, 1120(%rdi) +vextracti128 $1, %ymm3, %xmm3 +vmovq %xmm3, 1136(%rdi) +vmovdqa 32(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm9 +vpunpckhwd const0(%rip), %ymm5, %ymm8 +vpslld $1, %ymm9, %ymm9 +vpslld $1, %ymm8, %ymm8 +vmovdqa 288(%rsp), %ymm3 +vpunpcklwd const0(%rip), %ymm3, %ymm7 +vpunpckhwd const0(%rip), %ymm3, %ymm3 +vmovdqa 544(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm7, %ymm2 +vpaddd %ymm6, %ymm3, %ymm4 +vpsubd %ymm9, %ymm2, %ymm2 +vpsubd %ymm8, %ymm4, %ymm4 +vpsubd %ymm11, %ymm7, %ymm11 
+vpsubd %ymm6, %ymm3, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1568(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm3 +vpunpckhwd const0(%rip), %ymm11, %ymm7 +vpslld $1, %ymm3, %ymm3 +vpslld $1, %ymm7, %ymm7 +vpsubd %ymm3, %ymm2, %ymm2 +vpsubd %ymm7, %ymm4, %ymm4 +vpsrld $1, %ymm2, %ymm2 +vpsrld $1, %ymm4, %ymm4 +vpand mask32_to_16(%rip), %ymm2, %ymm2 +vpand mask32_to_16(%rip), %ymm4, %ymm4 +vpackusdw %ymm4, %ymm2, %ymm4 +vmovdqa 800(%rsp), %ymm2 +vpaddw 1056(%rsp), %ymm2, %ymm7 +vpsubw 1056(%rsp), %ymm2, %ymm2 +vpsrlw $2, %ymm2, %ymm2 +vpsubw %ymm6, %ymm2, %ymm2 +vpmullw %ymm14, %ymm2, %ymm2 +vpsllw $1, %ymm5, %ymm3 +vpsubw %ymm3, %ymm7, %ymm3 +vpsllw $7, %ymm11, %ymm7 +vpsubw %ymm7, %ymm3, %ymm7 +vpsrlw $3, %ymm7, %ymm7 +vpsubw %ymm4, %ymm7, %ymm7 +vmovdqa 1312(%rsp), %ymm3 +vpsubw %ymm5, %ymm3, %ymm3 +vpmullw %ymm15, %ymm11, %ymm8 +vpsubw %ymm8, %ymm3, %ymm8 +vpmullw %ymm14, %ymm7, %ymm7 +vpsubw %ymm7, %ymm4, %ymm4 +vpmullw %ymm12, %ymm7, %ymm3 +vpaddw %ymm3, %ymm4, %ymm3 +vpmullw %ymm12, %ymm3, %ymm3 +vpsubw %ymm3, %ymm8, %ymm3 +vpmullw %ymm14, %ymm3, %ymm3 +vpsubw %ymm6, %ymm3, %ymm3 +vpsrlw $3, %ymm3, %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vpsubw %ymm3, %ymm2, %ymm2 +vpsubw %ymm2, %ymm6, %ymm6 +vpmullw %ymm13, %ymm3, %ymm3 +vpsubw %ymm3, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm7, %ymm7 +vpand mask3_5_4_3_1(%rip), %ymm7, %ymm8 +vpand mask5_3_5_3(%rip), %ymm7, %ymm7 +vpermq $139, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm9 +vpor %ymm9, %ymm7, %ymm7 +vpaddw 2080(%rsp), %ymm5, %ymm5 +vpaddw %ymm7, %ymm5, %ymm5 +vmovdqa %xmm8, 2080(%rsp) +vpshufb shuf48_16(%rip), %ymm3, %ymm3 +vpand mask3_5_4_3_1(%rip), %ymm3, %ymm8 +vpand mask5_3_5_3(%rip), %ymm3, %ymm3 +vpermq $139, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm9 +vpor %ymm9, %ymm3, %ymm3 +vpaddw 2336(%rsp), %ymm6, %ymm6 +vpaddw %ymm3, %ymm6, %ymm6 
+vmovdqa %xmm8, 2336(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_4_3_1(%rip), %ymm11, %ymm8 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $139, %ymm8, %ymm8 +vpand mask_keephigh(%rip), %ymm8, %ymm9 +vpor %ymm9, %ymm11, %ymm11 +vpaddw 2592(%rsp), %ymm4, %ymm4 +vpaddw %ymm11, %ymm4, %ymm4 +vmovdqa %xmm8, 2592(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %xmm5, 152(%rdi) +vextracti128 $1, %ymm5, %xmm5 +vmovq %xmm5, 168(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %xmm6, 504(%rdi) +vextracti128 $1, %ymm6, %xmm6 +vmovq %xmm6, 520(%rdi) +vpand mask_mod8192(%rip), %ymm4, %ymm4 +vmovdqu %xmm4, 856(%rdi) +vextracti128 $1, %ymm4, %xmm4 +vmovq %xmm4, 872(%rdi) +vpand mask_mod8192(%rip), %ymm2, %ymm2 +vmovdqu %xmm2, 1208(%rdi) +vextracti128 $1, %ymm2, %xmm2 +vmovq %xmm2, 1224(%rdi) +vmovdqa 64(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm3 +vpunpckhwd const0(%rip), %ymm11, %ymm7 +vpslld $1, %ymm3, %ymm3 +vpslld $1, %ymm7, %ymm7 +vmovdqa 320(%rsp), %ymm2 +vpunpcklwd const0(%rip), %ymm2, %ymm4 +vpunpckhwd const0(%rip), %ymm2, %ymm2 +vmovdqa 576(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm4, %ymm8 +vpaddd %ymm6, %ymm2, %ymm9 +vpsubd %ymm3, %ymm8, %ymm8 +vpsubd %ymm7, %ymm9, %ymm9 +vpsubd %ymm5, %ymm4, %ymm5 +vpsubd %ymm6, %ymm2, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1600(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm2 +vpunpckhwd const0(%rip), %ymm5, %ymm4 +vpslld $1, %ymm2, %ymm2 +vpslld $1, %ymm4, %ymm4 +vpsubd %ymm2, %ymm8, %ymm8 +vpsubd %ymm4, %ymm9, %ymm9 +vpsrld $1, %ymm8, %ymm8 +vpsrld $1, %ymm9, %ymm9 +vpand mask32_to_16(%rip), %ymm8, %ymm8 +vpand mask32_to_16(%rip), %ymm9, %ymm9 +vpackusdw %ymm9, %ymm8, %ymm9 +vmovdqa 832(%rsp), %ymm8 +vpaddw 1088(%rsp), %ymm8, %ymm4 +vpsubw 1088(%rsp), %ymm8, %ymm8 +vpsrlw $2, %ymm8, 
%ymm8 +vpsubw %ymm6, %ymm8, %ymm8 +vpmullw %ymm14, %ymm8, %ymm8 +vpsllw $1, %ymm11, %ymm2 +vpsubw %ymm2, %ymm4, %ymm2 +vpsllw $7, %ymm5, %ymm4 +vpsubw %ymm4, %ymm2, %ymm4 +vpsrlw $3, %ymm4, %ymm4 +vpsubw %ymm9, %ymm4, %ymm4 +vmovdqa 1344(%rsp), %ymm2 +vpsubw %ymm11, %ymm2, %ymm2 +vpmullw %ymm15, %ymm5, %ymm7 +vpsubw %ymm7, %ymm2, %ymm7 +vpmullw %ymm14, %ymm4, %ymm4 +vpsubw %ymm4, %ymm9, %ymm9 +vpmullw %ymm12, %ymm4, %ymm2 +vpaddw %ymm2, %ymm9, %ymm2 +vpmullw %ymm12, %ymm2, %ymm2 +vpsubw %ymm2, %ymm7, %ymm2 +vpmullw %ymm14, %ymm2, %ymm2 +vpsubw %ymm6, %ymm2, %ymm2 +vpsrlw $3, %ymm2, %ymm2 +vpsubw %ymm8, %ymm2, %ymm2 +vpsubw %ymm2, %ymm8, %ymm8 +vpsubw %ymm8, %ymm6, %ymm6 +vpmullw %ymm13, %ymm2, %ymm2 +vpsubw %ymm2, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_4_3_1(%rip), %ymm4, %ymm7 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $139, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm3 +vpor %ymm3, %ymm4, %ymm4 +vpaddw 2112(%rsp), %ymm11, %ymm11 +vpaddw %ymm4, %ymm11, %ymm11 +vmovdqa %xmm7, 2112(%rsp) +vpshufb shuf48_16(%rip), %ymm2, %ymm2 +vpand mask3_5_4_3_1(%rip), %ymm2, %ymm7 +vpand mask5_3_5_3(%rip), %ymm2, %ymm2 +vpermq $139, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm3 +vpor %ymm3, %ymm2, %ymm2 +vpaddw 2368(%rsp), %ymm6, %ymm6 +vpaddw %ymm2, %ymm6, %ymm6 +vmovdqa %xmm7, 2368(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_4_3_1(%rip), %ymm5, %ymm7 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $139, %ymm7, %ymm7 +vpand mask_keephigh(%rip), %ymm7, %ymm3 +vpor %ymm3, %ymm5, %ymm5 +vpaddw 2624(%rsp), %ymm9, %ymm9 +vpaddw %ymm5, %ymm9, %ymm9 +vmovdqa %xmm7, 2624(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %xmm11, 240(%rdi) +vextracti128 $1, %ymm11, %xmm11 +vmovq %xmm11, 256(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %xmm6, 592(%rdi) +vextracti128 $1, %ymm6, %xmm6 +vmovq %xmm6, 608(%rdi) +vpand mask_mod8192(%rip), %ymm9, %ymm9 +vmovdqu %xmm9, 944(%rdi) +vextracti128 $1, %ymm9, %xmm9 +vmovq 
%xmm9, 960(%rdi) +vpand mask_mod8192(%rip), %ymm8, %ymm8 +vmovdqu %xmm8, 1296(%rdi) +vextracti128 $1, %ymm8, %xmm8 +vmovq %xmm8, 1312(%rdi) +vmovdqa 96(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm2 +vpunpckhwd const0(%rip), %ymm5, %ymm4 +vpslld $1, %ymm2, %ymm2 +vpslld $1, %ymm4, %ymm4 +vmovdqa 352(%rsp), %ymm8 +vpunpcklwd const0(%rip), %ymm8, %ymm9 +vpunpckhwd const0(%rip), %ymm8, %ymm8 +vmovdqa 608(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm9, %ymm7 +vpaddd %ymm6, %ymm8, %ymm3 +vpsubd %ymm2, %ymm7, %ymm7 +vpsubd %ymm4, %ymm3, %ymm3 +vpsubd %ymm11, %ymm9, %ymm11 +vpsubd %ymm6, %ymm8, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1632(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm8 +vpunpckhwd const0(%rip), %ymm11, %ymm9 +vpslld $1, %ymm8, %ymm8 +vpslld $1, %ymm9, %ymm9 +vpsubd %ymm8, %ymm7, %ymm7 +vpsubd %ymm9, %ymm3, %ymm3 +vpsrld $1, %ymm7, %ymm7 +vpsrld $1, %ymm3, %ymm3 +vpand mask32_to_16(%rip), %ymm7, %ymm7 +vpand mask32_to_16(%rip), %ymm3, %ymm3 +vpackusdw %ymm3, %ymm7, %ymm3 +vmovdqa 864(%rsp), %ymm7 +vpaddw 1120(%rsp), %ymm7, %ymm9 +vpsubw 1120(%rsp), %ymm7, %ymm7 +vpsrlw $2, %ymm7, %ymm7 +vpsubw %ymm6, %ymm7, %ymm7 +vpmullw %ymm14, %ymm7, %ymm7 +vpsllw $1, %ymm5, %ymm8 +vpsubw %ymm8, %ymm9, %ymm8 +vpsllw $7, %ymm11, %ymm9 +vpsubw %ymm9, %ymm8, %ymm9 +vpsrlw $3, %ymm9, %ymm9 +vpsubw %ymm3, %ymm9, %ymm9 +vmovdqa 1376(%rsp), %ymm8 +vpsubw %ymm5, %ymm8, %ymm8 +vpmullw %ymm15, %ymm11, %ymm4 +vpsubw %ymm4, %ymm8, %ymm4 +vpmullw %ymm14, %ymm9, %ymm9 +vpsubw %ymm9, %ymm3, %ymm3 +vpmullw %ymm12, %ymm9, %ymm8 +vpaddw %ymm8, %ymm3, %ymm8 +vpmullw %ymm12, %ymm8, %ymm8 +vpsubw %ymm8, %ymm4, %ymm8 +vpmullw %ymm14, %ymm8, %ymm8 +vpsubw %ymm6, %ymm8, %ymm8 +vpsrlw $3, %ymm8, %ymm8 +vpsubw %ymm7, %ymm8, %ymm8 +vpsubw %ymm8, %ymm7, %ymm7 +vpsubw %ymm7, %ymm6, 
%ymm6 +vpmullw %ymm13, %ymm8, %ymm8 +vpsubw %ymm8, %ymm6, %ymm6 +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_4_3_1(%rip), %ymm9, %ymm4 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $139, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm2 +vpor %ymm2, %ymm9, %ymm9 +vpaddw 2144(%rsp), %ymm5, %ymm5 +vpaddw %ymm9, %ymm5, %ymm5 +vmovdqa %xmm4, 2144(%rsp) +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_4_3_1(%rip), %ymm8, %ymm4 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $139, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm2 +vpor %ymm2, %ymm8, %ymm8 +vpaddw 2400(%rsp), %ymm6, %ymm6 +vpaddw %ymm8, %ymm6, %ymm6 +vmovdqa %xmm4, 2400(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_4_3_1(%rip), %ymm11, %ymm4 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $139, %ymm4, %ymm4 +vpand mask_keephigh(%rip), %ymm4, %ymm2 +vpor %ymm2, %ymm11, %ymm11 +vpaddw 2656(%rsp), %ymm3, %ymm3 +vpaddw %ymm11, %ymm3, %ymm3 +vmovdqa %xmm4, 2656(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %xmm5, 328(%rdi) +vextracti128 $1, %ymm5, %xmm5 +vmovq %xmm5, 344(%rdi) +vpshufb shufmin1_mask3(%rip), %ymm5, %ymm5 +vmovdqa %xmm5, 1792(%rsp) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %xmm6, 680(%rdi) +vextracti128 $1, %ymm6, %xmm6 +vmovq %xmm6, 696(%rdi) +vpshufb shufmin1_mask3(%rip), %ymm6, %ymm6 +vmovdqa %xmm6, 1824(%rsp) +vpand mask_mod8192(%rip), %ymm3, %ymm3 +vmovdqu %xmm3, 1032(%rdi) +vextracti128 $1, %ymm3, %xmm3 +vmovq %xmm3, 1048(%rdi) +vpshufb shufmin1_mask3(%rip), %ymm3, %ymm3 +vmovdqa %xmm3, 1856(%rsp) +vpand mask_mod8192(%rip), %ymm7, %ymm7 +vmovdqu %xmm7, 1384(%rdi) +vextracti128 $1, %ymm7, %xmm7 +vpextrw $0, %xmm7, 1400(%rdi) +vpshufb shufmin1_mask3(%rip), %ymm7, %ymm7 +vmovdqa %xmm7, 1888(%rsp) +vmovdqa 128(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm8 +vpunpckhwd const0(%rip), %ymm11, %ymm9 +vpslld $1, %ymm8, %ymm8 +vpslld $1, %ymm9, %ymm9 +vmovdqa 384(%rsp), %ymm7 +vpunpcklwd const0(%rip), %ymm7, %ymm3 +vpunpckhwd 
const0(%rip), %ymm7, %ymm7 +vmovdqa 640(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm3, %ymm4 +vpaddd %ymm6, %ymm7, %ymm2 +vpsubd %ymm8, %ymm4, %ymm4 +vpsubd %ymm9, %ymm2, %ymm2 +vpsubd %ymm5, %ymm3, %ymm5 +vpsubd %ymm6, %ymm7, %ymm6 +vpsrld $1, %ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1664(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm7 +vpunpckhwd const0(%rip), %ymm5, %ymm3 +vpslld $1, %ymm7, %ymm7 +vpslld $1, %ymm3, %ymm3 +vpsubd %ymm7, %ymm4, %ymm4 +vpsubd %ymm3, %ymm2, %ymm2 +vpsrld $1, %ymm4, %ymm4 +vpsrld $1, %ymm2, %ymm2 +vpand mask32_to_16(%rip), %ymm4, %ymm4 +vpand mask32_to_16(%rip), %ymm2, %ymm2 +vpackusdw %ymm2, %ymm4, %ymm2 +vmovdqa 896(%rsp), %ymm4 +vpaddw 1152(%rsp), %ymm4, %ymm3 +vpsubw 1152(%rsp), %ymm4, %ymm4 +vpsrlw $2, %ymm4, %ymm4 +vpsubw %ymm6, %ymm4, %ymm4 +vpmullw %ymm14, %ymm4, %ymm4 +vpsllw $1, %ymm11, %ymm7 +vpsubw %ymm7, %ymm3, %ymm7 +vpsllw $7, %ymm5, %ymm3 +vpsubw %ymm3, %ymm7, %ymm3 +vpsrlw $3, %ymm3, %ymm3 +vpsubw %ymm2, %ymm3, %ymm3 +vmovdqa 1408(%rsp), %ymm7 +vpsubw %ymm11, %ymm7, %ymm7 +vpmullw %ymm15, %ymm5, %ymm9 +vpsubw %ymm9, %ymm7, %ymm9 +vpmullw %ymm14, %ymm3, %ymm3 +vpsubw %ymm3, %ymm2, %ymm2 +vpmullw %ymm12, %ymm3, %ymm7 +vpaddw %ymm7, %ymm2, %ymm7 +vpmullw %ymm12, %ymm7, %ymm7 +vpsubw %ymm7, %ymm9, %ymm7 +vpmullw %ymm14, %ymm7, %ymm7 +vpsubw %ymm6, %ymm7, %ymm7 +vpsrlw $3, %ymm7, %ymm7 +vpsubw %ymm4, %ymm7, %ymm7 +vpsubw %ymm7, %ymm4, %ymm4 +vpsubw %ymm4, %ymm6, %ymm6 +vpmullw %ymm13, %ymm7, %ymm7 +vpsubw %ymm7, %ymm6, %ymm6 +vmovdqu 416(%rdi), %ymm9 +vmovdqu 768(%rdi), %ymm8 +vmovdqu 1120(%rdi), %ymm10 +vpaddw %ymm11, %ymm9, %ymm11 +vpaddw %ymm6, %ymm8, %ymm6 +vpaddw %ymm2, %ymm10, %ymm2 +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_4_3_1(%rip), %ymm4, %ymm10 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $139, %ymm10, %ymm10 
+vpand mask_keephigh(%rip), %ymm10, %ymm8 +vpor %ymm8, %ymm4, %ymm4 +vmovdqu 64(%rdi), %ymm8 +vpaddw 1920(%rsp), %ymm8, %ymm8 +vpaddw %ymm4, %ymm8, %ymm8 +vpand mask_mod8192(%rip), %ymm8, %ymm8 +vmovdqu %xmm8, 64(%rdi) +vextracti128 $1, %ymm8, %xmm8 +vmovq %xmm8, 80(%rdi) +vmovdqa %xmm10, 1920(%rsp) +vpshufb shuf48_16(%rip), %ymm3, %ymm3 +vpand mask3_5_4_3_1(%rip), %ymm3, %ymm10 +vpand mask5_3_5_3(%rip), %ymm3, %ymm3 +vpermq $139, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm8 +vpor %ymm8, %ymm3, %ymm3 +vpaddw 2176(%rsp), %ymm11, %ymm11 +vpaddw %ymm3, %ymm11, %ymm11 +vmovdqa %xmm10, 2176(%rsp) +vpshufb shuf48_16(%rip), %ymm7, %ymm7 +vpand mask3_5_4_3_1(%rip), %ymm7, %ymm10 +vpand mask5_3_5_3(%rip), %ymm7, %ymm7 +vpermq $139, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm8 +vpor %ymm8, %ymm7, %ymm7 +vpaddw 2432(%rsp), %ymm6, %ymm6 +vpaddw %ymm7, %ymm6, %ymm6 +vmovdqa %xmm10, 2432(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_4_3_1(%rip), %ymm5, %ymm10 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $139, %ymm10, %ymm10 +vpand mask_keephigh(%rip), %ymm10, %ymm8 +vpor %ymm8, %ymm5, %ymm5 +vpaddw 2688(%rsp), %ymm2, %ymm2 +vpaddw %ymm5, %ymm2, %ymm2 +vmovdqa %xmm10, 2688(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %xmm11, 416(%rdi) +vextracti128 $1, %ymm11, %xmm11 +vmovq %xmm11, 432(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %xmm6, 768(%rdi) +vextracti128 $1, %ymm6, %xmm6 +vmovq %xmm6, 784(%rdi) +vpand mask_mod8192(%rip), %ymm2, %ymm2 +vmovdqu %xmm2, 1120(%rdi) +vextracti128 $1, %ymm2, %xmm2 +vmovq %xmm2, 1136(%rdi) +vmovdqa 160(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm7 +vpunpckhwd const0(%rip), %ymm5, %ymm3 +vpslld $1, %ymm7, %ymm7 +vpslld $1, %ymm3, %ymm3 +vmovdqa 416(%rsp), %ymm4 +vpunpcklwd const0(%rip), %ymm4, %ymm2 +vpunpckhwd const0(%rip), %ymm4, %ymm4 +vmovdqa 672(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm2, %ymm10 +vpaddd 
%ymm6, %ymm4, %ymm8 +vpsubd %ymm7, %ymm10, %ymm10 +vpsubd %ymm3, %ymm8, %ymm8 +vpsubd %ymm11, %ymm2, %ymm11 +vpsubd %ymm6, %ymm4, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm11, %ymm6 +vmovdqa 1696(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm4 +vpunpckhwd const0(%rip), %ymm11, %ymm2 +vpslld $1, %ymm4, %ymm4 +vpslld $1, %ymm2, %ymm2 +vpsubd %ymm4, %ymm10, %ymm10 +vpsubd %ymm2, %ymm8, %ymm8 +vpsrld $1, %ymm10, %ymm10 +vpsrld $1, %ymm8, %ymm8 +vpand mask32_to_16(%rip), %ymm10, %ymm10 +vpand mask32_to_16(%rip), %ymm8, %ymm8 +vpackusdw %ymm8, %ymm10, %ymm8 +vmovdqa 928(%rsp), %ymm10 +vpaddw 1184(%rsp), %ymm10, %ymm2 +vpsubw 1184(%rsp), %ymm10, %ymm10 +vpsrlw $2, %ymm10, %ymm10 +vpsubw %ymm6, %ymm10, %ymm10 +vpmullw %ymm14, %ymm10, %ymm10 +vpsllw $1, %ymm5, %ymm4 +vpsubw %ymm4, %ymm2, %ymm4 +vpsllw $7, %ymm11, %ymm2 +vpsubw %ymm2, %ymm4, %ymm2 +vpsrlw $3, %ymm2, %ymm2 +vpsubw %ymm8, %ymm2, %ymm2 +vmovdqa 1440(%rsp), %ymm4 +vpsubw %ymm5, %ymm4, %ymm4 +vpmullw %ymm15, %ymm11, %ymm3 +vpsubw %ymm3, %ymm4, %ymm3 +vpmullw %ymm14, %ymm2, %ymm2 +vpsubw %ymm2, %ymm8, %ymm8 +vpmullw %ymm12, %ymm2, %ymm4 +vpaddw %ymm4, %ymm8, %ymm4 +vpmullw %ymm12, %ymm4, %ymm4 +vpsubw %ymm4, %ymm3, %ymm4 +vpmullw %ymm14, %ymm4, %ymm4 +vpsubw %ymm6, %ymm4, %ymm4 +vpsrlw $3, %ymm4, %ymm4 +vpsubw %ymm10, %ymm4, %ymm4 +vpsubw %ymm4, %ymm10, %ymm10 +vpsubw %ymm10, %ymm6, %ymm6 +vpmullw %ymm13, %ymm4, %ymm4 +vpsubw %ymm4, %ymm6, %ymm6 +vmovdqu 504(%rdi), %ymm3 +vmovdqu 856(%rdi), %ymm7 +vmovdqu 1208(%rdi), %ymm9 +vpaddw %ymm5, %ymm3, %ymm5 +vpaddw %ymm6, %ymm7, %ymm6 +vpaddw %ymm8, %ymm9, %ymm8 +vpshufb shuf48_16(%rip), %ymm10, %ymm10 +vpand mask3_5_4_3_1(%rip), %ymm10, %ymm9 +vpand mask5_3_5_3(%rip), %ymm10, %ymm10 +vpermq $139, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm7 +vpor %ymm7, %ymm10, %ymm10 +vmovdqu 152(%rdi), %ymm7 +vpaddw 1952(%rsp), %ymm7, %ymm7 +vpaddw 
%ymm10, %ymm7, %ymm7 +vpand mask_mod8192(%rip), %ymm7, %ymm7 +vmovdqu %xmm7, 152(%rdi) +vextracti128 $1, %ymm7, %xmm7 +vmovq %xmm7, 168(%rdi) +vmovdqa %xmm9, 1952(%rsp) +vpshufb shuf48_16(%rip), %ymm2, %ymm2 +vpand mask3_5_4_3_1(%rip), %ymm2, %ymm9 +vpand mask5_3_5_3(%rip), %ymm2, %ymm2 +vpermq $139, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm7 +vpor %ymm7, %ymm2, %ymm2 +vpaddw 2208(%rsp), %ymm5, %ymm5 +vpaddw %ymm2, %ymm5, %ymm5 +vmovdqa %xmm9, 2208(%rsp) +vpshufb shuf48_16(%rip), %ymm4, %ymm4 +vpand mask3_5_4_3_1(%rip), %ymm4, %ymm9 +vpand mask5_3_5_3(%rip), %ymm4, %ymm4 +vpermq $139, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm7 +vpor %ymm7, %ymm4, %ymm4 +vpaddw 2464(%rsp), %ymm6, %ymm6 +vpaddw %ymm4, %ymm6, %ymm6 +vmovdqa %xmm9, 2464(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_4_3_1(%rip), %ymm11, %ymm9 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $139, %ymm9, %ymm9 +vpand mask_keephigh(%rip), %ymm9, %ymm7 +vpor %ymm7, %ymm11, %ymm11 +vpaddw 2720(%rsp), %ymm8, %ymm8 +vpaddw %ymm11, %ymm8, %ymm8 +vmovdqa %xmm9, 2720(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %xmm5, 504(%rdi) +vextracti128 $1, %ymm5, %xmm5 +vmovq %xmm5, 520(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %xmm6, 856(%rdi) +vextracti128 $1, %ymm6, %xmm6 +vmovq %xmm6, 872(%rdi) +vpand mask_mod8192(%rip), %ymm8, %ymm8 +vmovdqu %xmm8, 1208(%rdi) +vextracti128 $1, %ymm8, %xmm8 +vmovq %xmm8, 1224(%rdi) +vmovdqa 192(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm4 +vpunpckhwd const0(%rip), %ymm11, %ymm2 +vpslld $1, %ymm4, %ymm4 +vpslld $1, %ymm2, %ymm2 +vmovdqa 448(%rsp), %ymm10 +vpunpcklwd const0(%rip), %ymm10, %ymm8 +vpunpckhwd const0(%rip), %ymm10, %ymm10 +vmovdqa 704(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm5 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm5, %ymm8, %ymm9 +vpaddd %ymm6, %ymm10, %ymm7 +vpsubd %ymm4, %ymm9, %ymm9 +vpsubd %ymm2, %ymm7, %ymm7 +vpsubd %ymm5, %ymm8, %ymm5 +vpsubd %ymm6, %ymm10, %ymm6 +vpsrld $1, 
%ymm5, %ymm5 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm5, %ymm5 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, %ymm5, %ymm6 +vmovdqa 1728(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm10 +vpunpckhwd const0(%rip), %ymm5, %ymm8 +vpslld $1, %ymm10, %ymm10 +vpslld $1, %ymm8, %ymm8 +vpsubd %ymm10, %ymm9, %ymm9 +vpsubd %ymm8, %ymm7, %ymm7 +vpsrld $1, %ymm9, %ymm9 +vpsrld $1, %ymm7, %ymm7 +vpand mask32_to_16(%rip), %ymm9, %ymm9 +vpand mask32_to_16(%rip), %ymm7, %ymm7 +vpackusdw %ymm7, %ymm9, %ymm7 +vmovdqa 960(%rsp), %ymm9 +vpaddw 1216(%rsp), %ymm9, %ymm8 +vpsubw 1216(%rsp), %ymm9, %ymm9 +vpsrlw $2, %ymm9, %ymm9 +vpsubw %ymm6, %ymm9, %ymm9 +vpmullw %ymm14, %ymm9, %ymm9 +vpsllw $1, %ymm11, %ymm10 +vpsubw %ymm10, %ymm8, %ymm10 +vpsllw $7, %ymm5, %ymm8 +vpsubw %ymm8, %ymm10, %ymm8 +vpsrlw $3, %ymm8, %ymm8 +vpsubw %ymm7, %ymm8, %ymm8 +vmovdqa 1472(%rsp), %ymm10 +vpsubw %ymm11, %ymm10, %ymm10 +vpmullw %ymm15, %ymm5, %ymm2 +vpsubw %ymm2, %ymm10, %ymm2 +vpmullw %ymm14, %ymm8, %ymm8 +vpsubw %ymm8, %ymm7, %ymm7 +vpmullw %ymm12, %ymm8, %ymm10 +vpaddw %ymm10, %ymm7, %ymm10 +vpmullw %ymm12, %ymm10, %ymm10 +vpsubw %ymm10, %ymm2, %ymm10 +vpmullw %ymm14, %ymm10, %ymm10 +vpsubw %ymm6, %ymm10, %ymm10 +vpsrlw $3, %ymm10, %ymm10 +vpsubw %ymm9, %ymm10, %ymm10 +vpsubw %ymm10, %ymm9, %ymm9 +vpsubw %ymm9, %ymm6, %ymm6 +vpmullw %ymm13, %ymm10, %ymm10 +vpsubw %ymm10, %ymm6, %ymm6 +vmovdqu 592(%rdi), %ymm2 +vmovdqu 944(%rdi), %ymm4 +vmovdqu 1296(%rdi), %ymm3 +vpaddw %ymm11, %ymm2, %ymm11 +vpaddw %ymm6, %ymm4, %ymm6 +vpaddw %ymm7, %ymm3, %ymm7 +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_4_3_1(%rip), %ymm9, %ymm3 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $139, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm4 +vpor %ymm4, %ymm9, %ymm9 +vmovdqu 240(%rdi), %ymm4 +vpaddw 1984(%rsp), %ymm4, %ymm4 +vpaddw %ymm9, %ymm4, %ymm4 +vpand mask_mod8192(%rip), %ymm4, %ymm4 +vmovdqu %xmm4, 240(%rdi) +vextracti128 $1, %ymm4, %xmm4 +vmovq %xmm4, 256(%rdi) +vmovdqa 
%xmm3, 1984(%rsp) +vpshufb shuf48_16(%rip), %ymm8, %ymm8 +vpand mask3_5_4_3_1(%rip), %ymm8, %ymm3 +vpand mask5_3_5_3(%rip), %ymm8, %ymm8 +vpermq $139, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm4 +vpor %ymm4, %ymm8, %ymm8 +vpaddw 2240(%rsp), %ymm11, %ymm11 +vpaddw %ymm8, %ymm11, %ymm11 +vmovdqa %xmm3, 2240(%rsp) +vpshufb shuf48_16(%rip), %ymm10, %ymm10 +vpand mask3_5_4_3_1(%rip), %ymm10, %ymm3 +vpand mask5_3_5_3(%rip), %ymm10, %ymm10 +vpermq $139, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm4 +vpor %ymm4, %ymm10, %ymm10 +vpaddw 2496(%rsp), %ymm6, %ymm6 +vpaddw %ymm10, %ymm6, %ymm6 +vmovdqa %xmm3, 2496(%rsp) +vpshufb shuf48_16(%rip), %ymm5, %ymm5 +vpand mask3_5_4_3_1(%rip), %ymm5, %ymm3 +vpand mask5_3_5_3(%rip), %ymm5, %ymm5 +vpermq $139, %ymm3, %ymm3 +vpand mask_keephigh(%rip), %ymm3, %ymm4 +vpor %ymm4, %ymm5, %ymm5 +vpaddw 2752(%rsp), %ymm7, %ymm7 +vpaddw %ymm5, %ymm7, %ymm7 +vmovdqa %xmm3, 2752(%rsp) +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %xmm11, 592(%rdi) +vextracti128 $1, %ymm11, %xmm11 +vmovq %xmm11, 608(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %xmm6, 944(%rdi) +vextracti128 $1, %ymm6, %xmm6 +vmovq %xmm6, 960(%rdi) +vpand mask_mod8192(%rip), %ymm7, %ymm7 +vmovdqu %xmm7, 1296(%rdi) +vextracti128 $1, %ymm7, %xmm7 +vmovq %xmm7, 1312(%rdi) +vmovdqa 224(%rsp), %ymm5 +vpunpcklwd const0(%rip), %ymm5, %ymm10 +vpunpckhwd const0(%rip), %ymm5, %ymm8 +vpslld $1, %ymm10, %ymm10 +vpslld $1, %ymm8, %ymm8 +vmovdqa 480(%rsp), %ymm9 +vpunpcklwd const0(%rip), %ymm9, %ymm7 +vpunpckhwd const0(%rip), %ymm9, %ymm9 +vmovdqa 736(%rsp), %ymm6 +vpunpcklwd const0(%rip), %ymm6, %ymm11 +vpunpckhwd const0(%rip), %ymm6, %ymm6 +vpaddd %ymm11, %ymm7, %ymm3 +vpaddd %ymm6, %ymm9, %ymm4 +vpsubd %ymm10, %ymm3, %ymm3 +vpsubd %ymm8, %ymm4, %ymm4 +vpsubd %ymm11, %ymm7, %ymm11 +vpsubd %ymm6, %ymm9, %ymm6 +vpsrld $1, %ymm11, %ymm11 +vpsrld $1, %ymm6, %ymm6 +vpand mask32_to_16(%rip), %ymm11, %ymm11 +vpand mask32_to_16(%rip), %ymm6, %ymm6 +vpackusdw %ymm6, 
%ymm11, %ymm6 +vmovdqa 1760(%rsp), %ymm11 +vpunpcklwd const0(%rip), %ymm11, %ymm9 +vpunpckhwd const0(%rip), %ymm11, %ymm7 +vpslld $1, %ymm9, %ymm9 +vpslld $1, %ymm7, %ymm7 +vpsubd %ymm9, %ymm3, %ymm3 +vpsubd %ymm7, %ymm4, %ymm4 +vpsrld $1, %ymm3, %ymm3 +vpsrld $1, %ymm4, %ymm4 +vpand mask32_to_16(%rip), %ymm3, %ymm3 +vpand mask32_to_16(%rip), %ymm4, %ymm4 +vpackusdw %ymm4, %ymm3, %ymm4 +vmovdqa 992(%rsp), %ymm3 +vpaddw 1248(%rsp), %ymm3, %ymm7 +vpsubw 1248(%rsp), %ymm3, %ymm3 +vpsrlw $2, %ymm3, %ymm3 +vpsubw %ymm6, %ymm3, %ymm3 +vpmullw %ymm14, %ymm3, %ymm3 +vpsllw $1, %ymm5, %ymm9 +vpsubw %ymm9, %ymm7, %ymm9 +vpsllw $7, %ymm11, %ymm7 +vpsubw %ymm7, %ymm9, %ymm7 +vpsrlw $3, %ymm7, %ymm7 +vpsubw %ymm4, %ymm7, %ymm7 +vmovdqa 1504(%rsp), %ymm9 +vpsubw %ymm5, %ymm9, %ymm9 +vpmullw %ymm15, %ymm11, %ymm8 +vpsubw %ymm8, %ymm9, %ymm8 +vpmullw %ymm14, %ymm7, %ymm7 +vpsubw %ymm7, %ymm4, %ymm4 +vpmullw %ymm12, %ymm7, %ymm9 +vpaddw %ymm9, %ymm4, %ymm9 +vpmullw %ymm12, %ymm9, %ymm9 +vpsubw %ymm9, %ymm8, %ymm9 +vpmullw %ymm14, %ymm9, %ymm9 +vpsubw %ymm6, %ymm9, %ymm9 +vpsrlw $3, %ymm9, %ymm9 +vpsubw %ymm3, %ymm9, %ymm9 +vpsubw %ymm9, %ymm3, %ymm3 +vpsubw %ymm3, %ymm6, %ymm6 +vpmullw %ymm13, %ymm9, %ymm9 +vpsubw %ymm9, %ymm6, %ymm6 +vextracti128 $1, %ymm4, %xmm8 +vpshufb shufmin1_mask3(%rip), %ymm8, %ymm8 +vmovdqa %ymm8, 2816(%rsp) +vextracti128 $1, %ymm3, %xmm8 +vpshufb shufmin1_mask3(%rip), %ymm8, %ymm8 +vmovdqa %ymm8, 2848(%rsp) +vextracti128 $1, %ymm7, %xmm8 +vpshufb shufmin1_mask3(%rip), %ymm8, %ymm8 +vmovdqa %ymm8, 2880(%rsp) +vmovdqu 680(%rdi), %ymm8 +vmovdqu 1032(%rdi), %ymm10 +vmovdqu 1384(%rdi), %ymm2 +vpaddw %ymm5, %ymm8, %ymm5 +vpaddw %ymm6, %ymm10, %ymm6 +vpaddw %ymm4, %ymm2, %ymm4 +vpshufb shuf48_16(%rip), %ymm3, %ymm3 +vpand mask3_5_4_3_1(%rip), %ymm3, %ymm2 +vpand mask5_3_5_3(%rip), %ymm3, %ymm3 +vpermq $139, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm10 +vpor %ymm10, %ymm3, %ymm3 +vmovdqu 328(%rdi), %ymm10 +vpaddw 2016(%rsp), %ymm10, %ymm10 +vpaddw 
%ymm3, %ymm10, %ymm10 +vpand mask_mod8192(%rip), %ymm10, %ymm10 +vmovdqu %xmm10, 328(%rdi) +vextracti128 $1, %ymm10, %xmm10 +vmovq %xmm10, 344(%rdi) +vpshufb shufmin1_mask3(%rip), %ymm10, %ymm10 +vmovdqa %xmm10, 1792(%rsp) +vmovdqa %xmm2, 2016(%rsp) +vpshufb shuf48_16(%rip), %ymm7, %ymm7 +vpand mask3_5_4_3_1(%rip), %ymm7, %ymm2 +vpand mask5_3_5_3(%rip), %ymm7, %ymm7 +vpermq $139, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm10 +vpor %ymm10, %ymm7, %ymm7 +vpaddw 2272(%rsp), %ymm5, %ymm5 +vpaddw %ymm7, %ymm5, %ymm5 +vmovdqa %xmm2, 2272(%rsp) +vpshufb shuf48_16(%rip), %ymm9, %ymm9 +vpand mask3_5_4_3_1(%rip), %ymm9, %ymm2 +vpand mask5_3_5_3(%rip), %ymm9, %ymm9 +vpermq $139, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm10 +vpor %ymm10, %ymm9, %ymm9 +vpaddw 2528(%rsp), %ymm6, %ymm6 +vpaddw %ymm9, %ymm6, %ymm6 +vmovdqa %xmm2, 2528(%rsp) +vpshufb shuf48_16(%rip), %ymm11, %ymm11 +vpand mask3_5_4_3_1(%rip), %ymm11, %ymm2 +vpand mask5_3_5_3(%rip), %ymm11, %ymm11 +vpermq $139, %ymm2, %ymm2 +vpand mask_keephigh(%rip), %ymm2, %ymm10 +vpor %ymm10, %ymm11, %ymm11 +vpaddw 2784(%rsp), %ymm4, %ymm4 +vpaddw %ymm11, %ymm4, %ymm4 +vmovdqa %xmm2, 2784(%rsp) +vpand mask_mod8192(%rip), %ymm5, %ymm5 +vmovdqu %xmm5, 680(%rdi) +vextracti128 $1, %ymm5, %xmm5 +vmovq %xmm5, 696(%rdi) +vpand mask_mod8192(%rip), %ymm6, %ymm6 +vmovdqu %xmm6, 1032(%rdi) +vextracti128 $1, %ymm6, %xmm6 +vmovq %xmm6, 1048(%rdi) +vpand mask_mod8192(%rip), %ymm4, %ymm4 +vmovdqu %xmm4, 1384(%rdi) +vextracti128 $1, %ymm4, %xmm4 +vpextrw $0, %xmm4, 1400(%rdi) +vmovdqu 0(%rdi), %ymm11 +vpaddw 1888(%rsp), %ymm11, %ymm11 +vpaddw 2816(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 0(%rdi) +vmovdqu 352(%rdi), %ymm11 +vpaddw 2528(%rsp), %ymm11, %ymm11 +vpaddw 2848(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 352(%rdi) +vmovdqu 704(%rdi), %ymm11 +vpaddw 2784(%rsp), %ymm11, %ymm11 +vpaddw 2880(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, 
%ymm11 +vmovdqu %ymm11, 704(%rdi) +vmovdqu 88(%rdi), %ymm11 +vpaddw 2048(%rsp), %ymm11, %ymm11 +vpaddw 1920(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 88(%rdi) +vmovdqu 440(%rdi), %ymm11 +vpaddw 2304(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 440(%rdi) +vmovdqu 792(%rdi), %ymm11 +vpaddw 2560(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 792(%rdi) +vmovdqu 176(%rdi), %ymm11 +vpaddw 2080(%rsp), %ymm11, %ymm11 +vpaddw 1952(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 176(%rdi) +vmovdqu 528(%rdi), %ymm11 +vpaddw 2336(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 528(%rdi) +vmovdqu 880(%rdi), %ymm11 +vpaddw 2592(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 880(%rdi) +vmovdqu 264(%rdi), %ymm11 +vpaddw 2112(%rsp), %ymm11, %ymm11 +vpaddw 1984(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 264(%rdi) +vmovdqu 616(%rdi), %ymm11 +vpaddw 2368(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 616(%rdi) +vmovdqu 968(%rdi), %ymm11 +vpaddw 2624(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 968(%rdi) +vmovdqu 352(%rdi), %ymm11 +vpaddw 2144(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 352(%rdi) +vmovdqu 704(%rdi), %ymm11 +vpaddw 2400(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 704(%rdi) +vmovdqu 1056(%rdi), %ymm11 +vpaddw 2656(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 1056(%rdi) +vmovdqu 440(%rdi), %ymm11 +vpaddw 2176(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 440(%rdi) +vmovdqu 792(%rdi), %ymm11 +vpaddw 2432(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 792(%rdi) +vmovdqu 1144(%rdi), 
%ymm11 +vpaddw 2688(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 1144(%rdi) +vmovdqu 528(%rdi), %ymm11 +vpaddw 2208(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 528(%rdi) +vmovdqu 880(%rdi), %ymm11 +vpaddw 2464(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 880(%rdi) +vmovdqu 1232(%rdi), %ymm11 +vpaddw 2720(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 1232(%rdi) +vmovdqu 616(%rdi), %ymm11 +vpaddw 2240(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 616(%rdi) +vmovdqu 968(%rdi), %ymm11 +vpaddw 2496(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 968(%rdi) +vmovdqu 1320(%rdi), %ymm11 +vpaddw 2752(%rsp), %ymm11, %ymm11 +vpand mask_mod8192(%rip), %ymm11, %ymm11 +vmovdqu %ymm11, 1320(%rdi) +mov %r8, %rsp +pop %r12 +pop %rbp +ret +.cfi_endproc + +#endif diff --git a/src/crypto/hrss/hrss.c b/src/crypto/hrss/hrss.c new file mode 100644 index 00000000..dd3f979c --- /dev/null +++ b/src/crypto/hrss/hrss.c @@ -0,0 +1,2237 @@ +/* Copyright (c) 2018, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ + +#include <openssl/hrss.h> + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> + +#include <openssl/bn.h> +#include <openssl/cpu.h> +#include <openssl/hmac.h> +#include <openssl/mem.h> +#include <openssl/sha.h> + +#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) +#include <emmintrin.h> +#endif + +#if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \ + (defined(__ARM_NEON__) || defined(__ARM_NEON)) +#include <arm_neon.h> +#endif + +#if defined(_MSC_VER) +#define RESTRICT +#else +#define RESTRICT restrict +#endif + +#include "../internal.h" +#include "internal.h" + +// This is an implementation of [HRSS], but with a KEM transformation based on +// [SXY]. The primary references are: + +// HRSS: https://eprint.iacr.org/2017/667.pdf +// HRSSNIST: +// https://csrc.nist.gov/CSRC/media/Projects/Post-Quantum-Cryptography/documents/round-1/submissions/NTRU_HRSS_KEM.zip +// SXY: https://eprint.iacr.org/2017/1005.pdf +// NTRUTN14: +// https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf + + +// Vector operations. +// +// A couple of functions in this file can use vector operations to meaningful +// effect. If we're building for a target that has a supported vector unit, +// |HRSS_HAVE_VECTOR_UNIT| will be defined and |vec_t| will be typedefed to a +// 128-bit vector. The following functions abstract over the differences between +// NEON and SSE2 for implementing some vector operations. + +// TODO: MSVC can likely also be made to work with vector operations. +#if (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) && \ + (defined(__clang__) || !defined(_MSC_VER)) + +#define HRSS_HAVE_VECTOR_UNIT +typedef __m128i vec_t; + +// vec_capable returns one iff the current platform supports SSE2. +static int vec_capable(void) { +#if defined(__SSE2__) + return 1; +#else + int has_sse2 = (OPENSSL_ia32cap_P[0] & (1 << 26)) != 0; + return has_sse2; +#endif +} + +// vec_add performs a pair-wise addition of four uint16s from |a| and |b|. 
+static inline vec_t vec_add(vec_t a, vec_t b) { return _mm_add_epi16(a, b); } + +// vec_sub performs a pair-wise subtraction of four uint16s from |a| and |b|. +static inline vec_t vec_sub(vec_t a, vec_t b) { return _mm_sub_epi16(a, b); } + +// vec_mul multiplies each uint16_t in |a| by |b| and returns the resulting +// vector. +static inline vec_t vec_mul(vec_t a, uint16_t b) { + return _mm_mullo_epi16(a, _mm_set1_epi16(b)); +} + +// vec_fma multiplies each uint16_t in |b| by |c|, adds the result to |a|, and +// returns the resulting vector. +static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) { + return _mm_add_epi16(a, _mm_mullo_epi16(b, _mm_set1_epi16(c))); +} + +// vec3_rshift_word right-shifts the 24 uint16_t's in |v| by one uint16. +static inline void vec3_rshift_word(vec_t v[3]) { + // Intel's left and right shifting is backwards compared to the order in + // memory because they're based on little-endian order of words (and not just + // bytes). So the shifts in this function will be backwards from what one + // might expect. + const __m128i carry0 = _mm_srli_si128(v[0], 14); + v[0] = _mm_slli_si128(v[0], 2); + + const __m128i carry1 = _mm_srli_si128(v[1], 14); + v[1] = _mm_slli_si128(v[1], 2); + v[1] |= carry0; + + v[2] = _mm_slli_si128(v[2], 2); + v[2] |= carry1; +} + +// vec4_rshift_word right-shifts the 32 uint16_t's in |v| by one uint16. +static inline void vec4_rshift_word(vec_t v[4]) { + // Intel's left and right shifting is backwards compared to the order in + // memory because they're based on little-endian order of words (and not just + // bytes). So the shifts in this function will be backwards from what one + // might expect. 
+ const __m128i carry0 = _mm_srli_si128(v[0], 14); + v[0] = _mm_slli_si128(v[0], 2); + + const __m128i carry1 = _mm_srli_si128(v[1], 14); + v[1] = _mm_slli_si128(v[1], 2); + v[1] |= carry0; + + const __m128i carry2 = _mm_srli_si128(v[2], 14); + v[2] = _mm_slli_si128(v[2], 2); + v[2] |= carry1; + + v[3] = _mm_slli_si128(v[3], 2); + v[3] |= carry2; +} + +// vec_merge_3_5 takes the final three uint16_t's from |left|, appends the first +// five from |right|, and returns the resulting vector. +static inline vec_t vec_merge_3_5(vec_t left, vec_t right) { + return _mm_srli_si128(left, 10) | _mm_slli_si128(right, 6); +} + +// poly3_vec_lshift1 left-shifts the 768 bits in |a_s|, and in |a_a|, by one +// bit. +static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) { + vec_t carry_s = {0}; + vec_t carry_a = {0}; + + for (int i = 0; i < 6; i++) { + vec_t next_carry_s = _mm_srli_epi64(a_s[i], 63); + a_s[i] = _mm_slli_epi64(a_s[i], 1); + a_s[i] |= _mm_slli_si128(next_carry_s, 8); + a_s[i] |= carry_s; + carry_s = _mm_srli_si128(next_carry_s, 8); + + vec_t next_carry_a = _mm_srli_epi64(a_a[i], 63); + a_a[i] = _mm_slli_epi64(a_a[i], 1); + a_a[i] |= _mm_slli_si128(next_carry_a, 8); + a_a[i] |= carry_a; + carry_a = _mm_srli_si128(next_carry_a, 8); + } +} + +// poly3_vec_rshift1 right-shifts the 768 bits in |a_s|, and in |a_a|, by one +// bit. 
+static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) { + vec_t carry_s = {0}; + vec_t carry_a = {0}; + + for (int i = 5; i >= 0; i--) { + const vec_t next_carry_s = _mm_slli_epi64(a_s[i], 63); + a_s[i] = _mm_srli_epi64(a_s[i], 1); + a_s[i] |= _mm_srli_si128(next_carry_s, 8); + a_s[i] |= carry_s; + carry_s = _mm_slli_si128(next_carry_s, 8); + + const vec_t next_carry_a = _mm_slli_epi64(a_a[i], 63); + a_a[i] = _mm_srli_epi64(a_a[i], 1); + a_a[i] |= _mm_srli_si128(next_carry_a, 8); + a_a[i] |= carry_a; + carry_a = _mm_slli_si128(next_carry_a, 8); + } +} + +// vec_broadcast_bit duplicates the least-significant bit in |a| to all bits in +// a vector and returns the result. +static inline vec_t vec_broadcast_bit(vec_t a) { + return _mm_shuffle_epi32(_mm_srai_epi32(_mm_slli_epi64(a, 63), 31), + 0b01010101); +} + +// vec_broadcast_bit15 duplicates the most-significant bit of the first word in +// |a| to all bits in a vector and returns the result. +static inline vec_t vec_broadcast_bit15(vec_t a) { + return _mm_shuffle_epi32(_mm_srai_epi32(_mm_slli_epi64(a, 63 - 15), 31), + 0b01010101); +} + +// vec_get_word returns the |i|th uint16_t in |v|. (This is a macro because the +// compiler requires that |i| be a compile-time constant.) +#define vec_get_word(v, i) _mm_extract_epi16(v, i) + +#elif (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \ + (defined(__ARM_NEON__) || defined(__ARM_NEON)) + +#define HRSS_HAVE_VECTOR_UNIT +typedef uint16x8_t vec_t; + +// These functions perform the same actions as the SSE2 function of the same +// name, above. 
+ +static int vec_capable(void) { return CRYPTO_is_NEON_capable(); } + +static inline vec_t vec_add(vec_t a, vec_t b) { return a + b; } + +static inline vec_t vec_sub(vec_t a, vec_t b) { return a - b; } + +static inline vec_t vec_mul(vec_t a, uint16_t b) { return vmulq_n_u16(a, b); } + +static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) { + return vmlaq_n_u16(a, b, c); +} + +static inline void vec3_rshift_word(vec_t v[3]) { + const uint16x8_t kZero = {0}; + v[2] = vextq_u16(v[1], v[2], 7); + v[1] = vextq_u16(v[0], v[1], 7); + v[0] = vextq_u16(kZero, v[0], 7); +} + +static inline void vec4_rshift_word(vec_t v[4]) { + const uint16x8_t kZero = {0}; + v[3] = vextq_u16(v[2], v[3], 7); + v[2] = vextq_u16(v[1], v[2], 7); + v[1] = vextq_u16(v[0], v[1], 7); + v[0] = vextq_u16(kZero, v[0], 7); +} + +static inline vec_t vec_merge_3_5(vec_t left, vec_t right) { + return vextq_u16(left, right, 5); +} + +static inline uint16_t vec_get_word(vec_t v, unsigned i) { + return v[i]; +} + +#if !defined(OPENSSL_AARCH64) + +static inline vec_t vec_broadcast_bit(vec_t a) { + a = (vec_t)vshrq_n_s16(((int16x8_t)a) << 15, 15); + return vdupq_lane_u16(vget_low_u16(a), 0); +} + +static inline vec_t vec_broadcast_bit15(vec_t a) { + a = (vec_t)vshrq_n_s16((int16x8_t)a, 15); + return vdupq_lane_u16(vget_low_u16(a), 0); +} + +static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) { + vec_t carry_s = {0}; + vec_t carry_a = {0}; + const vec_t kZero = {0}; + + for (int i = 0; i < 6; i++) { + vec_t next_carry_s = a_s[i] >> 15; + a_s[i] <<= 1; + a_s[i] |= vextq_u16(kZero, next_carry_s, 7); + a_s[i] |= carry_s; + carry_s = vextq_u16(next_carry_s, kZero, 7); + + vec_t next_carry_a = a_a[i] >> 15; + a_a[i] <<= 1; + a_a[i] |= vextq_u16(kZero, next_carry_a, 7); + a_a[i] |= carry_a; + carry_a = vextq_u16(next_carry_a, kZero, 7); + } +} + +static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) { + vec_t carry_s = {0}; + vec_t carry_a = {0}; + const vec_t kZero = {0}; + + for (int i 
= 5; i >= 0; i--) { + vec_t next_carry_s = a_s[i] << 15; + a_s[i] >>= 1; + a_s[i] |= vextq_u16(next_carry_s, kZero, 1); + a_s[i] |= carry_s; + carry_s = vextq_u16(kZero, next_carry_s, 1); + + vec_t next_carry_a = a_a[i] << 15; + a_a[i] >>= 1; + a_a[i] |= vextq_u16(next_carry_a, kZero, 1); + a_a[i] |= carry_a; + carry_a = vextq_u16(kZero, next_carry_a, 1); + } +} + +#endif // !OPENSSL_AARCH64 + +#endif // (ARM || AARCH64) && NEON + +// Polynomials in this scheme have N terms. +// #define N 701 + +// Underlying data types and arithmetic operations. +// ------------------------------------------------ + +// Binary polynomials. + +// poly2 represents a degree-N polynomial over GF(2). The words are in little- +// endian order, i.e. the coefficient of x^0 is the LSB of the first word. The +// final word is only partially used since N is not a multiple of the word size. + +// Defined in internal.h: +// struct poly2 { +// crypto_word_t v[WORDS_PER_POLY]; +// }; + +OPENSSL_UNUSED static void hexdump(const void *void_in, size_t len) { + const uint8_t *in = (const uint8_t *)void_in; + for (size_t i = 0; i < len; i++) { + printf("%02x", in[i]); + } + printf("\n"); +} + +static void poly2_zero(struct poly2 *p) { + OPENSSL_memset(&p->v[0], 0, sizeof(crypto_word_t) * WORDS_PER_POLY); +} + +// poly2_cmov sets |out| to |in| iff |mov| is all ones. +static void poly2_cmov(struct poly2 *out, const struct poly2 *in, + crypto_word_t mov) { + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + out->v[i] = (out->v[i] & ~mov) | (in->v[i] & mov); + } +} + +// poly2_rotr_words performs a right-rotate on |in|, writing the result to +// |out|. The shift count, |bits|, must be a non-zero multiple of the word size. 
+static void poly2_rotr_words(struct poly2 *out, const struct poly2 *in, + size_t bits) { + assert(bits >= BITS_PER_WORD && bits % BITS_PER_WORD == 0); + assert(out != in); + + const size_t start = bits / BITS_PER_WORD; + const size_t n = (N - bits) / BITS_PER_WORD; + + // The rotate is by a whole number of words so the first few words are easy: + // just move them down. + for (size_t i = 0; i < n; i++) { + out->v[i] = in->v[start + i]; + } + + // Since the last word is only partially filled, however, the remainder needs + // shifting and merging of words to take care of that. + crypto_word_t carry = in->v[WORDS_PER_POLY - 1]; + + for (size_t i = 0; i < start; i++) { + out->v[n + i] = carry | in->v[i] << BITS_IN_LAST_WORD; + carry = in->v[i] >> (BITS_PER_WORD - BITS_IN_LAST_WORD); + } + + out->v[WORDS_PER_POLY - 1] = carry; +} + +// poly2_rotr_bits performs a right-rotate on |in|, writing the result to |out|. +// The shift count, |bits|, must be a power of two that is less than +// |BITS_PER_WORD|. +static void poly2_rotr_bits(struct poly2 *out, const struct poly2 *in, + size_t bits) { + assert(bits <= BITS_PER_WORD / 2); + assert(bits != 0); + assert((bits & (bits - 1)) == 0); + assert(out != in); + + // BITS_PER_WORD/2 is the greatest legal value of |bits|. If + // |BITS_IN_LAST_WORD| is smaller than this then the code below doesn't work + // because more than the last word needs to carry down in the previous one and + // so on. 
+ OPENSSL_STATIC_ASSERT( + BITS_IN_LAST_WORD >= BITS_PER_WORD / 2, + "there are more carry bits than fit in BITS_IN_LAST_WORD"); + + crypto_word_t carry = in->v[WORDS_PER_POLY - 1] << (BITS_PER_WORD - bits); + + for (size_t i = WORDS_PER_POLY - 2; i < WORDS_PER_POLY; i--) { + out->v[i] = carry | in->v[i] >> bits; + carry = in->v[i] << (BITS_PER_WORD - bits); + } + + crypto_word_t last_word = carry >> (BITS_PER_WORD - BITS_IN_LAST_WORD) | + in->v[WORDS_PER_POLY - 1] >> bits; + last_word &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1; + out->v[WORDS_PER_POLY - 1] = last_word; +} + +// HRSS_poly2_rotr_consttime right-rotates |p| by |bits| in constant-time. +void HRSS_poly2_rotr_consttime(struct poly2 *p, size_t bits) { + assert(bits <= N); + assert(p->v[WORDS_PER_POLY-1] >> BITS_IN_LAST_WORD == 0); + + // Constant-time rotation is implemented by calculating the rotations of + // powers-of-two bits and throwing away the unneeded values. 2^9 (i.e. 512) is + // the largest power-of-two shift that we need to consider because 2^10 > N. +#define HRSS_POLY2_MAX_SHIFT 9 + size_t shift = HRSS_POLY2_MAX_SHIFT; + OPENSSL_STATIC_ASSERT((1 << (HRSS_POLY2_MAX_SHIFT + 1)) > N, + "maximum shift is too small"); + OPENSSL_STATIC_ASSERT((1 << HRSS_POLY2_MAX_SHIFT) <= N, + "maximum shift is too large"); + struct poly2 shifted; + + for (; (UINT64_C(1) << shift) >= BITS_PER_WORD; shift--) { + poly2_rotr_words(&shifted, p, UINT64_C(1) << shift); + poly2_cmov(p, &shifted, ~((1 & (bits >> shift)) - 1)); + } + + for (; shift < HRSS_POLY2_MAX_SHIFT; shift--) { + poly2_rotr_bits(&shifted, p, UINT64_C(1) << shift); + poly2_cmov(p, &shifted, ~((1 & (bits >> shift)) - 1)); + } +#undef HRSS_POLY2_MAX_SHIFT +} + +// poly2_cswap exchanges the values of |a| and |b| if |swap| is all ones. 
+static void poly2_cswap(struct poly2 *a, struct poly2 *b, crypto_word_t swap) { + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + const crypto_word_t sum = swap & (a->v[i] ^ b->v[i]); + a->v[i] ^= sum; + b->v[i] ^= sum; + } +} + +// poly2_fmadd sets |out| to |out| + |in| * m, where m is either +// |CONSTTIME_TRUE_W| or |CONSTTIME_FALSE_W|. +static void poly2_fmadd(struct poly2 *out, const struct poly2 *in, + crypto_word_t m) { + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + out->v[i] ^= in->v[i] & m; + } +} + +// poly2_lshift1 left-shifts |p| by one bit. +static void poly2_lshift1(struct poly2 *p) { + crypto_word_t carry = 0; + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + const crypto_word_t next_carry = p->v[i] >> (BITS_PER_WORD - 1); + p->v[i] <<= 1; + p->v[i] |= carry; + carry = next_carry; + } +} + +// poly2_rshift1 right-shifts |p| by one bit. +static void poly2_rshift1(struct poly2 *p) { + crypto_word_t carry = 0; + for (size_t i = WORDS_PER_POLY - 1; i < WORDS_PER_POLY; i--) { + const crypto_word_t next_carry = p->v[i] & 1; + p->v[i] >>= 1; + p->v[i] |= carry << (BITS_PER_WORD - 1); + carry = next_carry; + } +} + +// poly2_clear_top_bits clears the bits in the final word that are only for +// alignment. +static void poly2_clear_top_bits(struct poly2 *p) { + p->v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1; +} + +// poly2_top_bits_are_clear returns one iff the extra bits in the final words of +// |p| are zero. +static int poly2_top_bits_are_clear(const struct poly2 *p) { + return (p->v[WORDS_PER_POLY - 1] & + ~((UINT64_C(1) << BITS_IN_LAST_WORD) - 1)) == 0; +} + +// Ternary polynomials. + +// poly3 represents a degree-N polynomial over GF(3). Each coefficient is +// bitsliced across the |s| and |a| arrays, like this: +// +// s | a | value +// ----------------- +// 0 | 0 | 0 +// 0 | 1 | 1 +// 1 | 0 | 2 (aka -1) +// 1 | 1 | <invalid> +// +// ('s' is for sign, and 'a' just a letter.) 
+// +// Once bitsliced as such, the following circuits can be used to implement +// addition and multiplication mod 3: +// +// (s3, a3) = (s1, a1) × (s2, a2) +// s3 = (a1 ∧ s2) ⊕ (s1 ∧ a2) +// a3 = (s1 ∧ s2) ⊕ (a1 ∧ a2) +// +// (s3, a3) = (s1, a1) + (s2, a2) +// x = (a1 ⊕ a2) +// y = (s1 ⊕ s2) ⊕ (a1 ∧ a2) +// z = (s1 ∧ s2) +// s3 = y ∧ ¬x +// a3 = z ∨ (x ∧ ¬y) +// +// Negating a value just involves swapping s and a. +// struct poly3 { +// struct poly2 s, a; +// }; + +OPENSSL_UNUSED static void poly3_print(const struct poly3 *in) { + struct poly3 p; + OPENSSL_memcpy(&p, in, sizeof(p)); + p.s.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1; + p.a.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1; + + printf("{["); + for (unsigned i = 0; i < WORDS_PER_POLY; i++) { + if (i) { + printf(" "); + } + printf(BN_HEX_FMT2, p.s.v[i]); + } + printf("] ["); + for (unsigned i = 0; i < WORDS_PER_POLY; i++) { + if (i) { + printf(" "); + } + printf(BN_HEX_FMT2, p.a.v[i]); + } + printf("]}\n"); +} + +static void poly3_zero(struct poly3 *p) { + poly2_zero(&p->s); + poly2_zero(&p->a); +} + +// lsb_to_all replicates the least-significant bit of |v| to all bits of the +// word. This is used in bit-slicing operations to make a vector from a fixed +// value. +static crypto_word_t lsb_to_all(crypto_word_t v) { return 0u - (v & 1); } + +// poly3_mul_const sets |p| to |p|×m, where m = (ms, ma). +static void poly3_mul_const(struct poly3 *p, crypto_word_t ms, + crypto_word_t ma) { + ms = lsb_to_all(ms); + ma = lsb_to_all(ma); + + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + const crypto_word_t s = p->s.v[i]; + const crypto_word_t a = p->a.v[i]; + p->s.v[i] = (s & ma) ^ (ms & a); + p->a.v[i] = (ms & s) ^ (ma & a); + } +} + +// poly3_rotr_consttime right-rotates |p| by |bits| in constant-time. 
+static void poly3_rotr_consttime(struct poly3 *p, size_t bits) { + assert(bits <= N); + HRSS_poly2_rotr_consttime(&p->s, bits); + HRSS_poly2_rotr_consttime(&p->a, bits); +} + +// poly3_fmadd sets |out| to |out| + |in|×m, where m is (ms, ma). +static void poly3_fmadd(struct poly3 *RESTRICT out, + const struct poly3 *RESTRICT in, crypto_word_t ms, + crypto_word_t ma) { + // (See the multiplication and addition circuits given above.) + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + const crypto_word_t s = in->s.v[i]; + const crypto_word_t a = in->a.v[i]; + const crypto_word_t product_s = (s & ma) ^ (ms & a); + const crypto_word_t product_a = (ms & s) ^ (ma & a); + + const crypto_word_t x = out->a.v[i] ^ product_a; + const crypto_word_t y = + (out->s.v[i] ^ product_s) ^ (out->a.v[i] & product_a); + const crypto_word_t z = (out->s.v[i] & product_s); + out->s.v[i] = y & ~x; + out->a.v[i] = z | (x & ~y); + } +} + +// final_bit_to_all replicates the bit in the final position of the last word to +// all the bits in the word. +static crypto_word_t final_bit_to_all(crypto_word_t v) { + return lsb_to_all(v >> (BITS_IN_LAST_WORD - 1)); +} + +// poly3_top_bits_are_clear returns one iff the extra bits in the final words of +// |p| are zero. +OPENSSL_UNUSED static int poly3_top_bits_are_clear(const struct poly3 *p) { + return poly2_top_bits_are_clear(&p->s) && poly2_top_bits_are_clear(&p->a); +} + +// poly3_mod_phiN reduces |p| by Φ(N). +static void poly3_mod_phiN(struct poly3 *p) { + // In order to reduce by Φ(N) we subtract by the value of the greatest + // coefficient. That's the same as adding the negative of its value. The + // negative of (s, a) is (a, s), so the arguments are swapped in the following + // two lines. 
+ const crypto_word_t factor_s = final_bit_to_all(p->a.v[WORDS_PER_POLY - 1]); + const crypto_word_t factor_a = final_bit_to_all(p->s.v[WORDS_PER_POLY - 1]); + + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + const crypto_word_t s = p->s.v[i]; + const crypto_word_t a = p->a.v[i]; + const crypto_word_t x = a ^ factor_a; + const crypto_word_t y = (s ^ factor_s) ^ (a & factor_a); + const crypto_word_t z = (s & factor_s); + p->s.v[i] = y & ~x; + p->a.v[i] = z | (x & ~y); + } + + poly2_clear_top_bits(&p->s); + poly2_clear_top_bits(&p->a); +} + +static void poly3_cswap(struct poly3 *a, struct poly3 *b, crypto_word_t swap) { + poly2_cswap(&a->s, &b->s, swap); + poly2_cswap(&a->a, &b->a, swap); +} + +static void poly3_lshift1(struct poly3 *p) { + poly2_lshift1(&p->s); + poly2_lshift1(&p->a); +} + +static void poly3_rshift1(struct poly3 *p) { + poly2_rshift1(&p->s); + poly2_rshift1(&p->a); +} + +// poly3_span represents a pointer into a poly3. +struct poly3_span { + crypto_word_t *s; + crypto_word_t *a; +}; + +// poly3_word_add sets (|out_s|, |out_a|) to (|s1|, |a1|) + (|s2|, |a2|). +static void poly3_word_add(crypto_word_t *out_s, crypto_word_t *out_a, + const crypto_word_t s1, const crypto_word_t a1, + const crypto_word_t s2, const crypto_word_t a2) { + const crypto_word_t x = a1 ^ a2; + const crypto_word_t y = (s1 ^ s2) ^ (a1 & a2); + const crypto_word_t z = s1 & s2; + *out_s = y & ~x; + *out_a = z | (x & ~y); +} + +// poly3_span_add adds |n| words of values from |a| and |b| and writes the +// result to |out|. +static void poly3_span_add(const struct poly3_span *out, + const struct poly3_span *a, + const struct poly3_span *b, size_t n) { + for (size_t i = 0; i < n; i++) { + poly3_word_add(&out->s[i], &out->a[i], a->s[i], a->a[i], b->s[i], b->a[i]); + } +} + +// poly3_span_sub subtracts |n| words of |b| from |n| words of |a|. 
static void poly3_span_sub(const struct poly3_span *a,
                           const struct poly3_span *b, size_t n) {
  // The difference is written back into |a|; |b| is only read.
  for (size_t i = 0; i < n; i++) {
    // Swapping |b->s| and |b->a| negates the value being added.
    poly3_word_add(&a->s[i], &a->a[i], a->s[i], a->a[i], b->a[i], b->s[i]);
  }
}

// poly3_mul_aux is a recursive function that multiplies |n| words from |a| and
// |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements of
// |scratch| and the function recurses, except if |n| == 1, when |scratch| isn't
// used and the recursion stops. For |n| in {11, 22}, the transitive total
// amount of |scratch| needed happens to be 2n+2.
//
// |out| doubles as temporary storage for the Karatsuba cross sums, so the
// order of the three recursive calls below matters: the cross-sum product must
// be computed before either of the other two products overwrites |out|.
static void poly3_mul_aux(const struct poly3_span *out,
                          const struct poly3_span *scratch,
                          const struct poly3_span *a,
                          const struct poly3_span *b, size_t n) {
  if (n == 1) {
    // Base case: a one-word by one-word shift-and-add multiplication that
    // yields a two-word result (low and high halves of each bit-plane).
    crypto_word_t r_s_low = 0, r_s_high = 0, r_a_low = 0, r_a_high = 0;
    crypto_word_t b_s = b->s[0], b_a = b->a[0];
    const crypto_word_t a_s = a->s[0], a_a = a->a[0];

    for (size_t i = 0; i < BITS_PER_WORD; i++) {
      // Multiply (s, a) by the next value from (b_s, b_a).
      const crypto_word_t v_s = lsb_to_all(b_s);
      const crypto_word_t v_a = lsb_to_all(b_a);
      b_s >>= 1;
      b_a >>= 1;

      const crypto_word_t m_s = (v_s & a_a) ^ (a_s & v_a);
      const crypto_word_t m_a = (a_s & v_s) ^ (a_a & v_a);

      if (i == 0) {
        // Special case otherwise the code tries to shift by BITS_PER_WORD
        // below, which is undefined.
        r_s_low = m_s;
        r_a_low = m_a;
        continue;
      }

      // Shift the multiplication result to the correct position.
      const crypto_word_t m_s_low = m_s << i;
      const crypto_word_t m_s_high = m_s >> (BITS_PER_WORD - i);
      const crypto_word_t m_a_low = m_a << i;
      const crypto_word_t m_a_high = m_a >> (BITS_PER_WORD - i);

      // Add into the result.
      poly3_word_add(&r_s_low, &r_a_low, r_s_low, r_a_low, m_s_low, m_a_low);
      poly3_word_add(&r_s_high, &r_a_high, r_s_high, r_a_high, m_s_high,
                     m_a_high);
    }

    out->s[0] = r_s_low;
    out->s[1] = r_s_high;
    out->a[0] = r_a_low;
    out->a[1] = r_a_high;
    return;
  }

  // Karatsuba multiplication.
  // https://en.wikipedia.org/wiki/Karatsuba_algorithm

  // When |n| is odd, the two "halves" will have different lengths. The first
  // is always the smaller.
  const size_t low_len = n / 2;
  const size_t high_len = n - low_len;
  const struct poly3_span a_high = {&a->s[low_len], &a->a[low_len]};
  const struct poly3_span b_high = {&b->s[low_len], &b->a[low_len]};

  // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second
  // half.
  const struct poly3_span a_cross_sum = *out;
  const struct poly3_span b_cross_sum = {&out->s[high_len], &out->a[high_len]};
  poly3_span_add(&a_cross_sum, a, &a_high, low_len);
  poly3_span_add(&b_cross_sum, b, &b_high, low_len);
  if (high_len != low_len) {
    // The high half has one extra word with no low-half partner, so it is
    // copied across unchanged.
    a_cross_sum.s[low_len] = a_high.s[low_len];
    a_cross_sum.a[low_len] = a_high.a[low_len];
    b_cross_sum.s[low_len] = b_high.s[low_len];
    b_cross_sum.a[low_len] = b_high.a[low_len];
  }

  // The first 2×|high_len| words of |scratch| hold this level's cross
  // product; everything after that is handed down to the recursive calls.
  const struct poly3_span child_scratch = {&scratch->s[2 * high_len],
                                           &scratch->a[2 * high_len]};
  const struct poly3_span out_mid = {&out->s[low_len], &out->a[low_len]};
  const struct poly3_span out_high = {&out->s[2 * low_len],
                                      &out->a[2 * low_len]};

  // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer.
  poly3_mul_aux(scratch, &child_scratch, &a_cross_sum, &b_cross_sum, high_len);
  // Calculate a_1 × b_1.
  poly3_mul_aux(&out_high, &child_scratch, &a_high, &b_high, high_len);
  // Calculate a_0 × b_0.
  poly3_mul_aux(out, &child_scratch, a, b, low_len);

  // Subtract those last two products from the first.
  poly3_span_sub(scratch, out, low_len * 2);
  poly3_span_sub(scratch, &out_high, high_len * 2);

  // Add the middle product into the output.
  poly3_span_add(&out_mid, &out_mid, scratch, high_len * 2);
}

// HRSS_poly3_mul sets |*out| to |x|×|y| mod Φ(N).
void HRSS_poly3_mul(struct poly3 *out, const struct poly3 *x,
                    const struct poly3 *y) {
  // |prod_*| receive the double-length product; |scratch_*| are sized 2n+2 for
  // n = WORDS_PER_POLY, matching the requirement stated on |poly3_mul_aux|.
  crypto_word_t prod_s[WORDS_PER_POLY * 2];
  crypto_word_t prod_a[WORDS_PER_POLY * 2];
  crypto_word_t scratch_s[WORDS_PER_POLY * 2 + 2];
  crypto_word_t scratch_a[WORDS_PER_POLY * 2 + 2];
  const struct poly3_span prod_span = {prod_s, prod_a};
  const struct poly3_span scratch_span = {scratch_s, scratch_a};
  // |poly3_span| has non-const members, hence the casts. The spans built here
  // are only passed as the |a| and |b| inputs of |poly3_mul_aux|.
  const struct poly3_span x_span = {(crypto_word_t *)x->s.v,
                                    (crypto_word_t *)x->a.v};
  const struct poly3_span y_span = {(crypto_word_t *)y->s.v,
                                    (crypto_word_t *)y->a.v};

  poly3_mul_aux(&prod_span, &scratch_span, &x_span, &y_span, WORDS_PER_POLY);

  // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the
  // upper-half to the lower-half. However, N is 701, which isn't a multiple of
  // BITS_PER_WORD, so the upper-half vectors all have to be shifted before
  // being added to the lower-half.
  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
    // Assemble one word of the upper half: the bits above |BITS_IN_LAST_WORD|
    // of one product word joined with the low bits of the next.
    crypto_word_t v_s = prod_s[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD;
    v_s |= prod_s[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD);
    crypto_word_t v_a = prod_a[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD;
    v_a |= prod_a[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD);

    poly3_word_add(&out->s.v[i], &out->a.v[i], prod_s[i], prod_a[i], v_s, v_a);
  }

  // Finish by reducing mod Φ(N), which also clears the alignment bits of the
  // last word.
  poly3_mod_phiN(out);
}

#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64)

// poly3_vec_cswap swaps (|a_s|, |a_a|) and (|b_s|, |b_a|) if |swap| is
// |0xff..ff|. Otherwise, |swap| must be zero.
static inline void poly3_vec_cswap(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6],
                                   vec_t b_a[6], const vec_t swap) {
  for (int i = 0; i < 6; i++) {
    // XOR-swap under a mask: |sum_*| is zero when |swap| is zero, leaving
    // both operands unchanged.
    const vec_t sum_s = swap & (a_s[i] ^ b_s[i]);
    a_s[i] ^= sum_s;
    b_s[i] ^= sum_s;

    const vec_t sum_a = swap & (a_a[i] ^ b_a[i]);
    a_a[i] ^= sum_a;
    b_a[i] ^= sum_a;
  }
}

// poly3_vec_fmadd adds (|ms|, |ma|) × (|b_s|, |b_a|) to (|a_s|, |a_a|).
static inline void poly3_vec_fmadd(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6],
                                   vec_t b_a[6], const vec_t ms,
                                   const vec_t ma) {
  for (int i = 0; i < 6; i++) {
    const vec_t s = b_s[i];
    const vec_t a = b_a[i];
    // Multiply (s, a) by (ms, ma) ...
    const vec_t product_s = (s & ma) ^ (ms & a);
    const vec_t product_a = (ms & s) ^ (ma & a);

    // ... and add the product into the accumulator, using the same circuits
    // as the word-sized |poly3_fmadd|.
    const vec_t x = a_a[i] ^ product_a;
    const vec_t y = (a_s[i] ^ product_s) ^ (a_a[i] & product_a);
    const vec_t z = (a_s[i] & product_s);
    a_s[i] = y & ~x;
    a_a[i] = z | (x & ~y);
  }
}

// poly3_invert_vec sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod
// Φ(N).
static void poly3_invert_vec(struct poly3 *out, const struct poly3 *in) {
  // See the comment in |HRSS_poly3_invert| about this algorithm. In addition to
  // the changes described there, this implementation attempts to use vector
  // registers to speed up the computation. Even non-poly3 variables are held in
  // vectors where possible to minimise the amount of data movement between
  // the vector and general-purpose registers.

  vec_t b_s[6], b_a[6], c_s[6], c_a[6], f_s[6], f_a[6], g_s[6], g_a[6];
  const vec_t kZero = {0};
  const vec_t kOne = {1};
  static const uint8_t kOneBytes[sizeof(vec_t)] = {1};
  // Seven 0xff bytes plus 0x1f: the low 61 bits set. The memcpy into |g_a[5]|
  // below assumes a little-endian layout — TODO confirm for all targets.
  static const uint8_t kBottomSixtyOne[sizeof(vec_t)] = {
      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x1f};

  // Set b to one: all of |b_s| is zero and only the first byte of |b_a| is 1.
  memset(b_s, 0, sizeof(b_s));
  memcpy(b_a, kOneBytes, sizeof(kOneBytes));
  memset(&b_a[1], 0, 5 * sizeof(vec_t));

  // Set c to zero.
  memset(c_s, 0, sizeof(c_s));
  memset(c_a, 0, sizeof(c_a));

  // Copy |in| into f. The copy covers only WORDS_PER_POLY words, so the final
  // vector is zeroed first to leave no uninitialised bytes behind.
  f_s[5] = kZero;
  memcpy(f_s, in->s.v, WORDS_PER_POLY * sizeof(crypto_word_t));
  f_a[5] = kZero;
  memcpy(f_a, in->a.v, WORDS_PER_POLY * sizeof(crypto_word_t));

  // Set g to all ones.
  memset(g_s, 0, sizeof(g_s));
  memset(g_a, 0xff, 5 * sizeof(vec_t));
  memcpy(&g_a[5], kBottomSixtyOne, sizeof(kBottomSixtyOne));

  vec_t deg_f = {N - 1}, deg_g = {N - 1}, rotation = kZero;
  // |k| starts at one and is incremented once per iteration, so in iteration
  // |i| it holds i+1. This replaces the post-loop |rotation++| of the
  // non-vector |HRSS_poly3_invert|.
  vec_t k = kOne;
  vec_t f0s = {0}, f0a = {0};
  vec_t still_going;
  memset(&still_going, 0xff, sizeof(still_going));

  for (unsigned i = 0; i < 2 * (N - 1) - 1; i++) {
    // (s_s, s_a) is the multiple of g that gets added to f this iteration. It
    // is forced to zero once |still_going| has been cleared.
    // NOTE(review): |vec_broadcast_bit| presumably replicates the lowest bit
    // across the whole vector — confirm against its definition.
    const vec_t s_a = vec_broadcast_bit(
        still_going & ((f_a[0] & g_s[0]) ^ (f_s[0] & g_a[0])));
    const vec_t s_s = vec_broadcast_bit(
        still_going & ((f_a[0] & g_a[0]) ^ (f_s[0] & g_s[0])));
    // NOTE(review): |vec_broadcast_bit15| on the difference looks like a
    // "deg_f < deg_g" test via the sign bit of a 16-bit lane — confirm.
    const vec_t should_swap =
        (s_s | s_a) & vec_broadcast_bit15(deg_f - deg_g);

    poly3_vec_cswap(f_s, f_a, g_s, g_a, should_swap);
    poly3_vec_fmadd(f_s, f_a, g_s, g_a, s_s, s_a);
    poly3_vec_rshift1(f_s, f_a);

    poly3_vec_cswap(b_s, b_a, c_s, c_a, should_swap);
    poly3_vec_fmadd(b_s, b_a, c_s, c_a, s_s, s_a);
    poly3_vec_lshift1(c_s, c_a);

    // Conditionally swap the degree counters along with f and g.
    const vec_t deg_sum = should_swap & (deg_f ^ deg_g);
    deg_f ^= deg_sum;
    deg_g ^= deg_sum;

    deg_f -= kOne;
    still_going &= ~vec_broadcast_bit15(deg_f - kOne);

    const vec_t f0_is_nonzero = vec_broadcast_bit(f_s[0] | f_a[0]);
    // |f0_is_nonzero| implies |still_going|.
    // Masked select: |rotation| becomes |k| when f's constant term is
    // non-zero and is left alone otherwise.
    rotation ^= f0_is_nonzero & (k ^ rotation);
    k += kOne;

    // Likewise remember the most recent non-zero constant term of f.
    const vec_t f0s_sum = f0_is_nonzero & (f_s[0] ^ f0s);
    f0s ^= f0s_sum;
    const vec_t f0a_sum = f0_is_nonzero & (f_a[0] ^ f0a);
    f0a ^= f0a_sum;
  }

  // Bring the rotation count back below N where necessary, then undo the
  // rotation and scale by the saved constant term.
  crypto_word_t rotation_word = vec_get_word(rotation, 0);
  rotation_word -= N & constant_time_lt_w(N, rotation_word);
  memcpy(out->s.v, b_s, WORDS_PER_POLY * sizeof(crypto_word_t));
  memcpy(out->a.v, b_a, WORDS_PER_POLY * sizeof(crypto_word_t));
  assert(poly3_top_bits_are_clear(out));
  poly3_rotr_consttime(out, rotation_word);
  poly3_mul_const(out, vec_get_word(f0s, 0), vec_get_word(f0a, 0));
  poly3_mod_phiN(out);
}

#endif  // HRSS_HAVE_VECTOR_UNIT

// HRSS_poly3_invert sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod
// Φ(N).
void HRSS_poly3_invert(struct poly3 *out, const struct poly3 *in) {
  // The vector version of this function seems slightly slower on AArch64, but
  // is useful on ARMv7 and x86-64.
#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64)
  if (vec_capable()) {
    poly3_invert_vec(out, in);
    return;
  }
#endif

  // This algorithm mostly follows algorithm 10 in the paper. Some changes:
  //   1) k should start at zero, not one. In the code below k is omitted and
  //      the loop counter, |i|, is used instead.
  //   2) The rotation count is conditionally updated to handle trailing zero
  //      coefficients.
  // The best explanation for why it works is in the "Why it works" section of
  // [NTRUTN14].

  struct poly3 c, f, g;
  OPENSSL_memcpy(&f, in, sizeof(f));

  // Set g to all ones.
  OPENSSL_memset(&g.s, 0, sizeof(struct poly2));
  OPENSSL_memset(&g.a, 0xff, sizeof(struct poly2));
  g.a.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD;

  // |b| is accumulated directly in the caller's output buffer.
  struct poly3 *b = out;
  poly3_zero(b);
  poly3_zero(&c);
  // Set b to one.
  b->a.v[0] = 1;

  crypto_word_t deg_f = N - 1, deg_g = N - 1, rotation = 0;
  crypto_word_t f0s = 0, f0a = 0;
  crypto_word_t still_going = CONSTTIME_TRUE_W;

  // The iteration count is fixed, so the running time does not depend on |in|.
  for (unsigned i = 0; i < 2 * (N - 1) - 1; i++) {
    // (s_s, s_a) is the multiple of g that gets added to f this iteration. It
    // is forced to zero once |still_going| has been cleared.
    const crypto_word_t s_a = lsb_to_all(
        still_going & ((f.a.v[0] & g.s.v[0]) ^ (f.s.v[0] & g.a.v[0])));
    const crypto_word_t s_s = lsb_to_all(
        still_going & ((f.a.v[0] & g.a.v[0]) ^ (f.s.v[0] & g.s.v[0])));
    const crypto_word_t should_swap =
        (s_s | s_a) & constant_time_lt_w(deg_f, deg_g);

    poly3_cswap(&f, &g, should_swap);
    poly3_cswap(b, &c, should_swap);

    // Conditionally swap the degree counters along with f and g.
    const crypto_word_t deg_sum = should_swap & (deg_f ^ deg_g);
    deg_f ^= deg_sum;
    deg_g ^= deg_sum;
    assert(deg_g >= 1);

    poly3_fmadd(&f, &g, s_s, s_a);
    poly3_fmadd(b, &c, s_s, s_a);
    poly3_rshift1(&f);
    poly3_lshift1(&c);

    deg_f--;
    const crypto_word_t f0_is_nonzero =
        lsb_to_all(f.s.v[0]) | lsb_to_all(f.a.v[0]);
    // |f0_is_nonzero| implies |still_going|.
    assert(!(f0_is_nonzero && !still_going));
    still_going &= ~constant_time_is_zero_w(deg_f);

    // Remember the index and value of the most recent non-zero constant term
    // of f.
    rotation = constant_time_select_w(f0_is_nonzero, i, rotation);
    f0s = constant_time_select_w(f0_is_nonzero, f.s.v[0], f0s);
    f0a = constant_time_select_w(f0_is_nonzero, f.a.v[0], f0a);
  }

  // |rotation| recorded the zero-based loop index, so add one, then bring the
  // count back below N where necessary.
  rotation++;
  rotation -= N & constant_time_lt_w(N, rotation);
  assert(poly3_top_bits_are_clear(out));
  poly3_rotr_consttime(out, rotation);
  poly3_mul_const(out, f0s, f0a);
  poly3_mod_phiN(out);
}

// Polynomials in Q.

// Coefficients are reduced mod Q. (Q is clearly not prime, therefore the
// coefficients do not form a field.)
#define Q 8192

// VECS_PER_POLY is the number of 128-bit vectors needed to represent a
// polynomial.
#define COEFFICIENTS_PER_VEC (sizeof(vec_t) / sizeof(uint16_t))
#define VECS_PER_POLY ((N + COEFFICIENTS_PER_VEC - 1) / COEFFICIENTS_PER_VEC)

// poly represents a polynomial with coefficients mod Q. Note that, while Q is a
// power of two, this does not operate in GF(Q).
// That would be a binary field
// but this is simply mod Q. Thus the coefficients are not a field.
//
// Coefficients are ordered little-endian, thus the coefficient of x^0 is the
// first element of the array.
struct poly {
#if defined(HRSS_HAVE_VECTOR_UNIT)
  union {
    // N + 3 = 704, which is a multiple of 64 and thus aligns things, esp for
    // the vector code.
    uint16_t v[N + 3];
    vec_t vectors[VECS_PER_POLY];
  };
#else
  // Even if !HRSS_HAVE_VECTOR_UNIT, external assembly may be called that
  // requires alignment.
  alignas(16) uint16_t v[N + 3];
#endif
};

// poly_print writes the N coefficients of |p| to stdout as space-separated
// decimal values inside square brackets. The three padding elements are not
// printed. It is a debugging aid.
OPENSSL_UNUSED static void poly_print(const struct poly *p) {
  printf("[");
  for (unsigned i = 0; i < N; i++) {
    if (i) {
      printf(" ");
    }
    printf("%d", p->v[i]);
  }
  printf("]\n");
}

#if defined(HRSS_HAVE_VECTOR_UNIT)

// poly_mul_vec_aux is a recursive function that multiplies |n| words from |a|
// and |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements
// of |scratch| and the function recurses, except if |n| < 3, when |scratch|
// isn't used and the recursion stops. If |n| == |VECS_PER_POLY| then |scratch|
// needs 172 elements.
//
// Only |n| == 2 and |n| == 3 are handled directly below; every other value
// takes the Karatsuba path, so callers need |n| to decompose into twos and
// threes. |out| doubles as temporary storage for the cross sums, so the order
// of the recursive calls matters.
static void poly_mul_vec_aux(vec_t *restrict out, vec_t *restrict scratch,
                             const vec_t *restrict a, const vec_t *restrict b,
                             const size_t n) {
  // In [HRSS], the technique they used for polynomial multiplication is
  // described: they start with Toom-4 at the top level and then two layers of
  // Karatsuba. Karatsuba is a specific instance of the general Toom–Cook
  // decomposition, which splits an input n-ways and produces 2n-1
  // multiplications of those parts. So, starting with 704 coefficients (rounded
  // up from 701 to have more factors of two), Toom-4 gives seven
  // multiplications of degree-174 polynomials. Each round of Karatsuba (which
  // is Toom-2) increases the number of multiplications by a factor of three
  // while halving the size of the values being multiplied. So two rounds gives
  // 63 multiplications of degree-44 polynomials. Then they (I think) form
  // vectors by gathering all 63 coefficients of each power together, for each
  // input, and doing more rounds of Karatsuba on the vectors until they bottom-
  // out somewhere with schoolbook multiplication.
  //
  // I tried something like that for NEON. NEON vectors are 128 bits so hold
  // eight coefficients. I wrote a function that did Karatsuba on eight
  // multiplications at the same time, using such vectors, and a Go script that
  // decomposed from degree-704, with Karatsuba in non-transposed form, until it
  // reached multiplications of degree-44. It batched up those 81
  // multiplications into lots of eight with a single one left over (which was
  // handled directly).
  //
  // It worked, but it was significantly slower than the dumb algorithm used
  // below. Potentially that was because I misunderstood how [HRSS] did it, or
  // because Clang is bad at generating good code from NEON intrinsics on ARMv7.
  // (Which is true: the code generated by Clang for the below is pretty crap.)
  //
  // This algorithm is much simpler. It just does Karatsuba decomposition all
  // the way down and never transposes. When it gets down to degree-16 or
  // degree-24 values, they are multiplied using schoolbook multiplication and
  // vector intrinsics. The vector operations form each of the eight phase-
  // shifts of one of the inputs, point-wise multiply, and then add into the
  // result at the correct place. This means that 33% (degree-16) or 25%
  // (degree-24) of the multiplies and adds are wasted, but it does ok.
  if (n == 2) {
    // Schoolbook multiplication of two 2-vector (16-coefficient) inputs.
    // |vec_a| carries one extra, initially zero, vector so that |a| can be
    // shifted along by one coefficient at a time.
    vec_t result[4];
    vec_t vec_a[3];
    static const vec_t kZero = {0};
    vec_a[0] = a[0];
    vec_a[1] = a[1];
    vec_a[2] = kZero;

    // Phase zero: multiply the unshifted |a| by coefficients 0 and 8 of |b|.
    result[0] = vec_mul(vec_a[0], vec_get_word(b[0], 0));
    result[1] = vec_mul(vec_a[1], vec_get_word(b[0], 0));

    result[1] = vec_fma(result[1], vec_a[0], vec_get_word(b[1], 0));
    result[2] = vec_mul(vec_a[1], vec_get_word(b[1], 0));
    result[3] = kZero;

    // NOTE(review): |vec3_rshift_word| presumably moves |vec_a| along by one
    // 16-bit coefficient — confirm against its definition.
    vec3_rshift_word(vec_a);

// BLOCK(x, y) accumulates the current phase of |vec_a|, multiplied by
// coefficient |y| of |b|, into three result vectors starting at index |x|.
#define BLOCK(x, y)                                                      \
  do {                                                                   \
    result[x + 0] =                                                      \
        vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
    result[x + 1] =                                                      \
        vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
    result[x + 2] =                                                      \
        vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \
  } while (0)

    BLOCK(0, 1);
    BLOCK(1, 9);

    vec3_rshift_word(vec_a);

    BLOCK(0, 2);
    BLOCK(1, 10);

    vec3_rshift_word(vec_a);

    BLOCK(0, 3);
    BLOCK(1, 11);

    vec3_rshift_word(vec_a);

    BLOCK(0, 4);
    BLOCK(1, 12);

    vec3_rshift_word(vec_a);

    BLOCK(0, 5);
    BLOCK(1, 13);

    vec3_rshift_word(vec_a);

    BLOCK(0, 6);
    BLOCK(1, 14);

    vec3_rshift_word(vec_a);

    BLOCK(0, 7);
    BLOCK(1, 15);

#undef BLOCK

    memcpy(out, result, sizeof(result));
    return;
  }

  if (n == 3) {
    // Schoolbook multiplication of two 3-vector (24-coefficient) inputs,
    // structured like the |n| == 2 case above.
    vec_t result[6];
    vec_t vec_a[4];
    static const vec_t kZero = {0};
    vec_a[0] = a[0];
    vec_a[1] = a[1];
    vec_a[2] = a[2];
    vec_a[3] = kZero;

    result[0] = vec_mul(a[0], vec_get_word(b[0], 0));
    result[1] = vec_mul(a[1], vec_get_word(b[0], 0));
    result[2] = vec_mul(a[2], vec_get_word(b[0], 0));

// BLOCK_PRE(x, y) is the phase-zero variant of BLOCK: the last vector it
// touches has not been written yet, so it is initialised with |vec_mul| rather
// than accumulated into.
#define BLOCK_PRE(x, y)                                                  \
  do {                                                                   \
    result[x + 0] =                                                      \
        vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
    result[x + 1] =                                                      \
        vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
    result[x + 2] = vec_mul(vec_a[2], vec_get_word(b[y / 8], y % 8));    \
  } while (0)

    BLOCK_PRE(1, 8);
    BLOCK_PRE(2, 16);

    result[5] = kZero;

    vec4_rshift_word(vec_a);

// BLOCK(x, y) accumulates the current phase of |vec_a|, multiplied by
// coefficient |y| of |b|, into four result vectors starting at index |x|.
#define BLOCK(x, y)                                                      \
  do {                                                                   \
    result[x + 0] =                                                      \
        vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
    result[x + 1] =                                                      \
        vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
    result[x + 2] =                                                      \
        vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \
    result[x + 3] =                                                      \
        vec_fma(result[x + 3], vec_a[3], vec_get_word(b[y / 8], y % 8)); \
  } while (0)

    BLOCK(0, 1);
    BLOCK(1, 9);
    BLOCK(2, 17);

    vec4_rshift_word(vec_a);

    BLOCK(0, 2);
    BLOCK(1, 10);
    BLOCK(2, 18);

    vec4_rshift_word(vec_a);

    BLOCK(0, 3);
    BLOCK(1, 11);
    BLOCK(2, 19);

    vec4_rshift_word(vec_a);

    BLOCK(0, 4);
    BLOCK(1, 12);
    BLOCK(2, 20);

    vec4_rshift_word(vec_a);

    BLOCK(0, 5);
    BLOCK(1, 13);
    BLOCK(2, 21);

    vec4_rshift_word(vec_a);

    BLOCK(0, 6);
    BLOCK(1, 14);
    BLOCK(2, 22);

    vec4_rshift_word(vec_a);

    BLOCK(0, 7);
    BLOCK(1, 15);
    BLOCK(2, 23);

#undef BLOCK
#undef BLOCK_PRE

    memcpy(out, result, sizeof(result));

    return;
  }

  // Karatsuba multiplication.
  // https://en.wikipedia.org/wiki/Karatsuba_algorithm

  // When |n| is odd, the two "halves" will have different lengths. The first is
  // always the smaller.
  const size_t low_len = n / 2;
  const size_t high_len = n - low_len;
  const vec_t *a_high = &a[low_len];
  const vec_t *b_high = &b[low_len];

  // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second
  // half.
  for (size_t i = 0; i < low_len; i++) {
    out[i] = vec_add(a_high[i], a[i]);
    out[high_len + i] = vec_add(b_high[i], b[i]);
  }
  if (high_len != low_len) {
    // The high half has one extra vector with no low-half partner, so it is
    // copied across unchanged.
    out[low_len] = a_high[low_len];
    out[high_len + low_len] = b_high[low_len];
  }

  // The first 2×|high_len| vectors of |scratch| hold this level's cross
  // product; the rest is handed down to the recursive calls.
  vec_t *const child_scratch = &scratch[2 * high_len];
  // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer.
  poly_mul_vec_aux(scratch, child_scratch, out, &out[high_len], high_len);
  // Calculate a_1 × b_1.
  poly_mul_vec_aux(&out[low_len * 2], child_scratch, a_high, b_high, high_len);
  // Calculate a_0 × b_0.
  poly_mul_vec_aux(out, child_scratch, a, b, low_len);

  // Subtract those last two products from the first.
  for (size_t i = 0; i < low_len * 2; i++) {
    scratch[i] = vec_sub(scratch[i], vec_add(out[i], out[low_len * 2 + i]));
  }
  if (low_len != high_len) {
    // a_1 × b_1 is two vectors longer than a_0 × b_0 in this case; subtract
    // its final two vectors on their own.
    scratch[low_len * 2] = vec_sub(scratch[low_len * 2], out[low_len * 4]);
    scratch[low_len * 2 + 1] =
        vec_sub(scratch[low_len * 2 + 1], out[low_len * 4 + 1]);
  }

  // Add the middle product into the output.
  for (size_t i = 0; i < high_len * 2; i++) {
    out[low_len + i] = vec_add(out[low_len + i], scratch[i]);
  }
}

// poly_mul_vec sets |*out| to |x|×|y| mod (𝑥^n - 1).
//
// Note that, despite the const qualifiers, the three padding elements at the
// end of |x| and |y| are overwritten with zeros.
static void poly_mul_vec(struct poly *out, const struct poly *x,
                         const struct poly *y) {
  // Zero the padding of both inputs so that the whole-vector multiplication
  // below sees zero coefficients beyond index N-1.
  OPENSSL_memset((uint16_t *)&x->v[N], 0, 3 * sizeof(uint16_t));
  OPENSSL_memset((uint16_t *)&y->v[N], 0, 3 * sizeof(uint16_t));

  OPENSSL_STATIC_ASSERT(sizeof(out->v) == sizeof(vec_t) * VECS_PER_POLY,
                        "struct poly is the wrong size");
  OPENSSL_STATIC_ASSERT(alignof(struct poly) == alignof(vec_t),
                        "struct poly has incorrect alignment");

  vec_t prod[VECS_PER_POLY * 2];
  // 172 is the scratch requirement documented on |poly_mul_vec_aux| for
  // |n| == |VECS_PER_POLY|.
  vec_t scratch[172];
  poly_mul_vec_aux(prod, scratch, x->vectors, y->vectors, VECS_PER_POLY);

  // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the
  // upper-half to the lower-half. However, N is 701, which isn't a multiple of
  // the vector size, so the upper-half vectors all have to be shifted before
  // being added to the lower-half.
  vec_t *out_vecs = (vec_t *)out->v;

  for (size_t i = 0; i < VECS_PER_POLY; i++) {
    const vec_t prev = prod[VECS_PER_POLY - 1 + i];
    const vec_t this = prod[VECS_PER_POLY + i];
    // NOTE(review): |vec_merge_3_5| presumably joins the top three
    // coefficients of |prev| with the bottom five of |this| — confirm against
    // its definition.
    out_vecs[i] = vec_add(prod[i], vec_merge_3_5(prev, this));
  }

  // Leave the padding of the result zeroed.
  OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t));
}

#endif  // HRSS_HAVE_VECTOR_UNIT

// poly_mul_novec_aux writes the product of |a| and |b| to |out|, using
// |scratch| as scratch space.
// It'll use Karatsuba if the inputs are large
// enough to warrant it. Each call uses 2*ceil(n/2) elements of |scratch| and
// the function recurses, except if |n| < 64, when |scratch| isn't used and the
// recursion stops. If |n| == |N| then |scratch| needs 1318 elements.
static void poly_mul_novec_aux(uint16_t *out, uint16_t *scratch,
                               const uint16_t *a, const uint16_t *b, size_t n) {
  static const size_t kSchoolbookLimit = 64;
  if (n < kSchoolbookLimit) {
    // Small inputs: plain schoolbook multiplication. Coefficient arithmetic is
    // performed modulo 2^16 by virtue of the uint16_t storage.
    OPENSSL_memset(out, 0, sizeof(uint16_t) * n * 2);
    for (size_t i = 0; i < n; i++) {
      for (size_t j = 0; j < n; j++) {
        out[i + j] = (uint16_t)(out[i + j] + (unsigned)a[i] * b[j]);
      }
    }

    return;
  }

  // Karatsuba multiplication.
  // https://en.wikipedia.org/wiki/Karatsuba_algorithm

  // For odd |n| the split is uneven; the low part is always the shorter one.
  const size_t lo = n / 2;
  const size_t hi = n - lo;
  const uint16_t *const a_hi = a + lo;
  const uint16_t *const b_hi = b + lo;

  // Temporarily park (a_1 + a_0) at the start of |out| and (b_1 + b_0) right
  // after it.
  uint16_t *const sum_a = out;
  uint16_t *const sum_b = out + hi;
  for (size_t i = 0; i < lo; i++) {
    sum_a[i] = a_hi[i] + a[i];
    sum_b[i] = b_hi[i] + b[i];
  }
  if (hi != lo) {
    // The extra element of the longer half has nothing to be added to.
    sum_a[lo] = a_hi[lo];
    sum_b[lo] = b_hi[lo];
  }

  // This level keeps the cross product in the first 2×|hi| elements of
  // |scratch|; the remainder is passed down to the recursive calls.
  uint16_t *const deeper = scratch + 2 * hi;
  uint16_t *const prod_hi = out + 2 * lo;

  // The cross product must be computed first, since the next two calls
  // overwrite the sums parked in |out|.
  poly_mul_novec_aux(scratch, deeper, sum_a, sum_b, hi);
  poly_mul_novec_aux(prod_hi, deeper, a_hi, b_hi, hi);
  poly_mul_novec_aux(out, deeper, a, b, lo);

  // Remove a_0×b_0 and a_1×b_1 from the cross product.
  for (size_t i = 0; i < 2 * lo; i++) {
    scratch[i] -= out[i] + prod_hi[i];
  }
  if (lo != hi) {
    scratch[2 * lo] -= out[4 * lo];
    assert(out[4 * lo + 1] == 0);
  }

  // Fold the middle term into the result at its offset.
  uint16_t *const mid = out + lo;
  for (size_t i = 0; i < 2 * hi; i++) {
    mid[i] += scratch[i];
  }
}

// poly_mul_novec sets |*out| to |x|×|y| mod (𝑥^n - 1).
+static void poly_mul_novec(struct poly *out, const struct poly *x, + const struct poly *y) { + uint16_t prod[2 * N]; + uint16_t scratch[1318]; + poly_mul_novec_aux(prod, scratch, x->v, y->v, N); + + for (size_t i = 0; i < N; i++) { + out->v[i] = prod[i] + prod[i + N]; + } + OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t)); +} + +// On x86-64, we can use the AVX2 code from [HRSS]. (The authors have given +// explicit permission for this and signed a CLA.) However it's 57KB of object +// code, so it's not used if |OPENSSL_SMALL| is defined. +#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && \ + defined(OPENSSL_X86_64) && defined(OPENSSL_LINUX) +// poly_Rq_mul is defined in assembly. +extern void poly_Rq_mul(struct poly *r, const struct poly *a, + const struct poly *b); +#endif + +static void poly_mul(struct poly *r, const struct poly *a, + const struct poly *b) { +#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && \ + defined(OPENSSL_X86_64) && defined(OPENSSL_LINUX) + const int has_avx2 = (OPENSSL_ia32cap_P[2] & (1 << 5)) != 0; + if (has_avx2) { + poly_Rq_mul(r, a, b); + return; + } +#endif + +#if defined(HRSS_HAVE_VECTOR_UNIT) + if (vec_capable()) { + poly_mul_vec(r, a, b); + return; + } +#endif + + // Fallback, non-vector case. + poly_mul_novec(r, a, b); +} + +// poly_mul_x_minus_1 sets |p| to |p|×(𝑥 - 1) mod (𝑥^n - 1). +static void poly_mul_x_minus_1(struct poly *p) { + // Multiplying by (𝑥 - 1) means negating each coefficient and adding in + // the value of the previous one. + const uint16_t orig_final_coefficient = p->v[N - 1]; + + for (size_t i = N - 1; i > 0; i--) { + p->v[i] = p->v[i - 1] - p->v[i]; + } + p->v[0] = orig_final_coefficient - p->v[0]; +} + +// poly_mod_phiN sets |p| to |p| mod Φ(N). +static void poly_mod_phiN(struct poly *p) { + const uint16_t coeff700 = p->v[N - 1]; + + for (unsigned i = 0; i < N; i++) { + p->v[i] -= coeff700; + } +} + +// poly_clamp reduces each coefficient mod Q. 
+static void poly_clamp(struct poly *p) {
+  // Masking with Q - 1 is a reduction mod Q because Q is a power of two.
+  for (unsigned i = 0; i < N; i++) {
+    p->v[i] &= Q - 1;
+  }
+}
+
+
+// Conversion functions
+// --------------------
+
+// poly2_from_poly sets |*out| to |in| mod 2.
+static void poly2_from_poly(struct poly2 *out, const struct poly *in) {
+  crypto_word_t *words = out->v;
+  unsigned shift = 0;
+  crypto_word_t word = 0;
+
+  // Bits are inserted at the top of |word| and shifted down, so that after
+  // BITS_PER_WORD coefficients the first of them sits in the lowest bit.
+  for (unsigned i = 0; i < N; i++) {
+    word >>= 1;
+    word |= (crypto_word_t)(in->v[i] & 1) << (BITS_PER_WORD - 1);
+    shift++;
+
+    if (shift == BITS_PER_WORD) {
+      *words = word;
+      words++;
+      word = 0;
+      shift = 0;
+    }
+  }
+
+  // The final word is only partially filled: shift its bits down to the
+  // bottom before storing it.
+  word >>= BITS_PER_WORD - shift;
+  *words = word;
+}
+
+// mod3 treats |a| as a signed number and returns |a| mod 3.
+static uint16_t mod3(int16_t a) {
+  // 21845 is 0x5555, i.e. floor(2^16 / 3), so |q| approximates a / 3.
+  const int16_t q = ((int32_t)a * 21845) >> 16;
+  int16_t ret = a - 3 * q;
+  // At this point, |ret| is in {0, 1, 2, 3} and that needs to be mapped to {0,
+  // 1, 2, 0}.
+  return ret & ((ret & (ret >> 1)) - 1);
+}
+
+// poly3_from_poly sets |*out| to |in|.
+static void poly3_from_poly(struct poly3 *out, const struct poly *in) {
+  crypto_word_t *words_s = out->s.v;
+  crypto_word_t *words_a = out->a.v;
+  crypto_word_t s = 0;
+  crypto_word_t a = 0;
+  unsigned shift = 0;
+
+  for (unsigned i = 0; i < N; i++) {
+    // This duplicates the 13th bit upwards to the top of the uint16,
+    // essentially treating it as a sign bit and converting into a signed int16.
+    // The signed value is reduced mod 3, yielding {0, 1, 2}.
+    const uint16_t v = mod3((int16_t)(in->v[i] << 3) >> 3);
+    // Bit one of |v| goes into the |s| bitmap and bit zero into the |a|
+    // bitmap, both inserted at the top of the word and shifted down.
+    s >>= 1;
+    s |= (crypto_word_t)(v & 2) << (BITS_PER_WORD - 2);
+    a >>= 1;
+    a |= (crypto_word_t)(v & 1) << (BITS_PER_WORD - 1);
+    shift++;
+
+    if (shift == BITS_PER_WORD) {
+      *words_s = s;
+      words_s++;
+      *words_a = a;
+      words_a++;
+      s = a = 0;
+      shift = 0;
+    }
+  }
+
+  // Store the partially-filled final words with their bits moved to the
+  // bottom.
+  s >>= BITS_PER_WORD - shift;
+  a >>= BITS_PER_WORD - shift;
+  *words_s = s;
+  *words_a = a;
+}
+
+// poly3_from_poly_checked sets |*out| to |in|, which has coefficients in {0, 1,
+// Q-1}.
+// It returns a mask indicating whether all coefficients were found to be
+// in that set.
+static crypto_word_t poly3_from_poly_checked(struct poly3 *out,
+                                             const struct poly *in) {
+  crypto_word_t *words_s = out->s.v;
+  crypto_word_t *words_a = out->a.v;
+  crypto_word_t s = 0;
+  crypto_word_t a = 0;
+  unsigned shift = 0;
+  crypto_word_t ok = CONSTTIME_TRUE_W;
+
+  for (unsigned i = 0; i < N; i++) {
+    const uint16_t v = in->v[i];
+    // Maps {0, 1, Q-1} to {0, 1, 2}.
+    uint16_t mod3 = v & 3;
+    mod3 ^= mod3 >> 1;
+    // Map {0, 1, 2} back to {0, 1, Q-1} and require that this round-trips to
+    // the original coefficient. |ok| is cleared, without branching, otherwise.
+    const uint16_t expected = (uint16_t)((~((mod3 >> 1) - 1)) | mod3) % Q;
+    ok &= constant_time_eq_w(v, expected);
+
+    // Bit one of |mod3| goes into the |s| bitmap, bit zero into |a|.
+    s >>= 1;
+    s |= (crypto_word_t)(mod3 & 2) << (BITS_PER_WORD - 2);
+    a >>= 1;
+    a |= (crypto_word_t)(mod3 & 1) << (BITS_PER_WORD - 1);
+    shift++;
+
+    if (shift == BITS_PER_WORD) {
+      *words_s = s;
+      words_s++;
+      *words_a = a;
+      words_a++;
+      s = a = 0;
+      shift = 0;
+    }
+  }
+
+  // Store the partially-filled final words with their bits moved to the
+  // bottom.
+  s >>= BITS_PER_WORD - shift;
+  a >>= BITS_PER_WORD - shift;
+  *words_s = s;
+  *words_a = a;
+
+  return ok;
+}
+
+// poly_from_poly2 sets |*out| to |in|, writing one coefficient (zero or one)
+// for each of the first N bits of |in|.
+static void poly_from_poly2(struct poly *out, const struct poly2 *in) {
+  const crypto_word_t *words = in->v;
+  unsigned shift = 0;
+  crypto_word_t word = *words;
+
+  for (unsigned i = 0; i < N; i++) {
+    out->v[i] = word & 1;
+    word >>= 1;
+    shift++;
+
+    if (shift == BITS_PER_WORD) {
+      words++;
+      word = *words;
+      shift = 0;
+    }
+  }
+}
+
+// poly_from_poly3 sets |*out| to |in|. A set bit in |in->s| yields 0xffff, a
+// set bit in |in->a| yields one, and otherwise the coefficient is zero.
+static void poly_from_poly3(struct poly *out, const struct poly3 *in) {
+  const crypto_word_t *words_s = in->s.v;
+  const crypto_word_t *words_a = in->a.v;
+  // The |s| words are inverted so that subtracting one from the extracted bit
+  // gives 0xffff when the original bit was set and zero otherwise.
+  crypto_word_t word_s = ~(*words_s);
+  crypto_word_t word_a = *words_a;
+  unsigned shift = 0;
+
+  for (unsigned i = 0; i < N; i++) {
+    out->v[i] = (uint16_t)(word_s & 1) - 1;
+    out->v[i] |= word_a & 1;
+    word_s >>= 1;
+    word_a >>= 1;
+    shift++;
+
+    if (shift == BITS_PER_WORD) {
+      words_s++;
+      words_a++;
+      word_s = ~(*words_s);
+      word_a = *words_a;
+      shift = 0;
+    }
+  }
+}
+
+// Polynomial inversion
+// --------------------
+
+// poly_invert_mod2 sets |*out| to |in^-1| (i.e.
+// such that |*out|×|in| = 1 mod
+// Φ(N)), all mod 2. This isn't useful in itself, but is part of doing inversion
+// mod Q.
+static void poly_invert_mod2(struct poly *out, const struct poly *in) {
+  // This algorithm follows algorithm 10 in the paper. (Although, in contrast to
+  // the paper, k should start at zero, not one, and the rotation count needs
+  // to handle trailing zero coefficients.) The best explanation for why it
+  // works is in the "Why it works" section of [NTRUTN14].
+
+  struct poly2 b, c, f, g;
+  poly2_from_poly(&f, in);
+  OPENSSL_memset(&b, 0, sizeof(b));
+  b.v[0] = 1;
+  OPENSSL_memset(&c, 0, sizeof(c));
+
+  // Set g to all ones.
+  OPENSSL_memset(&g, 0xff, sizeof(struct poly2));
+  g.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD;
+
+  crypto_word_t deg_f = N - 1, deg_g = N - 1, rotation = 0;
+  // |still_going| is an all-ones mask until |deg_f| reaches zero. The loop
+  // always runs the same number of iterations; once the mask is clear the
+  // conditional swaps and additions below are no-ops.
+  crypto_word_t still_going = CONSTTIME_TRUE_W;
+
+  for (unsigned i = 0; i < 2 * (N - 1) - 1; i++) {
+    // |s| is all-ones iff we are still going and f has a constant term.
+    const crypto_word_t s = still_going & lsb_to_all(f.v[0]);
+    const crypto_word_t should_swap = s & constant_time_lt_w(deg_f, deg_g);
+    poly2_cswap(&f, &g, should_swap);
+    poly2_cswap(&b, &c, should_swap);
+    // Swap |deg_f| and |deg_g| too, using a masked XOR.
+    const crypto_word_t deg_sum = should_swap & (deg_f ^ deg_g);
+    deg_f ^= deg_sum;
+    deg_g ^= deg_sum;
+    assert(deg_g >= 1);
+    poly2_fmadd(&f, &g, s);
+    poly2_fmadd(&b, &c, s);
+
+    poly2_rshift1(&f);
+    poly2_lshift1(&c);
+
+    deg_f--;
+    const crypto_word_t f0_is_nonzero = lsb_to_all(f.v[0]);
+    // |f0_is_nonzero| implies |still_going|.
+    assert(!(f0_is_nonzero && !still_going));
+    // Remember the most recent iteration that left f with a constant term.
+    rotation = constant_time_select_w(f0_is_nonzero, i, rotation);
+    still_going &= ~constant_time_is_zero_w(deg_f);
+  }
+
+  rotation++;
+  // Reduce |rotation| mod N (a single conditional subtraction) without
+  // branching.
+  rotation -= N & constant_time_lt_w(N, rotation);
+  assert(poly2_top_bits_are_clear(&b));
+  HRSS_poly2_rotr_consttime(&b, rotation);
+  poly_from_poly2(out, &b);
+}
+
+// poly_invert sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod Φ(N)).
+static void poly_invert(struct poly *out, const struct poly *in) {
+  // Inversion mod Q, which is done based on the result of inverting mod
+  // 2. See [NTRUTN14] paper, bottom of page two.
+  struct poly a, *b, tmp;
+
+  // a = -in.
+  for (unsigned i = 0; i < N; i++) {
+    a.v[i] = -in->v[i];
+  }
+
+  // b = in^-1 mod 2.
+  b = out;
+  poly_invert_mod2(b, in);
+
+  // We are working mod Q=2**13 and we need to iterate ceil(log_2(13))
+  // times, which is four.
+  for (unsigned i = 0; i < 4; i++) {
+    // Each step computes b = b × (2 - in × b): |tmp| is first set to
+    // -in × b and then two is added to its constant term.
+    poly_mul(&tmp, &a, b);
+    tmp.v[0] += 2;
+    poly_mul(b, b, &tmp);
+  }
+}
+
+// Marshal and unmarshal functions for various basic types.
+// --------------------------------------------------------
+
+#define POLY_BYTES 1138
+
+// poly_marshal serialises the first N-1 coefficients of |in| to |out|, using
+// the low 13 bits of each. Every group of eight coefficients occupies 13
+// bytes. The final coefficient, in->v[N-1], is not written.
+static void poly_marshal(uint8_t out[POLY_BYTES], const struct poly *in) {
+  const uint16_t *p = in->v;
+
+  for (size_t i = 0; i < N / 8; i++) {
+    out[0] = p[0];
+    out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5);
+    out[2] = p[1] >> 3;
+    out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2);
+    out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7);
+    out[5] = p[3] >> 1;
+    out[6] = (0xf & (p[3] >> 9)) | ((p[4] & 0x0f) << 4);
+    out[7] = p[4] >> 4;
+    out[8] = (1 & (p[4] >> 12)) | ((p[5] & 0x7f) << 1);
+    out[9] = (0x3f & (p[5] >> 7)) | ((p[6] & 0x03) << 6);
+    out[10] = p[6] >> 2;
+    out[11] = (7 & (p[6] >> 10)) | ((p[7] & 0x1f) << 3);
+    out[12] = p[7] >> 5;
+
+    p += 8;
+    out += 13;
+  }
+
+  // There are four remaining values.
+  out[0] = p[0];
+  out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5);
+  out[2] = p[1] >> 3;
+  out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2);
+  out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7);
+  out[5] = p[3] >> 1;
+  // Only the low four bits of the final byte are used; the rest are zero.
+  out[6] = 0xf & (p[3] >> 9);
+}
+
+// poly_unmarshal is the inverse of |poly_marshal|: it parses N-1 13-bit
+// coefficients from |in|, sign-extends each to 16 bits and then computes the
+// final coefficient so that all N coefficients sum to zero.
+static void poly_unmarshal(struct poly *out, const uint8_t in[POLY_BYTES]) {
+  uint16_t *p = out->v;
+
+  for (size_t i = 0; i < N / 8; i++) {
+    p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8;
+    p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 |
+           (uint16_t)(in[3] & 3) << 11;
+    p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6;
+    p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 |
+           (uint16_t)(in[6] & 0xf) << 9;
+    p[4] = (uint16_t)(in[6] >> 4) | (uint16_t)(in[7]) << 4 |
+           (uint16_t)(in[8] & 1) << 12;
+    p[5] = (uint16_t)(in[8] >> 1) | (uint16_t)(in[9] & 0x3f) << 7;
+    p[6] = (uint16_t)(in[9] >> 6) | (uint16_t)(in[10]) << 2 |
+           (uint16_t)(in[11] & 7) << 10;
+    p[7] = (uint16_t)(in[11] >> 3) | (uint16_t)(in[12]) << 5;
+
+    p += 8;
+    in += 13;
+  }
+
+  // There are four coefficients remaining.
+  p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8;
+  p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 |
+         (uint16_t)(in[3] & 3) << 11;
+  p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6;
+  p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 |
+         (uint16_t)(in[6] & 0xf) << 9;
+
+  // Sign-extend each 13-bit value to 16 bits by duplicating bit 12 upwards.
+  for (unsigned i = 0; i < N - 1; i++) {
+    out->v[i] = (int16_t)(out->v[i] << 3) >> 3;
+  }
+
+  // There are four unused bits at the top of the final byte. They are always
+  // marshaled as zero by this code but we allow them to take any value when
+  // parsing in order to support future extension.
+
+  // Set the final coefficient as specified in [HRSSNIST] 1.9.2 step 6.
+  uint32_t sum = 0;
+  for (size_t i = 0; i < N - 1; i++) {
+    sum += out->v[i];
+  }
+
+  out->v[N - 1] = (uint16_t)(0u - sum);
+}
+
+// mod3_from_modQ maps {0, 1, Q-1, 65535} -> {0, 1, 2, 2}.
+// Note that |v| may
+// have an invalid value when processing attacker-controlled inputs.
+static uint16_t mod3_from_modQ(uint16_t v) {
+  v &= 3;
+  return v ^ (v >> 1);
+}
+
+// poly_marshal_mod3 marshals |in| to |out| where the coefficients of |in| are
+// all in {0, 1, Q-1, 65535} and |in| is mod Φ(N). (Note that coefficients may
+// have invalid values when processing attacker-controlled inputs.)
+static void poly_marshal_mod3(uint8_t out[HRSS_POLY3_BYTES],
+                              const struct poly *in) {
+  const uint16_t *coeffs = in->v;
+
+  // Only 700 coefficients are marshaled because in[700] must be zero.
+  assert(coeffs[N-1] == 0);
+
+  // Five coefficients, each in {0, 1, 2}, are packed into each output byte as
+  // base-3 digits.
+  for (size_t i = 0; i < HRSS_POLY3_BYTES; i++) {
+    const uint16_t coeffs0 = mod3_from_modQ(coeffs[0]);
+    const uint16_t coeffs1 = mod3_from_modQ(coeffs[1]);
+    const uint16_t coeffs2 = mod3_from_modQ(coeffs[2]);
+    const uint16_t coeffs3 = mod3_from_modQ(coeffs[3]);
+    const uint16_t coeffs4 = mod3_from_modQ(coeffs[4]);
+    out[i] = coeffs0 + coeffs1 * 3 + coeffs2 * 9 + coeffs3 * 27 + coeffs4 * 81;
+    coeffs += 5;
+  }
+}
+
+// HRSS-specific functions
+// -----------------------
+
+// poly_short_sample implements the sampling algorithm given in [HRSSNIST]
+// section 1.8.1. The output coefficients are in {0, 1, 0xffff} which makes some
+// later computation easier.
+static void poly_short_sample(struct poly *out,
+                              const uint8_t in[HRSS_SAMPLE_BYTES]) {
+  // We wish to calculate the difference (mod 3) between two, two-bit numbers.
+  // Here is a table of results for a - b. Negative one is written as 0b11 so
+  // that a couple of shifts can be used to sign-extend it. Any input value of
+  // 0b11 is invalid and a convention is adopted that an invalid input results
+  // in an invalid output (0b10).
+  //
+  //    b  a result
+  //   00 00 00
+  //   00 01 01
+  //   00 10 11
+  //   00 11 10
+  //   01 00 11
+  //   01 01 00
+  //   01 10 01
+  //   01 11 10
+  //   10 00 01
+  //   10 01 11
+  //   10 10 00
+  //   10 11 10
+  //   11 00 10
+  //   11 01 10
+  //   11 10 10
+  //   11 11 10
+  //
+  // The result column is encoded in a single-word lookup-table:
+  // 0001 1110 1100 0110 0111 0010 1010 1010
+  //   1    d    c    6    7    2    a    a
+  //
+  // NOTE(review): the binary row above spells 0x1e…, while the hex row and
+  // |kLookup| say 0x1d…. They differ only in the entry for b=00, a=11 (01
+  // rather than 10), which cannot be reached because each two-bit field of
+  // |sums| below is the sum of two single bits and so is at most 0b10.
+  static const uint32_t kLookup = 0x1dc672aa;
+
+  // In order to generate pairs of numbers mod 3 (non-uniformly) we treat pairs
+  // of bits in a uint32 as separate values and sum two random vectors of 1-bit
+  // numbers. This works because these pairs are isolated because no carry can
+  // spread between them.
+
+  uint16_t *p = out->v;
+  for (size_t i = 0; i < N / 8; i++) {
+    uint32_t v;
+    OPENSSL_memcpy(&v, in, sizeof(v));
+    in += sizeof(v);
+
+    uint32_t sums = (v & 0x55555555) + ((v >> 1) & 0x55555555);
+    for (unsigned j = 0; j < 8; j++) {
+      // Each nibble of |sums| indexes a two-bit entry of |kLookup|. The entry
+      // is moved to the top of the word and sign-extended by the shift right.
+      p[j] = (int32_t)(kLookup << ((sums & 15) << 1)) >> 30;
+      sums >>= 4;
+    }
+    p += 8;
+  }
+
+  // There are four values remaining.
+  uint16_t v;
+  OPENSSL_memcpy(&v, in, sizeof(v));
+
+  uint16_t sums = (v & 0x5555) + ((v >> 1) & 0x5555);
+  for (unsigned j = 0; j < 4; j++) {
+    p[j] = (int32_t)(kLookup << ((sums & 15) << 1)) >> 30;
+    sums >>= 4;
+  }
+
+  out->v[N - 1] = 0;
+}
+
+// poly_short_sample_plus performs the T+ sample as defined in [HRSSNIST],
+// section 1.8.2.
+static void poly_short_sample_plus(struct poly *out,
+                                   const uint8_t in[HRSS_SAMPLE_BYTES]) {
+  poly_short_sample(out, in);
+
+  // sum (and the product in the for loop) will overflow. But that's fine
+  // because |sum| is bound by +/- (N-2), and N < 2^15 so it works out.
+  uint16_t sum = 0;
+  for (unsigned i = 0; i < N - 2; i++) {
+    sum += (unsigned) out->v[i] * out->v[i + 1];
+  }
+
+  // If the sum is negative, flip the sign of even-positioned coefficients. (See
+  // page 8 of [HRSS].)
+  // |sum| becomes 0xffff if it was negative and zero otherwise, so |scale| is
+  // 0xffff (i.e. -1) or 1 respectively.
+  sum = ((int16_t) sum) >> 15;
+  const uint16_t scale = sum | (~sum & 1);
+  for (unsigned i = 0; i < N; i += 2) {
+    out->v[i] = (unsigned) out->v[i] * scale;
+  }
+}
+
+// poly_lift computes the function discussed in [HRSS], appendix B.
+static void poly_lift(struct poly *out, const struct poly *a) {
+  // We wish to calculate a/(𝑥-1) mod Φ(N) over GF(3), where Φ(N) is the
+  // Nth cyclotomic polynomial, i.e. 1 + 𝑥 + … + 𝑥^700 (since N is prime).
+
+  // 1/(𝑥-1) has a fairly basic structure that we can exploit to speed this up:
+  //
+  // R.<x> = PolynomialRing(GF(3)…)
+  // inv = R.cyclotomic_polynomial(1).inverse_mod(R.cyclotomic_polynomial(n))
+  // list(inv)[:15]
+  //   [1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2]
+  //
+  // This three-element pattern of coefficients repeats for the whole
+  // polynomial.
+  //
+  // Next define the overbar operator such that z̅ = z[0] +
+  // reverse(z[1:]). (Index zero of a polynomial here is the coefficient
+  // of the constant term. So index one is the coefficient of 𝑥 and so
+  // on.)
+  //
+  // A less odd way to define this is to see that z̅ negates the indexes,
+  // so z̅[0] = z[-0], z̅[1] = z[-1] and so on.
+  //
+  // The use of z̅ is that, when working mod (𝑥^701 - 1), vz[0] = <v,
+  // z̅>, vz[1] = <v, 𝑥z̅>, …. (Where <a, b> is the inner product: the sum
+  // of the point-wise products.) Although we calculated the inverse mod
+  // Φ(N), we can work mod (𝑥^N - 1) and reduce mod Φ(N) at the end.
+  // (That's because (𝑥^N - 1) is a multiple of Φ(N).)
+  //
+  // When working mod (𝑥^N - 1), multiplication by 𝑥 is a right-rotation
+  // of the list of coefficients.
+  //
+  // Thus we can consider what the pattern of z̅, 𝑥z̅, 𝑥^2z̅, … looks like:
+  //
+  // def reverse(xs):
+  //   suffix = list(xs[1:])
+  //   suffix.reverse()
+  //   return [xs[0]] + suffix
+  //
+  // def rotate(xs):
+  //   return [xs[-1]] + xs[:-1]
+  //
+  // zoverbar = reverse(list(inv) + [0])
+  // xzoverbar = rotate(reverse(list(inv) + [0]))
+  // x2zoverbar = rotate(rotate(reverse(list(inv) + [0])))
+  //
+  // zoverbar[:15]
+  //   [1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1]
+  // xzoverbar[:15]
+  //   [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
+  // x2zoverbar[:15]
+  //   [2, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]
+  //
+  // (For a formula for z̅, see lemma two of appendix B.)
+  //
+  // After the first three elements have been taken care of, all then have
+  // a repeating three-element cycle. The next value (𝑥^3z̅) involves
+  // three rotations of the first pattern, thus the three-element cycle
+  // lines up. However, the discontinuity in the first three elements
+  // obviously moves to a different position. Consider the difference
+  // between 𝑥^3z̅ and z̅:
+  //
+  // [x-y for (x,y) in zip(zoverbar, x3zoverbar)][:15]
+  //    [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+  //
+  // This pattern of differences is the same for all elements, although it
+  // obviously moves right with the rotations.
+  //
+  // From this, we reach algorithm eight of appendix B.
+
+  // Handle the first three elements of the inner products.
+  out->v[0] = a->v[0] + a->v[2];
+  out->v[1] = a->v[1];
+  out->v[2] = -a->v[0] + a->v[2];
+
+  // s0, s1, s2 are added into out->v[0], out->v[1], and out->v[2],
+  // respectively. We do not compute s1 because it's just -(s0 + s2).
+  uint16_t s0 = 0, s2 = 0;
+  for (size_t i = 3; i < 699; i += 3) {
+    s0 += -a->v[i] + a->v[i + 2];
+    // s1 += a->v[i] - a->v[i + 1];
+    s2 += a->v[i + 1] - a->v[i + 2];
+  }
+
+  // Handle the fact that the three-element pattern doesn't fill the
+  // polynomial exactly (since 701 isn't a multiple of three).
+  s0 -= a->v[699];
+  // s1 += a->v[699] - a->v[700];
+  s2 += a->v[700];
+
+  // Note that s0 + s1 + s2 = 0.
+  out->v[0] += s0;
+  out->v[1] -= (s0 + s2);  // = s1
+  out->v[2] += s2;
+
+  // Calculate the remaining inner products by taking advantage of the
+  // fact that the pattern repeats every three cycles and the pattern of
+  // differences moves with the rotation.
+  for (size_t i = 3; i < N; i++) {
+    out->v[i] = (out->v[i - 3] - (a->v[i - 2] + a->v[i - 1] + a->v[i]));
+  }
+
+  // Reduce mod Φ(N) by subtracting a multiple of out[700] from every
+  // element and convert to mod Q. (See above about adding twice as
+  // subtraction.)
+  const crypto_word_t v = out->v[700];
+  for (unsigned i = 0; i < N; i++) {
+    const uint16_t vi_mod3 = mod3(out->v[i] - v);
+    // Map {0, 1, 2} to {0, 1, 0xffff}.
+    out->v[i] = (~((vi_mod3 >> 1) - 1)) | vi_mod3;
+  }
+
+  // Multiply the quotient by (𝑥 - 1) again.
+  poly_mul_x_minus_1(out);
+}
+
+// public_key is the internal form of a public key. It is stored, suitably
+// aligned, inside the memory of a |struct HRSS_public_key|.
+struct public_key {
+  struct poly ph;
+};
+
+// private_key is the internal form of a private key. It is stored, suitably
+// aligned, inside the memory of a |struct HRSS_private_key|.
+struct private_key {
+  struct poly3 f, f_inverse;
+  struct poly ph_inverse;
+  // hmac_key is used by |HRSS_decap| to derive the key that is returned when
+  // decapsulation fails.
+  uint8_t hmac_key[32];
+};
+
+// public_key_from_external converts an external public key pointer into an
+// internal one. Externally the alignment is only specified to be eight bytes
+// but we need 16-byte alignment. We could annotate the external struct with
+// that alignment but we can only assume that malloced pointers are 8-byte
+// aligned in any case. (Even if the underlying malloc returns values with
+// 16-byte alignment, |OPENSSL_malloc| will store an 8-byte size prefix and mess
+// that up.)
+static struct public_key *public_key_from_external(
+    struct HRSS_public_key *ext) {
+  // The external struct must have room for up to 15 bytes of alignment padding
+  // in front of the internal one.
+  OPENSSL_STATIC_ASSERT(
+      sizeof(struct HRSS_public_key) >= sizeof(struct public_key) + 15,
+      "HRSS public key too small");
+
+  // Round the address up to the next multiple of 16.
+  uintptr_t p = (uintptr_t)ext;
+  p = (p + 15) & ~15;
+  return (struct public_key *)p;
+}
+
+// private_key_from_external does the same thing as |public_key_from_external|,
+// but for private keys. See the comment on that function about alignment
+// issues.
+static struct private_key *private_key_from_external(
+    struct HRSS_private_key *ext) {
+  OPENSSL_STATIC_ASSERT(
+      sizeof(struct HRSS_private_key) >= sizeof(struct private_key) + 15,
+      "HRSS private key too small");
+
+  // Round the address up to the next multiple of 16.
+  uintptr_t p = (uintptr_t)ext;
+  p = (p + 15) & ~15;
+  return (struct private_key *)p;
+}
+
+// HRSS_generate_key deterministically derives a key pair from |in|. The first
+// |HRSS_SAMPLE_BYTES| of |in| are used to sample f, the next
+// |HRSS_SAMPLE_BYTES| to sample g, and the final 32 bytes are copied into the
+// private key's |hmac_key|.
+void HRSS_generate_key(
+    struct HRSS_public_key *out_pub, struct HRSS_private_key *out_priv,
+    const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES + 32]) {
+  struct public_key *pub = public_key_from_external(out_pub);
+  struct private_key *priv = private_key_from_external(out_priv);
+
+  OPENSSL_memcpy(priv->hmac_key, in + 2 * HRSS_SAMPLE_BYTES,
+                 sizeof(priv->hmac_key));
+
+  struct poly f;
+  poly_short_sample_plus(&f, in);
+  poly3_from_poly(&priv->f, &f);
+  HRSS_poly3_invert(&priv->f_inverse, &priv->f);
+
+  // pg_phi1 is p (i.e. 3) × g × Φ(1) (i.e. 𝑥-1).
+  struct poly pg_phi1;
+  poly_short_sample_plus(&pg_phi1, in + HRSS_SAMPLE_BYTES);
+  for (unsigned i = 0; i < N; i++) {
+    pg_phi1.v[i] *= 3;
+  }
+  poly_mul_x_minus_1(&pg_phi1);
+
+  struct poly pfg_phi1;
+  poly_mul(&pfg_phi1, &f, &pg_phi1);
+
+  // A single inversion, of f × pg_phi1, serves for both keys below.
+  struct poly pfg_phi1_inverse;
+  poly_invert(&pfg_phi1_inverse, &pfg_phi1);
+
+  // ph = (f × pg_phi1)^-1 × pg_phi1 × pg_phi1.
+  poly_mul(&pub->ph, &pfg_phi1_inverse, &pg_phi1);
+  poly_mul(&pub->ph, &pub->ph, &pg_phi1);
+  poly_clamp(&pub->ph);
+
+  // ph_inverse = (f × pg_phi1)^-1 × f × f.
+  poly_mul(&priv->ph_inverse, &pfg_phi1_inverse, &f);
+  poly_mul(&priv->ph_inverse, &priv->ph_inverse, &f);
+  poly_clamp(&priv->ph_inverse);
+}
+
+// owf marshals |r| × |pub->ph| + |m_lifted| into |out|.
+static void owf(uint8_t out[POLY_BYTES], const struct public_key *pub,
+                const struct poly *m_lifted, const struct poly *r) {
+  struct poly prh_plus_m;
+  poly_mul(&prh_plus_m, r, &pub->ph);
+  for (unsigned i = 0; i < N; i++) {
+    prh_plus_m.v[i] += m_lifted->v[i];
+  }
+
+  poly_marshal(out, &prh_plus_m);
+}
+
+// kSharedKey is hashed, including its terminating NUL byte, at the start of
+// the shared-key derivation in |HRSS_encap| and |HRSS_decap|.
+static const char kSharedKey[] = "shared key";
+
+// HRSS_encap samples m and r from |in|, writes the resulting ciphertext to
+// |out_ciphertext| and sets |out_shared_key| to the SHA-256 hash of
+// |kSharedKey|, the marshaled m and r, and the ciphertext.
+void HRSS_encap(uint8_t out_ciphertext[POLY_BYTES],
+                uint8_t out_shared_key[32],
+                const struct HRSS_public_key *in_pub,
+                const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES]) {
+  const struct public_key *pub =
+      public_key_from_external((struct HRSS_public_key *)in_pub);
+  struct poly m, r, m_lifted;
+  poly_short_sample(&m, in);
+  poly_short_sample(&r, in + HRSS_SAMPLE_BYTES);
+  poly_lift(&m_lifted, &m);
+  owf(out_ciphertext, pub, &m_lifted, &r);
+
+  uint8_t m_bytes[HRSS_POLY3_BYTES], r_bytes[HRSS_POLY3_BYTES];
+  poly_marshal_mod3(m_bytes, &m);
+  poly_marshal_mod3(r_bytes, &r);
+
+  SHA256_CTX hash_ctx;
+  SHA256_Init(&hash_ctx);
+  SHA256_Update(&hash_ctx, kSharedKey, sizeof(kSharedKey));
+  SHA256_Update(&hash_ctx, m_bytes, sizeof(m_bytes));
+  SHA256_Update(&hash_ctx, r_bytes, sizeof(r_bytes));
+  SHA256_Update(&hash_ctx, out_ciphertext, POLY_BYTES);
+  SHA256_Final(out_shared_key, &hash_ctx);
+}
+
+// HRSS_decap sets |out_shared_key| to the shared key for |ciphertext|. It
+// always produces a key: |out_shared_key| is first set to an HMAC of the
+// ciphertext under the private key's |hmac_key| and is only replaced by the
+// real shared key if decryption and re-encryption succeed.
+void HRSS_decap(uint8_t out_shared_key[HRSS_KEY_BYTES],
+                const struct HRSS_public_key *in_pub,
+                const struct HRSS_private_key *in_priv,
+                const uint8_t *ciphertext, size_t ciphertext_len) {
+  const struct public_key *pub =
+      public_key_from_external((struct HRSS_public_key *)in_pub);
+  const struct private_key *priv =
+      private_key_from_external((struct HRSS_private_key *)in_priv);
+
+  // This is HMAC, expanded inline rather than using the |HMAC| function so that
+  // we can avoid dealing with possible allocation failures and so keep this
+  // function infallible.
+  uint8_t masked_key[SHA256_CBLOCK];
+  OPENSSL_STATIC_ASSERT(sizeof(priv->hmac_key) <= sizeof(masked_key),
+                        "HRSS HMAC key larger than SHA-256 block size");
+  // Inner hash: the key, padded with zeros to the block size, XORed with 0x36.
+  for (size_t i = 0; i < sizeof(priv->hmac_key); i++) {
+    masked_key[i] = priv->hmac_key[i] ^ 0x36;
+  }
+  OPENSSL_memset(masked_key + sizeof(priv->hmac_key), 0x36,
+                 sizeof(masked_key) - sizeof(priv->hmac_key));
+
+  SHA256_CTX hash_ctx;
+  SHA256_Init(&hash_ctx);
+  SHA256_Update(&hash_ctx, masked_key, sizeof(masked_key));
+  SHA256_Update(&hash_ctx, ciphertext, ciphertext_len);
+  uint8_t inner_digest[SHA256_DIGEST_LENGTH];
+  SHA256_Final(inner_digest, &hash_ctx);
+
+  // Outer hash: convert the 0x36 mask into the 0x5c mask in place.
+  for (size_t i = 0; i < sizeof(priv->hmac_key); i++) {
+    masked_key[i] ^= (0x5c ^ 0x36);
+  }
+  OPENSSL_memset(masked_key + sizeof(priv->hmac_key), 0x5c,
+                 sizeof(masked_key) - sizeof(priv->hmac_key));
+
+  SHA256_Init(&hash_ctx);
+  SHA256_Update(&hash_ctx, masked_key, sizeof(masked_key));
+  SHA256_Update(&hash_ctx, inner_digest, sizeof(inner_digest));
+  OPENSSL_STATIC_ASSERT(HRSS_KEY_BYTES == SHA256_DIGEST_LENGTH,
+                        "HRSS shared key length incorrect");
+  SHA256_Final(out_shared_key, &hash_ctx);
+
+  // If the ciphertext is publicly invalid then a random shared key is still
+  // returned to simplify the logic of the caller, but this path is not constant
+  // time.
+  if (ciphertext_len != HRSS_CIPHERTEXT_BYTES) {
+    return;
+  }
+
+  struct poly c;
+  poly_unmarshal(&c, ciphertext);
+
+  struct poly f;
+  poly_from_poly3(&f, &priv->f);
+
+  struct poly cf;
+  poly_mul(&cf, &c, &f);
+
+  struct poly3 cf3;
+  poly3_from_poly(&cf3, &cf);
+  // Note that cf3 is not reduced mod Φ(N). That reduction is deferred.
+
+  struct poly3 m3;
+  HRSS_poly3_mul(&m3, &cf3, &priv->f_inverse);
+
+  struct poly m, m_lifted;
+  poly_from_poly3(&m, &m3);
+  poly_lift(&m_lifted, &m);
+
+  // Recover r: subtract the lifted message from the ciphertext and multiply
+  // by the inverse of the public value. From here on |c| holds the candidate
+  // r rather than the ciphertext.
+  for (unsigned i = 0; i < N; i++) {
+    c.v[i] -= m_lifted.v[i];
+  }
+  poly_mul(&c, &c, &priv->ph_inverse);
+  poly_mod_phiN(&c);
+  poly_clamp(&c);
+
+  // |ok| is cleared if any coefficient of the recovered r is outside
+  // {0, 1, Q-1}.
+  struct poly3 r3;
+  crypto_word_t ok = poly3_from_poly_checked(&r3, &c);
+
+  // Re-encrypt the recovered (m, r) pair and require that it reproduces the
+  // ciphertext that was received.
+  uint8_t expected_ciphertext[HRSS_CIPHERTEXT_BYTES];
+  OPENSSL_STATIC_ASSERT(HRSS_CIPHERTEXT_BYTES == POLY_BYTES,
+                        "ciphertext is the wrong size");
+  assert(ciphertext_len == sizeof(expected_ciphertext));
+  owf(expected_ciphertext, pub, &m_lifted, &c);
+
+  uint8_t m_bytes[HRSS_POLY3_BYTES];
+  uint8_t r_bytes[HRSS_POLY3_BYTES];
+  poly_marshal_mod3(m_bytes, &m);
+  poly_marshal_mod3(r_bytes, &c);
+
+  ok &= constant_time_is_zero_w(CRYPTO_memcmp(ciphertext, expected_ciphertext,
+                                              sizeof(expected_ciphertext)));
+
+  uint8_t shared_key[32];
+  SHA256_Init(&hash_ctx);
+  SHA256_Update(&hash_ctx, kSharedKey, sizeof(kSharedKey));
+  SHA256_Update(&hash_ctx, m_bytes, sizeof(m_bytes));
+  SHA256_Update(&hash_ctx, r_bytes, sizeof(r_bytes));
+  SHA256_Update(&hash_ctx, expected_ciphertext, sizeof(expected_ciphertext));
+  SHA256_Final(shared_key, &hash_ctx);
+
+  // Select, without branching on |ok|, between the real shared key and the
+  // HMAC-derived fallback that is already in |out_shared_key|.
+  for (unsigned i = 0; i < sizeof(shared_key); i++) {
+    out_shared_key[i] =
+        constant_time_select_8(ok, shared_key[i], out_shared_key[i]);
+  }
+}
+
+// HRSS_marshal_public_key serialises |in_pub| to |out|.
+void HRSS_marshal_public_key(uint8_t out[HRSS_PUBLIC_KEY_BYTES],
+                             const struct HRSS_public_key *in_pub) {
+  const struct public_key *pub =
+      public_key_from_external((struct HRSS_public_key *)in_pub);
+  poly_marshal(out, &pub->ph);
+}
+
+// HRSS_parse_public_key sets |*out| to the public key serialised in |in|. It
+// always returns one.
+int HRSS_parse_public_key(struct HRSS_public_key *out,
+                          const uint8_t in[HRSS_PUBLIC_KEY_BYTES]) {
+  struct public_key *pub = public_key_from_external(out);
+  poly_unmarshal(&pub->ph, in);
+  // Zero the three elements of |v| that follow the N coefficients.
+  OPENSSL_memset(&pub->ph.v[N], 0, 3 * sizeof(uint16_t));
+  return 1;
+}
diff --git a/src/crypto/hrss/hrss_test.cc b/src/crypto/hrss/hrss_test.cc
new file mode 100644
index
00000000..ead717d6 --- /dev/null +++ b/src/crypto/hrss/hrss_test.cc @@ -0,0 +1,472 @@ +/* Copyright (c) 2018, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#include <gtest/gtest.h> + +#include <openssl/hrss.h> +#include <openssl/rand.h> + +#include "../test/test_util.h" +#include "internal.h" + +// poly2_from_bits takes the least-significant bit from each byte of |in| and +// sets the bits of |*out| to match. +static void poly2_from_bits(struct poly2 *out, const uint8_t in[N]) { + crypto_word_t *words = out->v; + unsigned shift = 0; + crypto_word_t word = 0; + + for (unsigned i = 0; i < N; i++) { + word >>= 1; + word |= (crypto_word_t)(in[i] & 1) << (BITS_PER_WORD - 1); + shift++; + + if (shift == BITS_PER_WORD) { + *words = word; + words++; + word = 0; + shift = 0; + } + } + + word >>= BITS_PER_WORD - shift; + *words = word; +} + +TEST(HRSS, Poly2RotateRight) { + uint8_t bits[N]; + RAND_bytes(bits, sizeof(bits)); + for (size_t i = 0; i < N; i++) { + bits[i] &= 1; + }; + + struct poly2 p, orig, shifted; + poly2_from_bits(&p, bits); + OPENSSL_memcpy(&orig, &p, sizeof(orig)); + + // Test |HRSS_poly2_rotr_consttime| by manually rotating |bits| step-by-step + // and testing every possible shift to ensure that it produces the correct + // answer. 
+ for (size_t shift = 0; shift <= N; shift++) { + SCOPED_TRACE(shift); + + OPENSSL_memcpy(&p, &orig, sizeof(orig)); + HRSS_poly2_rotr_consttime(&p, shift); + poly2_from_bits(&shifted, bits); + ASSERT_EQ( + Bytes(reinterpret_cast<const uint8_t *>(&shifted), sizeof(shifted)), + Bytes(reinterpret_cast<const uint8_t *>(&p), sizeof(p))); + + const uint8_t least_significant_bit = bits[0]; + OPENSSL_memmove(bits, &bits[1], N-1); + bits[N-1] = least_significant_bit; + } +} + +// poly3_rand sets |r| to a random value (albeit with bias). +static void poly3_rand(poly3 *p) { + RAND_bytes(reinterpret_cast<uint8_t *>(p), sizeof(poly3)); + p->s.v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1; + p->a.v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1; + // (s, a) = (1, 1) is invalid. Map those to one. + for (size_t j = 0; j < WORDS_PER_POLY; j++) { + p->s.v[j] ^= p->s.v[j] & p->a.v[j]; + } +} + +// poly3_word_add sets (|s1|, |a1|) += (|s2|, |a2|). +static void poly3_word_add(crypto_word_t *s1, crypto_word_t *a1, + const crypto_word_t s2, const crypto_word_t a2) { + const crypto_word_t x = *a1 ^ a2; + const crypto_word_t y = (*s1 ^ s2) ^ (*a1 & a2); + const crypto_word_t z = *s1 & s2; + *s1 = y & ~x; + *a1 = z | (x & ~y); +} + +TEST(HRSS, Poly3Invert) { + poly3 p, inverse, result; + memset(&p, 0, sizeof(p)); + memset(&inverse, 0, sizeof(inverse)); + memset(&result, 0, sizeof(result)); + + // The inverse of -1 is -1. + p.s.v[0] = 1; + HRSS_poly3_invert(&inverse, &p); + EXPECT_EQ(Bytes(reinterpret_cast<const uint8_t*>(&p), sizeof(p)), + Bytes(reinterpret_cast<const uint8_t*>(&inverse), sizeof(inverse))); + + // The inverse of 1 is 1. 
+ p.s.v[0] = 0; + p.a.v[0] = 1; + HRSS_poly3_invert(&inverse, &p); + EXPECT_EQ(Bytes(reinterpret_cast<const uint8_t*>(&p), sizeof(p)), + Bytes(reinterpret_cast<const uint8_t*>(&inverse), sizeof(inverse))); + + for (size_t i = 0; i < 500; i++) { + poly3 r; + poly3_rand(&r); + HRSS_poly3_invert(&inverse, &r); + HRSS_poly3_mul(&result, &inverse, &r); + // r×r⁻¹ = 1, and |p| contains 1. + EXPECT_EQ( + Bytes(reinterpret_cast<const uint8_t *>(&p), sizeof(p)), + Bytes(reinterpret_cast<const uint8_t *>(&result), sizeof(result))); + } +} + +TEST(HRSS, Poly3UnreducedInput) { + // Check that |poly3_mul| works correctly with inputs that aren't reduced mod + // Φ(N). + poly3 r, inverse, result, one; + poly3_rand(&r); + HRSS_poly3_invert(&inverse, &r); + HRSS_poly3_mul(&result, &inverse, &r); + + memset(&one, 0, sizeof(one)); + one.a.v[0] = 1; + EXPECT_EQ(Bytes(reinterpret_cast<const uint8_t *>(&one), sizeof(one)), + Bytes(reinterpret_cast<const uint8_t *>(&result), sizeof(result))); + + // |r| is probably already not reduced mod Φ(N), but add x^701 - 1 and + // recompute to ensure that we get the same answer. (Since (x^701 - 1) ≡ 0 mod + // Φ(N).) + poly3_word_add(&r.s.v[0], &r.a.v[0], 1, 0); + poly3_word_add(&r.s.v[WORDS_PER_POLY - 1], &r.a.v[WORDS_PER_POLY - 1], 0, + UINT64_C(1) << BITS_IN_LAST_WORD); + + HRSS_poly3_mul(&result, &inverse, &r); + EXPECT_EQ(Bytes(reinterpret_cast<const uint8_t *>(&one), sizeof(one)), + Bytes(reinterpret_cast<const uint8_t *>(&result), sizeof(result))); + + // Check that x^700 × 1 gives -x^699 - x^698 … -1. 
+ poly3 x700; + memset(&x700, 0, sizeof(x700)); + x700.a.v[WORDS_PER_POLY-1] = UINT64_C(1) << (BITS_IN_LAST_WORD - 1); + HRSS_poly3_mul(&result, &one, &x700); + + for (size_t i = 0; i < WORDS_PER_POLY-1; i++) { + EXPECT_EQ(CONSTTIME_TRUE_W, result.s.v[i]); + EXPECT_EQ(0u, result.a.v[i]); + } + EXPECT_EQ((UINT64_C(1) << (BITS_IN_LAST_WORD - 1)) - 1, + result.s.v[WORDS_PER_POLY - 1]); + EXPECT_EQ(0u, result.a.v[WORDS_PER_POLY - 1]); +} + +TEST(HRSS, Basic) { + uint8_t generate_key_entropy[HRSS_GENERATE_KEY_BYTES]; + for (unsigned i = 0; i < sizeof(generate_key_entropy); i++) { + generate_key_entropy[i] = i; + } + + HRSS_public_key pub; + HRSS_private_key priv; + HRSS_generate_key(&pub, &priv, generate_key_entropy); + + uint8_t encap_entropy[HRSS_ENCAP_BYTES]; + for (unsigned i = 0; i < sizeof(encap_entropy); i++) { + encap_entropy[i] = i; + } + + uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES]; + uint8_t shared_key[HRSS_KEY_BYTES]; + HRSS_encap(ciphertext, shared_key, &pub, encap_entropy); + + HRSS_public_key pub2; + uint8_t pub_bytes[HRSS_PUBLIC_KEY_BYTES]; + HRSS_marshal_public_key(pub_bytes, &pub); + ASSERT_TRUE(HRSS_parse_public_key(&pub2, pub_bytes)); + + uint8_t shared_key2[HRSS_KEY_BYTES]; + HRSS_decap(shared_key2, &pub2, &priv, ciphertext, sizeof(ciphertext)); + + EXPECT_EQ(Bytes(shared_key), Bytes(shared_key2)); +} + +TEST(HRSS, Random) { + for (unsigned i = 0; i < 10; i++) { + uint8_t generate_key_entropy[HRSS_GENERATE_KEY_BYTES]; + RAND_bytes(generate_key_entropy, sizeof(generate_key_entropy)); + SCOPED_TRACE(Bytes(generate_key_entropy)); + + HRSS_public_key pub; + HRSS_private_key priv; + HRSS_generate_key(&pub, &priv, generate_key_entropy); + + for (unsigned j = 0; j < 10; j++) { + uint8_t encap_entropy[HRSS_ENCAP_BYTES]; + RAND_bytes(encap_entropy, sizeof(encap_entropy)); + SCOPED_TRACE(Bytes(generate_key_entropy)); + + uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES]; + uint8_t shared_key[HRSS_KEY_BYTES]; + HRSS_encap(ciphertext, shared_key, &pub, encap_entropy); + 
+ uint8_t shared_key2[HRSS_KEY_BYTES]; + HRSS_decap(shared_key2, &pub, &priv, ciphertext, sizeof(ciphertext)); + + EXPECT_EQ(Bytes(shared_key), Bytes(shared_key2)); + } + } +} + +TEST(HRSS, Golden) { + uint8_t generate_key_entropy[HRSS_GENERATE_KEY_BYTES]; + for (unsigned i = 0; i < HRSS_SAMPLE_BYTES; i++) { + generate_key_entropy[i] = i; + } + for (unsigned i = HRSS_SAMPLE_BYTES; i < 2 * HRSS_SAMPLE_BYTES; i++) { + generate_key_entropy[i] = 2 + i; + } + for (unsigned i = 2 * HRSS_SAMPLE_BYTES; i < sizeof(generate_key_entropy); + i++) { + generate_key_entropy[i] = 4 + i; + } + + HRSS_public_key pub; + HRSS_private_key priv; + OPENSSL_memset(&pub, 0, sizeof(pub)); + OPENSSL_memset(&priv, 0, sizeof(priv)); + HRSS_generate_key(&pub, &priv, generate_key_entropy); + + static const uint8_t kExpectedPub[HRSS_PUBLIC_KEY_BYTES] = { + 0xf8, 0x9f, 0xa0, 0xfc, 0xf1, 0xd4, 0xfa, 0x4d, 0x8f, 0x35, 0x28, 0x73, + 0x0e, 0x37, 0x18, 0x1d, 0x09, 0xf3, 0x9e, 0x16, 0x0d, 0x7f, 0x9c, 0x82, + 0x17, 0xa1, 0xa1, 0x88, 0x6b, 0x29, 0x5b, 0x3a, 0x30, 0xcd, 0x6f, 0x8e, + 0x0c, 0xd3, 0x38, 0x0c, 0x05, 0x68, 0x6e, 0x4c, 0xcc, 0x20, 0xd4, 0x06, + 0x77, 0x0c, 0xac, 0x1c, 0x49, 0x14, 0x00, 0xd6, 0x9b, 0x1c, 0xde, 0x43, + 0x0a, 0x59, 0x37, 0xd6, 0x46, 0x68, 0x1f, 0x04, 0xcb, 0x73, 0x92, 0x37, + 0x2d, 0x7f, 0x57, 0x70, 0x16, 0xe8, 0x06, 0x48, 0x3b, 0x66, 0xb3, 0x63, + 0x02, 0x5a, 0x71, 0x46, 0xdd, 0xa4, 0xee, 0xb8, 0x78, 0x44, 0xfd, 0x9e, + 0xd0, 0x71, 0x16, 0x00, 0xbd, 0x01, 0x1e, 0x27, 0x2e, 0xa0, 0xc6, 0x8d, + 0x55, 0x89, 0x7c, 0x2a, 0x01, 0x2b, 0x1b, 0x75, 0xa2, 0xc2, 0xd1, 0x5a, + 0x67, 0xfa, 0xdd, 0x3b, 0x70, 0x9d, 0xdb, 0xcd, 0x73, 0x32, 0x5e, 0x24, + 0xb1, 0xcf, 0x23, 0xbe, 0x3c, 0x56, 0xcc, 0xbe, 0x61, 0xdb, 0xe7, 0x3c, + 0xc7, 0xf5, 0x09, 0xe6, 0x87, 0xa0, 0x09, 0x52, 0x9d, 0x61, 0x5b, 0xc6, + 0xd4, 0xc5, 0x2e, 0xc2, 0x6c, 0x87, 0x30, 0x36, 0x49, 0x6f, 0x04, 0xaa, + 0xb3, 0x26, 0xd5, 0x63, 0xcf, 0xd4, 0x74, 0x1e, 0xc7, 0x79, 0xb3, 0xfc, + 0x8c, 0x41, 0x36, 0x79, 0xaa, 0xd5, 0xba, 0x64, 
0x49, 0x48, 0xdb, 0xeb, + 0xe8, 0x33, 0x7d, 0xbe, 0x3b, 0x67, 0xd7, 0xfd, 0x93, 0x1e, 0x80, 0x8d, + 0x17, 0xab, 0x6f, 0xfd, 0x1c, 0x4b, 0x2d, 0x5b, 0x90, 0xf0, 0xf0, 0x5d, + 0xbe, 0x8f, 0x81, 0x18, 0x29, 0x08, 0x9a, 0x47, 0x1b, 0xc2, 0x2d, 0xa2, + 0x22, 0x5a, 0x4f, 0xe9, 0x81, 0x64, 0xdd, 0x53, 0x2e, 0x67, 0xe5, 0x07, + 0x1a, 0xf0, 0x0c, 0x54, 0x9b, 0xe2, 0xf8, 0xe6, 0xb3, 0xb6, 0xe0, 0x5a, + 0x74, 0xfa, 0x8d, 0x9c, 0xa5, 0x7c, 0x6e, 0x73, 0xba, 0xee, 0x6e, 0x6e, + 0x31, 0xcb, 0x59, 0xd7, 0xfd, 0x94, 0x1c, 0x4d, 0x62, 0xc6, 0x87, 0x0b, + 0x38, 0x54, 0xc6, 0x35, 0xac, 0xc8, 0x8c, 0xc0, 0xd9, 0x99, 0xee, 0xfc, + 0xa9, 0xde, 0xc4, 0x50, 0x88, 0x8e, 0x24, 0xf6, 0xd6, 0x04, 0x54, 0x3e, + 0x81, 0xc4, 0x96, 0x9a, 0x40, 0xe5, 0xef, 0x8b, 0xec, 0x41, 0x50, 0x1d, + 0x14, 0xae, 0xa4, 0x5a, 0xac, 0xd4, 0x73, 0x31, 0xc3, 0x1d, 0xc1, 0x96, + 0x89, 0xd8, 0x62, 0x97, 0x60, 0x3f, 0x58, 0x2a, 0x5f, 0xcf, 0xcb, 0x26, + 0x99, 0x69, 0x81, 0x13, 0x9c, 0xaf, 0x17, 0x91, 0xa8, 0xeb, 0x9a, 0xf9, + 0xd3, 0x83, 0x47, 0x66, 0xc7, 0xf8, 0xd8, 0xe3, 0xd2, 0x7e, 0x58, 0xa9, + 0xf5, 0xb2, 0x03, 0xbe, 0x7e, 0xa5, 0x29, 0x9d, 0xff, 0xd1, 0xd8, 0x55, + 0x39, 0xc7, 0x2c, 0xce, 0x03, 0x64, 0xdc, 0x18, 0xe7, 0xb0, 0x60, 0x46, + 0x26, 0xeb, 0xb7, 0x61, 0x4b, 0x91, 0x2c, 0xd8, 0xa2, 0xee, 0x63, 0x2e, + 0x15, 0x0a, 0x58, 0x88, 0x04, 0xb1, 0xed, 0x6d, 0xf1, 0x5c, 0xc7, 0xee, + 0x60, 0x38, 0x26, 0xc9, 0x31, 0x7e, 0x69, 0xe4, 0xac, 0x3c, 0x72, 0x09, + 0x3e, 0xe6, 0x24, 0x30, 0x44, 0x6e, 0x66, 0x83, 0xb9, 0x2a, 0x22, 0xaf, + 0x26, 0x1e, 0xaa, 0xa3, 0xf4, 0xb1, 0xa1, 0x5c, 0xfa, 0x5f, 0x0d, 0x71, + 0xac, 0xe3, 0xe0, 0xc3, 0xdd, 0x4f, 0x96, 0x57, 0x8b, 0x58, 0xac, 0xe3, + 0x42, 0x8e, 0x47, 0x72, 0xb1, 0xe4, 0x19, 0x68, 0x3e, 0xbb, 0x19, 0x14, + 0xdf, 0x16, 0xb5, 0xde, 0x7f, 0x37, 0xaf, 0xd8, 0xd3, 0x3d, 0x6a, 0x16, + 0x1b, 0x26, 0xd3, 0xcc, 0x53, 0x82, 0x57, 0x90, 0x89, 0xc5, 0x7e, 0x6d, + 0x7e, 0x99, 0x5b, 0xcd, 0xd3, 0x18, 0xbb, 0x89, 0xef, 0x76, 0xbd, 0xd2, + 0x62, 0xf0, 0xe8, 0x25, 0x2a, 0x8d, 0xe2, 0x21, 
0xea, 0xde, 0x6e, 0xa5, + 0xa4, 0x3d, 0x58, 0xee, 0xdf, 0x90, 0xc1, 0xa1, 0x38, 0x5d, 0x11, 0x50, + 0xb5, 0xac, 0x9d, 0xb4, 0xfd, 0xef, 0x53, 0xe8, 0xc0, 0x17, 0x6c, 0x4f, + 0x31, 0xe0, 0xcc, 0x8f, 0x80, 0x7a, 0x84, 0x14, 0xde, 0xee, 0xec, 0xdd, + 0x6a, 0xad, 0x29, 0x65, 0xa5, 0x72, 0xc3, 0x73, 0x5f, 0xe3, 0x6f, 0x60, + 0xb1, 0xfb, 0x0f, 0xaa, 0xc6, 0xda, 0x53, 0x4a, 0xb1, 0x92, 0x2a, 0xb7, + 0x02, 0xbe, 0xf9, 0xdf, 0x37, 0x16, 0xe7, 0x5c, 0x38, 0x0b, 0x3c, 0xe2, + 0xdd, 0x90, 0xb8, 0x7b, 0x48, 0x69, 0x79, 0x81, 0xc5, 0xae, 0x9a, 0x0d, + 0x78, 0x95, 0x52, 0x63, 0x80, 0xda, 0x46, 0x69, 0x20, 0x57, 0x9b, 0x27, + 0xe2, 0xe8, 0xbd, 0x2f, 0x45, 0xe6, 0x46, 0x40, 0xae, 0x50, 0xd5, 0xa2, + 0x53, 0x93, 0xe1, 0x99, 0xfd, 0x13, 0x7c, 0xf6, 0x22, 0xc4, 0x6c, 0xab, + 0xe3, 0xc9, 0x55, 0x0a, 0x16, 0x67, 0x68, 0x26, 0x6b, 0xd6, 0x7d, 0xde, + 0xd3, 0xae, 0x71, 0x32, 0x02, 0xf1, 0x27, 0x67, 0x47, 0x74, 0xd9, 0x40, + 0x35, 0x1d, 0x25, 0x72, 0x32, 0xdf, 0x75, 0xd5, 0x60, 0x26, 0xab, 0x90, + 0xfa, 0xeb, 0x26, 0x11, 0x4b, 0xb4, 0xc5, 0xc2, 0x3e, 0xa9, 0x23, 0x3a, + 0x4e, 0x6a, 0xb1, 0xbb, 0xb3, 0xea, 0xf9, 0x1e, 0xe4, 0x10, 0xf5, 0xdc, + 0x35, 0xde, 0xb5, 0xee, 0xf0, 0xde, 0xa1, 0x18, 0x80, 0xc7, 0x13, 0x68, + 0x46, 0x94, 0x0e, 0x2a, 0x8e, 0xf8, 0xe9, 0x26, 0x84, 0x42, 0x0f, 0x56, + 0xed, 0x67, 0x7f, 0xeb, 0x7d, 0x35, 0x07, 0x01, 0x11, 0x81, 0x8b, 0x56, + 0x88, 0xc6, 0x58, 0x61, 0x65, 0x3c, 0x5d, 0x9c, 0x58, 0x25, 0xd6, 0xdf, + 0x4e, 0x3b, 0x93, 0xbf, 0x82, 0xe1, 0x19, 0xb8, 0xda, 0xde, 0x26, 0x38, + 0xf2, 0xd9, 0x95, 0x24, 0x98, 0xde, 0x58, 0xf7, 0x0c, 0xe9, 0x32, 0xbb, + 0xcc, 0xf7, 0x92, 0x69, 0xa2, 0xf0, 0xc3, 0xfa, 0xd2, 0x31, 0x8b, 0x43, + 0x4e, 0x03, 0xe2, 0x13, 0x79, 0x6e, 0x73, 0x63, 0x3b, 0x45, 0xde, 0x80, + 0xf4, 0x26, 0xb1, 0x38, 0xed, 0x62, 0x55, 0xc6, 0x6a, 0x67, 0x00, 0x2d, + 0xba, 0xb2, 0xc5, 0xb6, 0x97, 0x62, 0x28, 0x64, 0x30, 0xb9, 0xfb, 0x3f, + 0x94, 0x03, 0x48, 0x36, 0x2c, 0x5d, 0xfd, 0x08, 0x96, 0x40, 0xd1, 0x6c, + 0xe5, 0xd0, 0xf8, 0x99, 0x40, 0x82, 0x87, 0xd7, 
0xdc, 0x2f, 0x8b, 0xaa, + 0x31, 0x96, 0x0a, 0x34, 0x33, 0xa6, 0xf1, 0x84, 0x6e, 0x33, 0x73, 0xc5, + 0xe3, 0x26, 0xad, 0xd0, 0xcb, 0x62, 0x71, 0x82, 0xab, 0xd1, 0x82, 0x33, + 0xe6, 0xca, 0xd0, 0x3e, 0xf5, 0x4d, 0x12, 0x6e, 0xf1, 0x83, 0xbd, 0xdc, + 0x4d, 0xdf, 0x49, 0xbc, 0x63, 0xae, 0x7e, 0x59, 0xe8, 0x3c, 0x0d, 0xd6, + 0x1d, 0x41, 0x89, 0x72, 0x52, 0xc0, 0xae, 0xd1, 0x2f, 0x0a, 0x8a, 0xce, + 0x26, 0xd0, 0x3e, 0x0c, 0x71, 0x32, 0x52, 0xb2, 0xe4, 0xee, 0xa2, 0xe5, + 0x28, 0xb6, 0x33, 0x69, 0x97, 0x5a, 0x53, 0xdb, 0x56, 0x63, 0xe9, 0xb3, + 0x6d, 0x60, 0xf4, 0x7a, 0xce, 0xec, 0x36, 0x65, 0xd5, 0xca, 0x63, 0x2a, + 0x19, 0x90, 0x14, 0x7b, 0x02, 0x33, 0xfa, 0x11, 0x58, 0x5a, 0xd9, 0xc5, + 0x54, 0xf3, 0x28, 0xd5, 0x6e, 0xea, 0x85, 0xf5, 0x09, 0xbb, 0x81, 0x44, + 0x1c, 0x63, 0x66, 0x81, 0xc5, 0x96, 0x2d, 0x7c, 0x0e, 0x75, 0x7b, 0xb4, + 0x7e, 0x4e, 0x0c, 0xfd, 0x3c, 0xc5, 0x5a, 0x22, 0x85, 0x5c, 0xc8, 0xf3, + 0x97, 0x98, 0x2c, 0xe9, 0x46, 0xb4, 0x02, 0xcf, 0x7d, 0xa4, 0xf2, 0x44, + 0x7a, 0x89, 0x71, 0xa0, 0xfa, 0xb6, 0xa3, 0xaf, 0x13, 0x25, 0x46, 0xe2, + 0x64, 0xe3, 0x69, 0xba, 0xf9, 0x68, 0x5c, 0xc0, 0xb7, 0xa8, 0xa6, 0x4b, + 0xe1, 0x42, 0xe9, 0xb5, 0xc7, 0x84, 0xbb, 0xa6, 0x4b, 0x10, 0x4e, 0xd4, + 0x68, 0x70, 0x0a, 0x75, 0x2a, 0xbb, 0x9d, 0xa0, 0xcb, 0xf0, 0x36, 0x4c, + 0x70, 0x6c, 0x60, 0x4d, 0xfe, 0xe8, 0xc8, 0x66, 0x80, 0x1b, 0xf7, 0xcc, + 0x1a, 0xdd, 0x6b, 0xa7, 0xa7, 0x25, 0x61, 0x0c, 0x31, 0xf0, 0x34, 0x63, + 0x00, 0x0e, 0x48, 0x6a, 0x5a, 0x8d, 0x47, 0x94, 0x3f, 0x14, 0x16, 0xa8, + 0x8a, 0x49, 0xbb, 0x0c, 0x43, 0x21, 0xda, 0xf2, 0xc5, 0xd0, 0xff, 0x19, + 0x3e, 0x36, 0x64, 0x20, 0xb3, 0x70, 0xae, 0x54, 0xca, 0x73, 0x05, 0x56, + 0x7a, 0x49, 0x45, 0xe9, 0x46, 0xbc, 0xc2, 0x61, 0x70, 0x40, 0x7c, 0xb0, + 0xf7, 0xea, 0xc0, 0xd1, 0xb0, 0x77, 0x2c, 0xc7, 0xdd, 0x88, 0xcb, 0x9d, + 0xea, 0x55, 0x6c, 0x5c, 0x28, 0xb8, 0x84, 0x1c, 0x2c, 0x06, + }; + uint8_t pub_bytes[HRSS_PUBLIC_KEY_BYTES]; + HRSS_marshal_public_key(pub_bytes, &pub); + EXPECT_EQ(Bytes(pub_bytes), 
Bytes(kExpectedPub)); + + uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES]; + uint8_t shared_key[HRSS_KEY_BYTES]; + OPENSSL_STATIC_ASSERT( + sizeof(kExpectedPub) >= HRSS_ENCAP_BYTES, + "Private key too small to use as input to HRSS encapsulation"); + HRSS_encap(ciphertext, shared_key, &pub, kExpectedPub); + + static const uint8_t kExpectedCiphertext[HRSS_CIPHERTEXT_BYTES] = { + 0x8e, 0x6b, 0x46, 0x9d, 0x4a, 0xef, 0xa6, 0x8c, 0x28, 0x7b, 0xec, 0x6f, + 0x13, 0x2d, 0x7f, 0x6c, 0xca, 0x7d, 0x9e, 0x6b, 0x54, 0x62, 0xa3, 0x13, + 0xe1, 0x1e, 0x8f, 0x5f, 0x71, 0x67, 0xc4, 0x85, 0xdf, 0xd5, 0x6b, 0xbd, + 0x86, 0x0f, 0x98, 0xec, 0xa5, 0x04, 0xf7, 0x7b, 0x2a, 0xbe, 0xcb, 0xac, + 0x29, 0xbe, 0xe1, 0x0f, 0xbc, 0x62, 0x87, 0x85, 0x7f, 0x05, 0xae, 0xe4, + 0x3f, 0x87, 0xfc, 0x1f, 0xf7, 0x45, 0x1e, 0xa3, 0xdb, 0xb1, 0xa0, 0x25, + 0xba, 0x82, 0xec, 0xca, 0x8d, 0xab, 0x7a, 0x20, 0x03, 0xeb, 0xe5, 0x5c, + 0x9f, 0xd0, 0x46, 0x78, 0xf1, 0x5a, 0xc7, 0x9e, 0xb4, 0x10, 0x6d, 0x37, + 0xc0, 0x75, 0x08, 0xfb, 0xeb, 0xcb, 0xd8, 0x35, 0x21, 0x9b, 0x89, 0xa0, + 0xaa, 0x87, 0x00, 0x66, 0x38, 0x37, 0x68, 0xa4, 0xa3, 0x93, 0x8e, 0x2b, + 0xca, 0xf7, 0x7a, 0x43, 0xb2, 0x15, 0x79, 0x81, 0xce, 0xa9, 0x09, 0xcb, + 0x29, 0xd4, 0xcc, 0xef, 0xf1, 0x9b, 0xbd, 0xe6, 0x63, 0xd5, 0x26, 0x0f, + 0xe8, 0x8b, 0xdf, 0xf1, 0xc3, 0xb4, 0x18, 0x0e, 0xf2, 0x1d, 0x5d, 0x82, + 0x9b, 0x1f, 0xf3, 0xca, 0x36, 0x2a, 0x26, 0x0a, 0x7f, 0xc4, 0x0d, 0xbd, + 0x5b, 0x15, 0x1c, 0x18, 0x6c, 0x11, 0x4e, 0xec, 0x36, 0x01, 0xc1, 0x15, + 0xab, 0xf7, 0x0b, 0x1a, 0xd3, 0xa1, 0xbd, 0x68, 0xc8, 0x59, 0xe7, 0x49, + 0x5c, 0xd5, 0x4b, 0x8c, 0x31, 0xdb, 0xb3, 0xea, 0x88, 0x09, 0x2f, 0xb9, + 0x8b, 0xfd, 0x96, 0x35, 0x88, 0x53, 0x72, 0x40, 0xcd, 0x89, 0x75, 0xb4, + 0x20, 0xf6, 0xf6, 0xe5, 0x74, 0x19, 0x48, 0xaf, 0x4b, 0xaa, 0x42, 0xa4, + 0xc8, 0x90, 0xee, 0xf3, 0x12, 0x04, 0x63, 0x90, 0x92, 0x8a, 0x89, 0xc3, + 0xa0, 0x7e, 0xfe, 0x19, 0xb3, 0x54, 0x53, 0x83, 0xe9, 0xc1, 0x6c, 0xe3, + 0x97, 0xa6, 0x27, 0xc3, 0x20, 0x9a, 0x79, 0x35, 0xc9, 0xb5, 0xc0, 0x90, 
+ 0xe1, 0x56, 0x84, 0x69, 0xc2, 0x54, 0x77, 0x52, 0x48, 0x55, 0x71, 0x3e, + 0xcd, 0xa7, 0xd6, 0x25, 0x5d, 0x49, 0x13, 0xd2, 0x59, 0xd7, 0xe1, 0xd1, + 0x70, 0x46, 0xa0, 0xd4, 0xee, 0x59, 0x13, 0x1f, 0x1a, 0xd3, 0x39, 0x7d, + 0xb0, 0x79, 0xf7, 0xc0, 0x73, 0x5e, 0xbb, 0x08, 0xf7, 0x5c, 0xb0, 0x31, + 0x41, 0x3d, 0x7b, 0x1e, 0xf0, 0xe6, 0x47, 0x5c, 0x37, 0xd5, 0x54, 0xf1, + 0xbb, 0x64, 0xd7, 0x41, 0x8b, 0x34, 0x55, 0xaa, 0xc3, 0x5a, 0x9c, 0xa0, + 0xcc, 0x29, 0x8e, 0x5a, 0x1a, 0x93, 0x5a, 0x49, 0xd3, 0xd0, 0xa0, 0x56, + 0xda, 0x32, 0xa2, 0xa9, 0xa7, 0x13, 0x42, 0x93, 0x9b, 0x20, 0x32, 0x37, + 0x5c, 0x3e, 0x03, 0xa5, 0x28, 0x10, 0x93, 0xdd, 0xa0, 0x04, 0x7b, 0x2a, + 0xbd, 0x31, 0xc3, 0x6a, 0x89, 0x58, 0x6e, 0x55, 0x0e, 0xc9, 0x5c, 0x70, + 0x07, 0x10, 0xf1, 0x9a, 0xbd, 0xfb, 0xd2, 0xb7, 0x94, 0x5b, 0x4f, 0x8d, + 0x90, 0xfa, 0xee, 0xae, 0x37, 0x48, 0xc5, 0xf8, 0x16, 0xa1, 0x3b, 0x70, + 0x03, 0x1f, 0x0e, 0xb8, 0xbd, 0x8d, 0x30, 0x4f, 0x95, 0x31, 0x0b, 0x9f, + 0xfc, 0x80, 0xf8, 0xef, 0xa3, 0x3c, 0xbc, 0xe2, 0x23, 0x23, 0x3e, 0x2a, + 0x55, 0x11, 0xe8, 0x2c, 0x17, 0xea, 0x1c, 0xbd, 0x1d, 0x2d, 0x1b, 0xd5, + 0x16, 0x9e, 0x05, 0xfc, 0x89, 0x64, 0x50, 0x4d, 0x9a, 0x22, 0x50, 0xc6, + 0x5a, 0xd9, 0x58, 0x99, 0x8f, 0xbd, 0xf2, 0x4f, 0x2c, 0xdb, 0x51, 0x6a, + 0x86, 0xe2, 0xc6, 0x64, 0x8f, 0x54, 0x1a, 0xf2, 0xcb, 0x34, 0x88, 0x08, + 0xbd, 0x2a, 0x8f, 0xec, 0x29, 0xf5, 0x22, 0x36, 0x83, 0x99, 0xb9, 0x71, + 0x8c, 0x99, 0x5c, 0xec, 0x91, 0x78, 0xc1, 0xe2, 0x2d, 0xe9, 0xd1, 0x4d, + 0xf5, 0x15, 0x93, 0x4d, 0x93, 0x92, 0x9f, 0x0f, 0x33, 0x5e, 0xcd, 0x58, + 0x5f, 0x3d, 0x52, 0xb9, 0x38, 0x6a, 0x85, 0x63, 0x8b, 0x63, 0x29, 0xcb, + 0x67, 0x12, 0x25, 0xc2, 0x44, 0xd7, 0xab, 0x1a, 0x24, 0xca, 0x3d, 0xca, + 0x77, 0xce, 0x28, 0x68, 0x1a, 0x91, 0xed, 0x7b, 0xc9, 0x70, 0x84, 0xab, + 0xe2, 0xd4, 0xf4, 0xac, 0x58, 0xf6, 0x70, 0x99, 0xfc, 0x99, 0x4d, 0xbd, + 0xb4, 0x1b, 0x4f, 0x15, 0x86, 0x95, 0x08, 0xd1, 0x4e, 0x73, 0xa9, 0xbc, + 0x6a, 0x8c, 0xbc, 0xb5, 0x4b, 0xe0, 0xee, 0x35, 0x24, 0xf9, 0x12, 0xf5, + 
0x88, 0x70, 0x50, 0x6c, 0xfe, 0x0d, 0x35, 0xbd, 0xf7, 0xc4, 0x2e, 0x39, + 0x16, 0x30, 0x6c, 0xf3, 0xb2, 0x19, 0x44, 0xaa, 0xcb, 0x4a, 0xf6, 0x75, + 0xb7, 0x09, 0xb9, 0xe1, 0x47, 0x71, 0x70, 0x5c, 0x05, 0x5f, 0x50, 0x50, + 0x9c, 0xd0, 0xe3, 0xc7, 0x91, 0xee, 0x6b, 0xc7, 0x0f, 0x71, 0x1b, 0xc3, + 0x48, 0x8b, 0xed, 0x15, 0x26, 0x8c, 0xc3, 0xd5, 0x54, 0x08, 0xcc, 0x33, + 0x79, 0xc0, 0x9f, 0x49, 0xc8, 0x75, 0xef, 0xb6, 0xf3, 0x29, 0x89, 0xfd, + 0x75, 0xd1, 0xda, 0x92, 0xc3, 0x13, 0xc6, 0x76, 0x51, 0x11, 0x40, 0x7b, + 0x82, 0xf7, 0x30, 0x79, 0x49, 0x04, 0xe3, 0xbb, 0x61, 0x34, 0xa6, 0x58, + 0x0b, 0x7d, 0xef, 0x3e, 0xf9, 0xb3, 0x8d, 0x2a, 0xba, 0xe9, 0xbc, 0xc0, + 0xa7, 0xe6, 0x6c, 0xda, 0xf8, 0x8c, 0xdf, 0x8d, 0x96, 0x83, 0x2d, 0x80, + 0x4f, 0x21, 0x81, 0xde, 0x57, 0x9d, 0x0a, 0x3c, 0xcc, 0xec, 0x3b, 0xb2, + 0x25, 0x96, 0x3c, 0xea, 0xfd, 0x46, 0x26, 0xbe, 0x1c, 0x79, 0x82, 0x1d, + 0xe0, 0x14, 0x22, 0x7c, 0x80, 0x3d, 0xbd, 0x05, 0x90, 0xfa, 0xaf, 0x7d, + 0x70, 0x13, 0x43, 0x0f, 0x3d, 0xa0, 0x7f, 0x92, 0x3a, 0x53, 0x69, 0xe4, + 0xb0, 0x10, 0x0d, 0xa7, 0x73, 0xa8, 0x8c, 0x74, 0xab, 0xd7, 0x78, 0x15, + 0x45, 0xec, 0x6e, 0xc8, 0x8b, 0xa0, 0xba, 0x21, 0x6f, 0xf3, 0x08, 0xb8, + 0xc7, 0x4f, 0x14, 0xf5, 0xcc, 0xfd, 0x39, 0xbc, 0x11, 0xf5, 0xb9, 0x11, + 0xba, 0xf3, 0x11, 0x24, 0x74, 0x3e, 0x0c, 0x07, 0x4f, 0xac, 0x2a, 0xb2, + 0xb1, 0x3c, 0x00, 0xfa, 0xbb, 0x8c, 0xd8, 0x7d, 0x17, 0x5b, 0x8d, 0x39, + 0xc6, 0x23, 0x31, 0x32, 0x7d, 0x6e, 0x20, 0x38, 0xd0, 0xc3, 0x58, 0xe2, + 0xb1, 0xfe, 0x53, 0x6b, 0xc7, 0x10, 0x13, 0x7e, 0xc6, 0x7c, 0x67, 0x59, + 0x43, 0x70, 0x4a, 0x2d, 0x7f, 0x76, 0xde, 0xbd, 0x45, 0x43, 0x56, 0x60, + 0xcd, 0xe9, 0x24, 0x7b, 0xb7, 0x41, 0xce, 0x56, 0xed, 0xd3, 0x74, 0x75, + 0xcc, 0x9d, 0x48, 0x61, 0xc8, 0x19, 0x66, 0x08, 0xfb, 0x28, 0x60, 0x1f, + 0x83, 0x11, 0xc0, 0x9b, 0xbd, 0x71, 0x53, 0x36, 0x01, 0x76, 0xa8, 0xc0, + 0xdc, 0x1d, 0x18, 0x85, 0x19, 0x65, 0xce, 0xcf, 0x14, 0x2e, 0x6c, 0x32, + 0x15, 0xbc, 0x2c, 0x5e, 0x8f, 0xfc, 0x3c, 0xf0, 0x2d, 0xf5, 0x5c, 0x04, + 
0xc9, 0x22, 0xf4, 0xc3, 0xb8, 0x57, 0x79, 0x52, 0x41, 0xfd, 0xff, 0xcd, + 0x26, 0xa8, 0xc0, 0xd2, 0xe1, 0x71, 0xd6, 0xf1, 0xf4, 0x0c, 0xa8, 0xeb, + 0x0c, 0x33, 0x40, 0x25, 0x73, 0xbb, 0x31, 0xda, 0x0c, 0xa6, 0xee, 0x0c, + 0x41, 0x51, 0x94, 0x3c, 0x24, 0x27, 0x65, 0xe9, 0xb5, 0xc4, 0xe2, 0x88, + 0xc0, 0x82, 0xd0, 0x72, 0xd9, 0x10, 0x4d, 0x7f, 0xc0, 0x88, 0x94, 0x41, + 0x2d, 0x05, 0x09, 0xfb, 0x97, 0x31, 0x6e, 0xc1, 0xe9, 0xf4, 0x50, 0x70, + 0xdc, 0x3f, 0x0a, 0x90, 0x46, 0x37, 0x60, 0x8c, 0xfb, 0x06, 0x6e, 0xde, + 0x6f, 0xa7, 0x6b, 0xa3, 0x88, 0x18, 0x96, 0x93, 0x19, 0x87, 0xe7, 0x0a, + 0x98, 0xf0, 0x13, 0x01, 0xab, 0x7c, 0xeb, 0x25, 0xa5, 0xe2, 0x98, 0x44, + 0x7d, 0x09, 0xe2, 0x42, 0x33, 0xd4, 0xeb, 0xcc, 0x9b, 0x70, 0xf6, 0x0f, + 0xf0, 0xb2, 0x99, 0xcc, 0x4f, 0x64, 0xc4, 0x69, 0x12, 0xea, 0x56, 0xfe, + 0x50, 0x0e, 0x02, 0x1f, 0x6d, 0x7a, 0x79, 0x62, 0xaa, 0x2e, 0x52, 0xaf, + 0xa3, 0xed, 0xcd, 0xa7, 0x45, 0xe6, 0x86, 0xed, 0xa1, 0x73, 0x5b, 0x1e, + 0x49, 0x4f, 0x92, 0x50, 0x83, 0x99, 0x3c, 0xf4, 0xf6, 0xa8, 0x49, 0xd7, + 0x08, 0xf7, 0xdc, 0x28, 0x2c, 0xe6, 0x22, 0x6f, 0xf8, 0xfa, 0xba, 0x9e, + 0x0a, 0xcf, 0x72, 0x74, 0x76, 0x75, 0x99, 0x4d, 0x3d, 0x9a, 0x4c, 0x54, + 0xcd, 0xf8, 0x54, 0xf0, 0xbd, 0x73, 0xe9, 0x4f, 0x29, 0xd0, 0xe1, 0x24, + 0x94, 0x52, 0xd6, 0x60, 0x80, 0x71, 0x24, 0x95, 0x92, 0x01, 0x0e, 0xa9, + 0x7e, 0x64, 0x2e, 0xed, 0x51, 0xcc, 0xd2, 0xff, 0xfd, 0x0b, + }; + EXPECT_EQ(Bytes(ciphertext), Bytes(kExpectedCiphertext)); + + static const uint8_t kExpectedSharedKey[HRSS_KEY_BYTES] = { + 0xbc, 0x98, 0x9c, 0x9c, 0x1f, 0x57, 0x6f, 0x38, 0x0b, 0x5d, 0xc2, + 0x23, 0x7d, 0x01, 0xae, 0x63, 0x17, 0xe8, 0xe4, 0xb2, 0x02, 0xa7, + 0xc4, 0x3a, 0x1b, 0x5a, 0xf3, 0xf8, 0xb5, 0xea, 0x6e, 0x22, + }; + EXPECT_EQ(Bytes(shared_key), Bytes(kExpectedSharedKey)); + + HRSS_decap(shared_key, &pub, &priv, ciphertext, sizeof(ciphertext)); + EXPECT_EQ(Bytes(shared_key, sizeof(shared_key)), + Bytes(kExpectedSharedKey, sizeof(kExpectedSharedKey))); + + // Corrupt the ciphertext and 
ensure that the failure key is constant. + ciphertext[50] ^= 4; + HRSS_decap(shared_key, &pub, &priv, ciphertext, sizeof(ciphertext)); + + static const uint8_t kExpectedFailureKey[HRSS_KEY_BYTES] = { + 0x8e, 0x19, 0xfe, 0x2b, 0x12, 0x67, 0xef, 0x9a, 0x63, 0x4d, 0x79, + 0x33, 0x8c, 0xce, 0xbf, 0x03, 0xdb, 0x9c, 0xc4, 0xc1, 0x70, 0xe1, + 0x32, 0xa6, 0xb3, 0xd3, 0xa1, 0x43, 0x3c, 0xf1, 0x1f, 0x5a, + }; + EXPECT_EQ(Bytes(shared_key), Bytes(kExpectedFailureKey)); +} diff --git a/src/crypto/hrss/internal.h b/src/crypto/hrss/internal.h new file mode 100644 index 00000000..70218b88 --- /dev/null +++ b/src/crypto/hrss/internal.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2018, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ + +#ifndef OPENSSL_HEADER_HRSS_INTERNAL_H +#define OPENSSL_HEADER_HRSS_INTERNAL_H + +#include <openssl/base.h> +#include "../internal.h" + +#if defined(__cplusplus) +extern "C" { +#endif + + +#define N 701 +#define BITS_PER_WORD (sizeof(crypto_word_t) * 8) +#define WORDS_PER_POLY ((N + BITS_PER_WORD - 1) / BITS_PER_WORD) +#define BITS_IN_LAST_WORD (N % BITS_PER_WORD) + +struct poly2 { + crypto_word_t v[WORDS_PER_POLY]; +}; + +struct poly3 { + struct poly2 s, a; +}; + +OPENSSL_EXPORT void HRSS_poly2_rotr_consttime(struct poly2 *p, size_t bits); +OPENSSL_EXPORT void HRSS_poly3_mul(struct poly3 *out, const struct poly3 *x, + const struct poly3 *y); +OPENSSL_EXPORT void HRSS_poly3_invert(struct poly3 *out, + const struct poly3 *in); + + +#if defined(__cplusplus) +} // extern "C" +#endif + +#endif // !OPENSSL_HEADER_HRSS_INTERNAL_H diff --git a/src/crypto/obj/obj_dat.h b/src/crypto/obj/obj_dat.h index 0f5a3fa0..0313a08a 100644 --- a/src/crypto/obj/obj_dat.h +++ b/src/crypto/obj/obj_dat.h @@ -57,7 +57,7 @@ /* This file is generated by crypto/obj/objects.go. 
*/ -#define NUM_NID 959 +#define NUM_NID 960 static const uint8_t kObjectData[] = { /* NID_rsadsi */ @@ -8755,6 +8755,7 @@ static const ASN1_OBJECT kObjects[NUM_NID] = { {"AuthPSK", "auth-psk", NID_auth_psk, 0, NULL, 0}, {"KxANY", "kx-any", NID_kx_any, 0, NULL, 0}, {"AuthANY", "auth-any", NID_auth_any, 0, NULL, 0}, + {"CECPQ2", "CECPQ2", NID_CECPQ2, 0, NULL, 0}, }; static const unsigned kNIDsInShortNameOrder[] = { @@ -8816,6 +8817,7 @@ static const unsigned kNIDsInShortNameOrder[] = { 110 /* CAST5-CFB */, 109 /* CAST5-ECB */, 111 /* CAST5-OFB */, + 959 /* CECPQ2 */, 894 /* CMAC */, 13 /* CN */, 141 /* CRLReason */, @@ -9720,6 +9722,7 @@ static const unsigned kNIDsInLongNameOrder[] = { 285 /* Biometric Info */, 179 /* CA Issuers */, 785 /* CA Repository */, + 959 /* CECPQ2 */, 131 /* Code Signing */, 783 /* Diffie-Hellman based MAC */, 382 /* Directory */, diff --git a/src/crypto/obj/obj_mac.num b/src/crypto/obj/obj_mac.num index 6dbc0f13..5fa839d2 100644 --- a/src/crypto/obj/obj_mac.num +++ b/src/crypto/obj/obj_mac.num @@ -947,3 +947,4 @@ auth_ecdsa 955 auth_psk 956 kx_any 957 auth_any 958 +CECPQ2 959 diff --git a/src/crypto/obj/objects.txt b/src/crypto/obj/objects.txt index 0c48e3c0..6dbb7ad7 100644 --- a/src/crypto/obj/objects.txt +++ b/src/crypto/obj/objects.txt @@ -559,7 +559,7 @@ id-cmc 19 : id-cmc-responseInfo id-cmc 21 : id-cmc-queryPending id-cmc 22 : id-cmc-popLinkRandom id-cmc 23 : id-cmc-popLinkWitness -id-cmc 24 : id-cmc-confirmCertAcceptance +id-cmc 24 : id-cmc-confirmCertAcceptance # other names id-on 1 : id-on-personalData @@ -1239,7 +1239,7 @@ cryptocom 1 8 1 : id-GostR3410-2001-ParamSet-cc : GOST R 3410-2001 Parameter Se # Definitions for Camellia cipher - ECB, CFB, OFB MODE !Alias ntt-ds 0 3 4401 5 -!Alias camellia ntt-ds 3 1 9 +!Alias camellia ntt-ds 3 1 9 camellia 1 : CAMELLIA-128-ECB : camellia-128-ecb !Cname camellia-128-ofb128 @@ -1310,7 +1310,7 @@ ISO-US 10046 2 1 : dhpublicnumber : X9.42 DH 1 3 36 3 3 2 8 1 1 11 : brainpoolP384r1 1 3 36 3 3 
2 8 1 1 12 : brainpoolP384t1 1 3 36 3 3 2 8 1 1 13 : brainpoolP512r1 -1 3 36 3 3 2 8 1 1 14 : brainpoolP512t1 +1 3 36 3 3 2 8 1 1 14 : brainpoolP512t1 # ECDH schemes from RFC5753 !Alias x9-63-scheme 1 3 133 16 840 63 0 @@ -1334,6 +1334,9 @@ secg-scheme 14 3 : dhSinglePass-cofactorDH-sha512kdf-scheme # NID for X25519 (no corresponding OID). : X25519 +# NID for CECPQ2 (no corresponding OID). + : CECPQ2 + # See RFC 8410. 1 3 101 112 : ED25519 diff --git a/src/crypto/thread_win.c b/src/crypto/thread_win.c index 45011650..c8e19f51 100644 --- a/src/crypto/thread_win.c +++ b/src/crypto/thread_win.c @@ -82,7 +82,7 @@ void CRYPTO_STATIC_MUTEX_unlock_write(struct CRYPTO_STATIC_MUTEX *lock) { ReleaseSRWLockExclusive(&lock->lock); } -static CRITICAL_SECTION g_destructors_lock; +static SRWLOCK g_destructors_lock = SRWLOCK_INIT; static thread_local_destructor_t g_destructors[NUM_OPENSSL_THREAD_LOCALS]; static CRYPTO_once_t g_thread_local_init_once = CRYPTO_ONCE_INIT; @@ -90,10 +90,6 @@ static DWORD g_thread_local_key; static int g_thread_local_failed; static void thread_local_init(void) { - if (!InitializeCriticalSectionAndSpinCount(&g_destructors_lock, 0x400)) { - g_thread_local_failed = 1; - return; - } g_thread_local_key = TlsAlloc(); g_thread_local_failed = (g_thread_local_key == TLS_OUT_OF_INDEXES); } @@ -121,12 +117,11 @@ static void NTAPI thread_local_destructor(PVOID module, DWORD reason, thread_local_destructor_t destructors[NUM_OPENSSL_THREAD_LOCALS]; - EnterCriticalSection(&g_destructors_lock); + AcquireSRWLockExclusive(&g_destructors_lock); OPENSSL_memcpy(destructors, g_destructors, sizeof(destructors)); - LeaveCriticalSection(&g_destructors_lock); + ReleaseSRWLockExclusive(&g_destructors_lock); - unsigned i; - for (i = 0; i < NUM_OPENSSL_THREAD_LOCALS; i++) { + for (unsigned i = 0; i < NUM_OPENSSL_THREAD_LOCALS; i++) { if (destructors[i] != NULL) { destructors[i](pointers[i]); } @@ -250,9 +245,9 @@ int CRYPTO_set_thread_local(thread_local_data_t index, void *value, 
} } - EnterCriticalSection(&g_destructors_lock); + AcquireSRWLockExclusive(&g_destructors_lock); g_destructors[index] = destructor; - LeaveCriticalSection(&g_destructors_lock); + ReleaseSRWLockExclusive(&g_destructors_lock); pointers[index] = value; return 1; diff --git a/src/crypto/x509/x509_test.cc b/src/crypto/x509/x509_test.cc index c42a7c82..a53ed7a6 100644 --- a/src/crypto/x509/x509_test.cc +++ b/src/crypto/x509/x509_test.cc @@ -12,6 +12,7 @@ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#include <algorithm> #include <functional> #include <string> #include <vector> @@ -1684,3 +1685,62 @@ TEST(X509Test, ReadBIOEmpty) { EXPECT_EQ(ERR_LIB_ASN1, ERR_GET_LIB(err)); EXPECT_EQ(ASN1_R_HEADER_TOO_LONG, ERR_GET_REASON(err)); } + +TEST(X509Test, ReadBIOOneByte) { + bssl::UniquePtr<BIO> bio(BIO_new_mem_buf("\x30", 1)); + ASSERT_TRUE(bio); + + // CPython expects |ASN1_R_HEADER_TOO_LONG| on EOF, to terminate a series of + // certificates. This EOF appeared after some data, however, so we do not wish + // to signal EOF. + bssl::UniquePtr<X509> x509(d2i_X509_bio(bio.get(), nullptr)); + EXPECT_FALSE(x509); + uint32_t err = ERR_get_error(); + EXPECT_EQ(ERR_LIB_ASN1, ERR_GET_LIB(err)); + EXPECT_EQ(ASN1_R_NOT_ENOUGH_DATA, ERR_GET_REASON(err)); +} + +TEST(X509Test, PartialBIOReturn) { + // Create a filter BIO that only reads and writes one byte at a time. 
+ bssl::UniquePtr<BIO_METHOD> method(BIO_meth_new(0, nullptr)); + ASSERT_TRUE(method); + ASSERT_TRUE(BIO_meth_set_create(method.get(), [](BIO *b) -> int { + BIO_set_init(b, 1); + return 1; + })); + ASSERT_TRUE( + BIO_meth_set_read(method.get(), [](BIO *b, char *out, int len) -> int { + return BIO_read(BIO_next(b), out, std::min(len, 1)); + })); + ASSERT_TRUE(BIO_meth_set_write( + method.get(), [](BIO *b, const char *in, int len) -> int { + return BIO_write(BIO_next(b), in, std::min(len, 1)); + })); + + bssl::UniquePtr<BIO> bio(BIO_new(method.get())); + ASSERT_TRUE(bio); + BIO *mem_bio = BIO_new(BIO_s_mem()); + ASSERT_TRUE(mem_bio); + BIO_push(bio.get(), mem_bio); // BIO_push takes ownership. + + bssl::UniquePtr<X509> cert(CertFromPEM(kLeafPEM)); + ASSERT_TRUE(cert); + uint8_t *der = nullptr; + int der_len = i2d_X509(cert.get(), &der); + ASSERT_GT(der_len, 0); + bssl::UniquePtr<uint8_t> free_der(der); + + // Write the certificate into the BIO. Though we only write one byte at a + // time, the write should succeed. + ASSERT_EQ(1, i2d_X509_bio(bio.get(), cert.get())); + const uint8_t *der2; + size_t der2_len; + ASSERT_TRUE(BIO_mem_contents(mem_bio, &der2, &der2_len)); + EXPECT_EQ(Bytes(der, static_cast<size_t>(der_len)), Bytes(der2, der2_len)); + + // Read the certificate back out of the BIO. Though we only read one byte at a + // time, the read should succeed. 
+ bssl::UniquePtr<X509> cert2(d2i_X509_bio(bio.get(), nullptr)); + ASSERT_TRUE(cert2); + EXPECT_EQ(0, X509_cmp(cert.get(), cert2.get())); +} diff --git a/src/include/openssl/bio.h b/src/include/openssl/bio.h index 8e2db65f..da0dcdfe 100644 --- a/src/include/openssl/bio.h +++ b/src/include/openssl/bio.h @@ -904,6 +904,7 @@ BSSL_NAMESPACE_BEGIN BORINGSSL_MAKE_DELETER(BIO, BIO_free) BORINGSSL_MAKE_UP_REF(BIO, BIO_up_ref) +BORINGSSL_MAKE_DELETER(BIO_METHOD, BIO_meth_free) BSSL_NAMESPACE_END diff --git a/src/include/openssl/bn.h b/src/include/openssl/bn.h index c895cc14..c198f4df 100644 --- a/src/include/openssl/bn.h +++ b/src/include/openssl/bn.h @@ -160,7 +160,7 @@ extern "C" { #define BN_DEC_FMT1 "%" PRIu32 #define BN_DEC_FMT2 "%09" PRIu32 #define BN_HEX_FMT1 "%" PRIx32 -#define BN_HEX_FMT2 "%08" PRIx64 +#define BN_HEX_FMT2 "%08" PRIx32 #else #error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT" #endif diff --git a/src/include/openssl/hrss.h b/src/include/openssl/hrss.h new file mode 100644 index 00000000..cc5edffb --- /dev/null +++ b/src/include/openssl/hrss.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2018, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ + +#ifndef OPENSSL_HEADER_HRSS_H +#define OPENSSL_HEADER_HRSS_H + +#include <openssl/base.h> + +#if defined(__cplusplus) +extern "C" { +#endif + +// HRSS +// +// HRSS is a structured-lattice-based post-quantum key encapsulation mechanism. +// The best exposition is https://eprint.iacr.org/2017/667.pdf although this +// implementation uses a different KEM construction based on +// https://eprint.iacr.org/2017/1005.pdf. + +struct HRSS_private_key { + uint8_t opaque[1808]; +}; + +struct HRSS_public_key { + uint8_t opaque[1424]; +}; + +// HRSS_SAMPLE_BYTES is the number of bytes of entropy needed to generate a +// short vector. There are 701 coefficients, but the final one is always set to +// zero when sampling. Otherwise, one byte of input is enough to generate two +// coefficients. +#define HRSS_SAMPLE_BYTES ((701 - 1) / 2) +// HRSS_GENERATE_KEY_BYTES is the number of bytes of entropy needed to generate +// an HRSS key pair. +#define HRSS_GENERATE_KEY_BYTES (HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES + 32) +// HRSS_ENCAP_BYTES is the number of bytes of entropy needed to encapsulate a +// session key. +#define HRSS_ENCAP_BYTES (HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES) +// HRSS_PUBLIC_KEY_BYTES is the number of bytes in a public key. +#define HRSS_PUBLIC_KEY_BYTES 1138 +// HRSS_CIPHERTEXT_BYTES is the number of bytes in a ciphertext. +#define HRSS_CIPHERTEXT_BYTES 1138 +// HRSS_KEY_BYTES is the number of bytes in a shared key. +#define HRSS_KEY_BYTES 32 +// HRSS_POLY3_BYTES is the number of bytes needed to serialise a mod 3 +// polynomial. +#define HRSS_POLY3_BYTES 140 +#define HRSS_PRIVATE_KEY_BYTES \ + (HRSS_POLY3_BYTES * 2 + HRSS_PUBLIC_KEY_BYTES + 2 + 32) + +// HRSS_generate_key is a deterministic function that outputs a public and +// private key based on the given entropy. 
+OPENSSL_EXPORT void HRSS_generate_key( + struct HRSS_public_key *out_pub, struct HRSS_private_key *out_priv, + const uint8_t input[HRSS_GENERATE_KEY_BYTES]); + +// HRSS_encap is a deterministic function that generates and encrypts a random +// session key from the given entropy, writing those values to |out_shared_key| +// and |out_ciphertext|, respectively. +OPENSSL_EXPORT void HRSS_encap(uint8_t out_ciphertext[HRSS_CIPHERTEXT_BYTES], + uint8_t out_shared_key[HRSS_KEY_BYTES], + const struct HRSS_public_key *in_pub, + const uint8_t in[HRSS_ENCAP_BYTES]); + +// HRSS_decap decrypts a session key from |ciphertext_len| bytes of +// |ciphertext|. If the ciphertext is valid, the decrypted key is written to +// |out_shared_key|. Otherwise the HMAC of |ciphertext| under a secret key (kept +// in |in_priv|) is written. If the ciphertext is the wrong length then which +// of the two was done will leak via side-channels. Otherwise it should perform +// either action in constant-time. +OPENSSL_EXPORT void HRSS_decap(uint8_t out_shared_key[HRSS_KEY_BYTES], + const struct HRSS_public_key *in_pub, + const struct HRSS_private_key *in_priv, + const uint8_t *ciphertext, + size_t ciphertext_len); + +// HRSS_marshal_public_key serialises |in_pub| to |out|. +OPENSSL_EXPORT void HRSS_marshal_public_key( + uint8_t out[HRSS_PUBLIC_KEY_BYTES], const struct HRSS_public_key *in_pub); + +// HRSS_parse_public_key sets |*out| to the public key encoded in |in|. It +// returns true on success and zero on error. 
+OPENSSL_EXPORT int HRSS_parse_public_key( + struct HRSS_public_key *out, const uint8_t in[HRSS_PUBLIC_KEY_BYTES]); + + +#if defined(__cplusplus) +} // extern C +#endif + +#endif // OPENSSL_HEADER_HRSS_H diff --git a/src/include/openssl/nid.h b/src/include/openssl/nid.h index afeb2dea..270d443a 100644 --- a/src/include/openssl/nid.h +++ b/src/include/openssl/nid.h @@ -4234,6 +4234,9 @@ extern "C" { #define LN_auth_any "auth-any" #define NID_auth_any 958 +#define SN_CECPQ2 "CECPQ2" +#define NID_CECPQ2 959 + #if defined(__cplusplus) } /* extern C */ diff --git a/src/include/openssl/ssl.h b/src/include/openssl/ssl.h index 17c55925..6898674a 100644 --- a/src/include/openssl/ssl.h +++ b/src/include/openssl/ssl.h @@ -2177,6 +2177,7 @@ OPENSSL_EXPORT int SSL_set1_curves_list(SSL *ssl, const char *curves); #define SSL_CURVE_SECP384R1 24 #define SSL_CURVE_SECP521R1 25 #define SSL_CURVE_X25519 29 +#define SSL_CURVE_CECPQ2 16696 // SSL_get_curve_id returns the ID of the curve used by |ssl|'s most recently // completed handshake or 0 if not applicable. @@ -4715,6 +4716,14 @@ OPENSSL_EXPORT bool SSL_apply_handoff(SSL *ssl, Span<const uint8_t> handoff); OPENSSL_EXPORT bool SSL_serialize_handback(const SSL *ssl, CBB *out); OPENSSL_EXPORT bool SSL_apply_handback(SSL *ssl, Span<const uint8_t> handback); +// SSL_get_traffic_secrets sets |*out_read_traffic_secret| and +// |*out_write_traffic_secret| to reference the TLS 1.3 traffic secrets for +// |ssl|. This function is only valid on TLS 1.3 connections that have +// completed the handshake. It returns true on success and false on error. 
+OPENSSL_EXPORT bool SSL_get_traffic_secrets( + const SSL *ssl, Span<const uint8_t> *out_read_traffic_secret, + Span<const uint8_t> *out_write_traffic_secret); + BSSL_NAMESPACE_END } // extern C++ diff --git a/src/ssl/handoff.cc b/src/ssl/handoff.cc index 4cca9818..f9dbd135 100644 --- a/src/ssl/handoff.cc +++ b/src/ssl/handoff.cc @@ -307,7 +307,7 @@ bool SSL_serialize_handback(const SSL *ssl, CBB *out) { return false; } if (type == handback_after_ecdhe && - !s3->hs->key_share->Serialize(&key_share)) { + !s3->hs->key_shares[0]->Serialize(&key_share)) { return false; } return CBB_flush(out); @@ -471,7 +471,7 @@ bool SSL_apply_handback(SSL *ssl, Span<const uint8_t> handback) { return false; } if (type == handback_after_ecdhe && - (s3->hs->key_share = SSLKeyShare::Create(&key_share)) == nullptr) { + (s3->hs->key_shares[0] = SSLKeyShare::Create(&key_share)) == nullptr) { return false; } diff --git a/src/ssl/handshake_client.cc b/src/ssl/handshake_client.cc index c1d54bd8..0274dc2a 100644 --- a/src/ssl/handshake_client.cc +++ b/src/ssl/handshake_client.cc @@ -590,7 +590,8 @@ static enum ssl_hs_wait_t do_read_server_hello(SSL_HANDSHAKE *hs) { } // Clear some TLS 1.3 state that no longer needs to be retained. - hs->key_share.reset(); + hs->key_shares[0].reset(); + hs->key_shares[1].reset(); hs->key_share_bytes.Reset(); // A TLS 1.2 server would not know to skip the early data we offered. Report @@ -1006,8 +1007,8 @@ static enum ssl_hs_wait_t do_read_server_key_exchange(SSL_HANDSHAKE *hs) { } // Initialize ECDH and save the peer public key for later. - hs->key_share = SSLKeyShare::Create(group_id); - if (!hs->key_share || + hs->key_shares[0] = SSLKeyShare::Create(group_id); + if (!hs->key_shares[0] || !hs->peer_key.CopyFrom(point)) { return ssl_hs_error; } @@ -1324,7 +1325,7 @@ static enum ssl_hs_wait_t do_send_client_key_exchange(SSL_HANDSHAKE *hs) { // Compute the premaster. 
uint8_t alert = SSL_AD_DECODE_ERROR; - if (!hs->key_share->Accept(&child, &pms, &alert, hs->peer_key)) { + if (!hs->key_shares[0]->Accept(&child, &pms, &alert, hs->peer_key)) { ssl_send_alert(ssl, SSL3_AL_FATAL, alert); return ssl_hs_error; } @@ -1333,7 +1334,8 @@ static enum ssl_hs_wait_t do_send_client_key_exchange(SSL_HANDSHAKE *hs) { } // The key exchange state may now be discarded. - hs->key_share.reset(); + hs->key_shares[0].reset(); + hs->key_shares[1].reset(); hs->peer_key.Reset(); } else if (alg_k & SSL_kPSK) { // For plain PSK, other_secret is a block of 0s with the same length as diff --git a/src/ssl/handshake_server.cc b/src/ssl/handshake_server.cc index c4f3b75e..8b3b9428 100644 --- a/src/ssl/handshake_server.cc +++ b/src/ssl/handshake_server.cc @@ -932,12 +932,12 @@ static enum ssl_hs_wait_t do_send_server_certificate(SSL_HANDSHAKE *hs) { hs->new_session->group_id = group_id; // Set up ECDH, generate a key, and emit the public half. - hs->key_share = SSLKeyShare::Create(group_id); - if (!hs->key_share || + hs->key_shares[0] = SSLKeyShare::Create(group_id); + if (!hs->key_shares[0] || !CBB_add_u8(cbb.get(), NAMED_CURVE_TYPE) || !CBB_add_u16(cbb.get(), group_id) || !CBB_add_u8_length_prefixed(cbb.get(), &child) || - !hs->key_share->Offer(&child)) { + !hs->key_shares[0]->Offer(&child)) { return ssl_hs_error; } } else { @@ -1275,13 +1275,14 @@ static enum ssl_hs_wait_t do_read_client_key_exchange(SSL_HANDSHAKE *hs) { // Compute the premaster. uint8_t alert = SSL_AD_DECODE_ERROR; - if (!hs->key_share->Finish(&premaster_secret, &alert, peer_key)) { + if (!hs->key_shares[0]->Finish(&premaster_secret, &alert, peer_key)) { ssl_send_alert(ssl, SSL3_AL_FATAL, alert); return ssl_hs_error; } // The key exchange state may now be discarded. 
- hs->key_share.reset(); + hs->key_shares[0].reset(); + hs->key_shares[1].reset(); } else if (!(alg_k & SSL_kPSK)) { OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR); ssl_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE); diff --git a/src/ssl/internal.h b/src/ssl/internal.h index f8a2ea70..bbce7ec4 100644 --- a/src/ssl/internal.h +++ b/src/ssl/internal.h @@ -974,10 +974,10 @@ class SSLKeyShare { // |out_public_key|. It returns true on success and false on error. virtual bool Offer(CBB *out_public_key) PURE_VIRTUAL; - // Accept performs a key exchange against the |peer_key| generated by |offer|. + // Accept performs a key exchange against the |peer_key| generated by |Offer|. // On success, it returns true, writes the public value to |out_public_key|, - // and sets |*out_secret| the shared secret. On failure, it returns false and - // sets |*out_alert| to an alert to send to the peer. + // and sets |*out_secret| to the shared secret. On failure, it returns false + // and sets |*out_alert| to an alert to send to the peer. // // The default implementation calls |Offer| and then |Finish|, assuming a key // exchange protocol where the peers are symmetric. @@ -986,7 +986,7 @@ class SSLKeyShare { // Finish performs a key exchange against the |peer_key| generated by // |Accept|. On success, it returns true and sets |*out_secret| to the shared - // secret. On failure, it returns zero and sets |*out_alert| to an alert to + // secret. On failure, it returns false and sets |*out_alert| to an alert to // send to the peer. virtual bool Finish(Array<uint8_t> *out_secret, uint8_t *out_alert, Span<const uint8_t> peer_key) PURE_VIRTUAL; @@ -1436,8 +1436,10 @@ struct SSL_HANDSHAKE { // error, if |wait| is |ssl_hs_error|, is the error the handshake failed on. UniquePtr<ERR_SAVE_STATE> error; - // key_share is the current key exchange instance. - UniquePtr<SSLKeyShare> key_share; + // key_shares are the current key exchange instances. 
The second is only used + // as a client if we believe that we should offer two key shares in a + // ClientHello. + UniquePtr<SSLKeyShare> key_shares[2]; // transcript is the current handshake transcript. SSLTranscript transcript; diff --git a/src/ssl/ssl_asn1.cc b/src/ssl/ssl_asn1.cc index 669f776d..3fd7fb6a 100644 --- a/src/ssl/ssl_asn1.cc +++ b/src/ssl/ssl_asn1.cc @@ -697,11 +697,6 @@ UniquePtr<SSL_SESSION> SSL_SESSION_parse(CBS *cbs, } } - if (!x509_method->session_cache_objects(ret.get())) { - OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_SSL_SESSION); - return nullptr; - } - CBS age_add; int age_add_present; if (!CBS_get_optional_asn1_octet_string(&session, &age_add, &age_add_present, @@ -737,6 +732,11 @@ UniquePtr<SSL_SESSION> SSL_SESSION_parse(CBS *cbs, return nullptr; } + if (!x509_method->session_cache_objects(ret.get())) { + OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_SSL_SESSION); + return nullptr; + } + return ret; } diff --git a/src/ssl/ssl_key_share.cc b/src/ssl/ssl_key_share.cc index 80b7d0a0..108ea6a9 100644 --- a/src/ssl/ssl_key_share.cc +++ b/src/ssl/ssl_key_share.cc @@ -24,8 +24,10 @@ #include <openssl/curve25519.h> #include <openssl/ec.h> #include <openssl/err.h> +#include <openssl/hrss.h> #include <openssl/mem.h> #include <openssl/nid.h> +#include <openssl/rand.h> #include "internal.h" #include "../crypto/internal.h" @@ -38,7 +40,6 @@ namespace { class ECKeyShare : public SSLKeyShare { public: ECKeyShare(int nid, uint16_t group_id) : nid_(nid), group_id_(group_id) {} - ~ECKeyShare() override {} uint16_t GroupID() const override { return group_id_; } @@ -159,9 +160,6 @@ class ECKeyShare : public SSLKeyShare { class X25519KeyShare : public SSLKeyShare { public: X25519KeyShare() {} - ~X25519KeyShare() override { - OPENSSL_cleanse(private_key_, sizeof(private_key_)); - } uint16_t GroupID() const override { return SSL_CURVE_X25519; } @@ -211,12 +209,104 @@ class X25519KeyShare : public SSLKeyShare { uint8_t private_key_[32]; }; +class CECPQ2KeyShare : public 
SSLKeyShare { + public: + CECPQ2KeyShare() {} + + uint16_t GroupID() const override { return SSL_CURVE_CECPQ2; } + + bool Offer(CBB *out) override { + uint8_t x25519_public_key[32]; + X25519_keypair(x25519_public_key, x25519_private_key_); + + uint8_t hrss_entropy[HRSS_GENERATE_KEY_BYTES]; + RAND_bytes(hrss_entropy, sizeof(hrss_entropy)); + HRSS_generate_key(&hrss_public_key_, &hrss_private_key_, hrss_entropy); + + uint8_t hrss_public_key_bytes[HRSS_PUBLIC_KEY_BYTES]; + HRSS_marshal_public_key(hrss_public_key_bytes, &hrss_public_key_); + + if (!CBB_add_bytes(out, x25519_public_key, sizeof(x25519_public_key)) || + !CBB_add_bytes(out, hrss_public_key_bytes, + sizeof(hrss_public_key_bytes))) { + return false; + } + + return true; + }; + + bool Accept(CBB *out_public_key, Array<uint8_t> *out_secret, + uint8_t *out_alert, Span<const uint8_t> peer_key) override { + Array<uint8_t> secret; + if (!secret.Init(32 + HRSS_KEY_BYTES)) { + OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE); + return false; + } + + uint8_t x25519_public_key[32]; + X25519_keypair(x25519_public_key, x25519_private_key_); + + HRSS_public_key peer_public_key; + if (peer_key.size() != 32 + HRSS_PUBLIC_KEY_BYTES || + !HRSS_parse_public_key(&peer_public_key, peer_key.data() + 32) || + !X25519(secret.data(), x25519_private_key_, peer_key.data())) { + *out_alert = SSL_AD_DECODE_ERROR; + OPENSSL_PUT_ERROR(SSL, SSL_R_BAD_ECPOINT); + return false; + } + + uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES]; + uint8_t entropy[HRSS_ENCAP_BYTES]; + RAND_bytes(entropy, sizeof(entropy)); + HRSS_encap(ciphertext, secret.data() + 32, &peer_public_key, entropy); + + if (!CBB_add_bytes(out_public_key, x25519_public_key, + sizeof(x25519_public_key)) || + !CBB_add_bytes(out_public_key, ciphertext, sizeof(ciphertext))) { + return false; + } + + *out_secret = std::move(secret); + return true; + } + + bool Finish(Array<uint8_t> *out_secret, uint8_t *out_alert, + Span<const uint8_t> peer_key) override { + *out_alert = 
SSL_AD_INTERNAL_ERROR; + + Array<uint8_t> secret; + if (!secret.Init(32 + HRSS_KEY_BYTES)) { + OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE); + return false; + } + + if (peer_key.size() != 32 + HRSS_CIPHERTEXT_BYTES || + !X25519(secret.data(), x25519_private_key_, peer_key.data())) { + *out_alert = SSL_AD_DECODE_ERROR; + OPENSSL_PUT_ERROR(SSL, SSL_R_BAD_ECPOINT); + return false; + } + + HRSS_decap(secret.data() + 32, &hrss_public_key_, &hrss_private_key_, + peer_key.data() + 32, peer_key.size() - 32); + + *out_secret = std::move(secret); + return true; + }; + + private: + uint8_t x25519_private_key_[32]; + HRSS_public_key hrss_public_key_; + HRSS_private_key hrss_private_key_; +}; + CONSTEXPR_ARRAY NamedGroup kNamedGroups[] = { {NID_secp224r1, SSL_CURVE_SECP224R1, "P-224", "secp224r1"}, {NID_X9_62_prime256v1, SSL_CURVE_SECP256R1, "P-256", "prime256v1"}, {NID_secp384r1, SSL_CURVE_SECP384R1, "P-384", "secp384r1"}, {NID_secp521r1, SSL_CURVE_SECP521R1, "P-521", "secp521r1"}, {NID_X25519, SSL_CURVE_X25519, "X25519", "x25519"}, + {NID_CECPQ2, SSL_CURVE_CECPQ2, "CECPQ2", "CECPQ2"}, }; } // namespace @@ -241,6 +331,8 @@ UniquePtr<SSLKeyShare> SSLKeyShare::Create(uint16_t group_id) { New<ECKeyShare>(NID_secp521r1, SSL_CURVE_SECP521R1)); case SSL_CURVE_X25519: return UniquePtr<SSLKeyShare>(New<X25519KeyShare>()); + case SSL_CURVE_CECPQ2: + return UniquePtr<SSLKeyShare>(New<CECPQ2KeyShare>()); default: return nullptr; } diff --git a/src/ssl/ssl_lib.cc b/src/ssl/ssl_lib.cc index b9c823d9..ceeba89c 100644 --- a/src/ssl/ssl_lib.cc +++ b/src/ssl/ssl_lib.cc @@ -506,6 +506,27 @@ void SSL_set_handoff_mode(SSL *ssl, bool on) { ssl->config->handoff = on; } +bool SSL_get_traffic_secrets(const SSL *ssl, + Span<const uint8_t> *out_read_traffic_secret, + Span<const uint8_t> *out_write_traffic_secret) { + if (SSL_version(ssl) < TLS1_3_VERSION) { + OPENSSL_PUT_ERROR(SSL, SSL_R_WRONG_SSL_VERSION); + return false; + } + + if (!ssl->s3->initial_handshake_complete) { + OPENSSL_PUT_ERROR(SSL, 
SSL_R_HANDSHAKE_NOT_COMPLETE); + return false; + } + + *out_read_traffic_secret = Span<const uint8_t>( + ssl->s3->read_traffic_secret, ssl->s3->read_traffic_secret_len); + *out_write_traffic_secret = Span<const uint8_t>( + ssl->s3->write_traffic_secret, ssl->s3->write_traffic_secret_len); + + return true; +} + BSSL_NAMESPACE_END using namespace bssl; diff --git a/src/ssl/ssl_test.cc b/src/ssl/ssl_test.cc index 470379c0..8d01c03a 100644 --- a/src/ssl/ssl_test.cc +++ b/src/ssl/ssl_test.cc @@ -395,6 +395,11 @@ static const CurveTest kCurveTests[] = { { SSL_CURVE_SECP256R1 }, }, { + "P-256:CECPQ2", + { SSL_CURVE_SECP256R1, SSL_CURVE_CECPQ2 }, + }, + + { "P-256:P-384:P-521:X25519", { SSL_CURVE_SECP256R1, @@ -4516,6 +4521,65 @@ TEST(SSLTest, GetCertificateThreads) { EXPECT_EQ(cert2, cert2_thread); EXPECT_EQ(0, X509_cmp(cert.get(), cert2)); } + +// Functions which access properties on the negotiated session are thread-safe +// where needed. Prior to TLS 1.3, clients resuming sessions and servers +// performing stateful resumption will share an underlying SSL_SESSION object, +// potentially across threads. +TEST_P(SSLVersionTest, SessionPropertiesThreads) { + if (version() == TLS1_3_VERSION) { + // Our TLS 1.3 implementation does not support stateful resumption. + ASSERT_FALSE(CreateClientSession(client_ctx_.get(), server_ctx_.get())); + return; + } + + SSL_CTX_set_options(server_ctx_.get(), SSL_OP_NO_TICKET); + SSL_CTX_set_session_cache_mode(client_ctx_.get(), SSL_SESS_CACHE_BOTH); + SSL_CTX_set_session_cache_mode(server_ctx_.get(), SSL_SESS_CACHE_BOTH); + + ASSERT_TRUE(UseCertAndKey(client_ctx_.get())); + ASSERT_TRUE(UseCertAndKey(server_ctx_.get())); + + // Configure mutual authentication, so we have more session state. 
+ SSL_CTX_set_custom_verify( + client_ctx_.get(), SSL_VERIFY_PEER, + [](SSL *ssl, uint8_t *out_alert) { return ssl_verify_ok; }); + SSL_CTX_set_custom_verify( + server_ctx_.get(), SSL_VERIFY_PEER, + [](SSL *ssl, uint8_t *out_alert) { return ssl_verify_ok; }); + + // Establish a client session to test with. + bssl::UniquePtr<SSL_SESSION> session = + CreateClientSession(client_ctx_.get(), server_ctx_.get()); + ASSERT_TRUE(session); + + // Resume with it twice. + UniquePtr<SSL> ssls[4]; + ClientConfig config; + config.session = session.get(); + ASSERT_TRUE(ConnectClientAndServer(&ssls[0], &ssls[1], client_ctx_.get(), + server_ctx_.get(), config)); + ASSERT_TRUE(ConnectClientAndServer(&ssls[2], &ssls[3], client_ctx_.get(), + server_ctx_.get(), config)); + + // Read properties in parallel. + auto read_properties = [](const SSL *ssl) { + EXPECT_TRUE(SSL_get_peer_cert_chain(ssl)); + bssl::UniquePtr<X509> peer(SSL_get_peer_certificate(ssl)); + EXPECT_TRUE(peer); + EXPECT_TRUE(SSL_get_current_cipher(ssl)); + EXPECT_TRUE(SSL_get_curve_id(ssl)); + }; + + std::vector<std::thread> threads; + for (const auto &ssl_ptr : ssls) { + const SSL *ssl = ssl_ptr.get(); + threads.emplace_back([=] { read_properties(ssl); }); + } + for (auto &thread : threads) { + thread.join(); + } +} #endif constexpr size_t kNumQUICLevels = 4; diff --git a/src/ssl/ssl_x509.cc b/src/ssl/ssl_x509.cc index ec203b22..eb3a38b7 100644 --- a/src/ssl/ssl_x509.cc +++ b/src/ssl/ssl_x509.cc @@ -281,16 +281,25 @@ static void ssl_crypto_x509_cert_dup(CERT *new_cert, const CERT *cert) { } static int ssl_crypto_x509_session_cache_objects(SSL_SESSION *sess) { - bssl::UniquePtr<STACK_OF(X509)> chain; + bssl::UniquePtr<STACK_OF(X509)> chain, chain_without_leaf; if (sk_CRYPTO_BUFFER_num(sess->certs.get()) > 0) { chain.reset(sk_X509_new_null()); if (!chain) { OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE); return 0; } + if (sess->is_server) { + // chain_without_leaf is only needed for server sessions. 
See + // |SSL_get_peer_cert_chain|. + chain_without_leaf.reset(sk_X509_new_null()); + if (!chain_without_leaf) { + OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE); + return 0; + } + } } - X509 *leaf = nullptr; + bssl::UniquePtr<X509> leaf; for (CRYPTO_BUFFER *cert : sess->certs.get()) { UniquePtr<X509> x509(X509_parse_from_buffer(cert)); if (!x509) { @@ -298,7 +307,11 @@ static int ssl_crypto_x509_session_cache_objects(SSL_SESSION *sess) { return 0; } if (leaf == nullptr) { - leaf = x509.get(); + leaf = UpRef(x509); + } else if (chain_without_leaf && + !PushToStack(chain_without_leaf.get(), UpRef(x509))) { + OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE); + return 0; } if (!PushToStack(chain.get(), std::move(x509))) { OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE); @@ -308,26 +321,28 @@ static int ssl_crypto_x509_session_cache_objects(SSL_SESSION *sess) { sk_X509_pop_free(sess->x509_chain, X509_free); sess->x509_chain = chain.release(); + sk_X509_pop_free(sess->x509_chain_without_leaf, X509_free); - sess->x509_chain_without_leaf = NULL; + sess->x509_chain_without_leaf = chain_without_leaf.release(); X509_free(sess->x509_peer); - if (leaf != NULL) { - X509_up_ref(leaf); - } - sess->x509_peer = leaf; + sess->x509_peer = leaf.release(); return 1; } static int ssl_crypto_x509_session_dup(SSL_SESSION *new_session, const SSL_SESSION *session) { - if (session->x509_peer != NULL) { - X509_up_ref(session->x509_peer); - new_session->x509_peer = session->x509_peer; - } - if (session->x509_chain != NULL) { + new_session->x509_peer = UpRef(session->x509_peer).release(); + if (session->x509_chain != nullptr) { new_session->x509_chain = X509_chain_up_ref(session->x509_chain); - if (new_session->x509_chain == NULL) { + if (new_session->x509_chain == nullptr) { + return 0; + } + } + if (session->x509_chain_without_leaf != nullptr) { + new_session->x509_chain_without_leaf = + X509_chain_up_ref(session->x509_chain_without_leaf); + if (new_session->x509_chain_without_leaf == nullptr) { 
return 0; } } @@ -525,38 +540,17 @@ X509 *SSL_get_peer_certificate(const SSL *ssl) { STACK_OF(X509) *SSL_get_peer_cert_chain(const SSL *ssl) { check_ssl_x509_method(ssl); - if (ssl == NULL) { - return NULL; + if (ssl == nullptr) { + return nullptr; } SSL_SESSION *session = SSL_get_session(ssl); - if (session == NULL || - session->x509_chain == NULL) { - return NULL; - } - - if (!ssl->server) { - return session->x509_chain; + if (session == nullptr) { + return nullptr; } // OpenSSL historically didn't include the leaf certificate in the returned // certificate chain, but only for servers. - if (session->x509_chain_without_leaf == NULL) { - session->x509_chain_without_leaf = sk_X509_new_null(); - if (session->x509_chain_without_leaf == NULL) { - return NULL; - } - - for (size_t i = 1; i < sk_X509_num(session->x509_chain); i++) { - X509 *cert = sk_X509_value(session->x509_chain, i); - if (!PushToStack(session->x509_chain_without_leaf, UpRef(cert))) { - sk_X509_pop_free(session->x509_chain_without_leaf, X509_free); - session->x509_chain_without_leaf = NULL; - return NULL; - } - } - } - - return session->x509_chain_without_leaf; + return ssl->server ? session->x509_chain_without_leaf : session->x509_chain; } STACK_OF(X509) *SSL_get_peer_full_cert_chain(const SSL *ssl) { diff --git a/src/ssl/t1_lib.cc b/src/ssl/t1_lib.cc index 678e4a3b..5e65f819 100644 --- a/src/ssl/t1_lib.cc +++ b/src/ssl/t1_lib.cc @@ -292,10 +292,23 @@ static const uint16_t kDefaultGroups[] = { SSL_CURVE_SECP384R1, }; +// TLS 1.3 servers will pick CECPQ2 if offered by a client, but it's not enabled +// by default for clients. +static const uint16_t kDefaultGroupsServer[] = { + // CECPQ2 is not yet enabled by default. 
+ // SSL_CURVE_CECPQ2, + SSL_CURVE_X25519, + SSL_CURVE_SECP256R1, + SSL_CURVE_SECP384R1, +};; + Span<const uint16_t> tls1_get_grouplist(const SSL_HANDSHAKE *hs) { if (!hs->config->supported_group_list.empty()) { return hs->config->supported_group_list; } + if (hs->ssl->server) { + return Span<const uint16_t>(kDefaultGroupsServer); + } return Span<const uint16_t>(kDefaultGroups); } @@ -324,7 +337,11 @@ bool tls1_get_shared_group(SSL_HANDSHAKE *hs, uint16_t *out_group_id) { for (uint16_t pref_group : pref) { for (uint16_t supp_group : supp) { - if (pref_group == supp_group) { + if (pref_group == supp_group && + // CECPQ2 doesn't fit in the u8-length-prefixed ECPoint field in TLS + // 1.2 and below. + (ssl_protocol_version(ssl) >= TLS1_3_VERSION || + pref_group != SSL_CURVE_CECPQ2)) { *out_group_id = pref_group; return true; } @@ -386,6 +403,12 @@ bool tls1_set_curves_list(Array<uint16_t> *out_group_ids, const char *curves) { } bool tls1_check_group_id(const SSL_HANDSHAKE *hs, uint16_t group_id) { + if (group_id == SSL_CURVE_CECPQ2 && + ssl_protocol_version(hs->ssl) < TLS1_3_VERSION) { + // CECPQ2 requires TLS 1.3. + return false; + } + for (uint16_t supported : tls1_get_grouplist(hs)) { if (supported == group_id) { return true; @@ -1038,7 +1061,6 @@ static bool ext_sigalgs_parse_clienthello(SSL_HANDSHAKE *hs, uint8_t *out_alert, CBS supported_signature_algorithms; if (!CBS_get_u16_length_prefixed(contents, &supported_signature_algorithms) || CBS_len(contents) != 0 || - CBS_len(&supported_signature_algorithms) == 0 || !tls1_parse_peer_sigalgs(hs, &supported_signature_algorithms)) { return false; } @@ -2145,6 +2167,7 @@ static bool ext_key_share_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) { } uint16_t group_id = hs->retry_group; + uint16_t second_group_id = 0; if (hs->received_hello_retry_request) { // We received a HelloRetryRequest without a new curve, so there is no new // share to append. Leave |hs->key_share| as-is. 
@@ -2175,19 +2198,38 @@ static bool ext_key_share_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) { } group_id = groups[0]; + + if (group_id == SSL_CURVE_CECPQ2 && groups.size() >= 2) { + // CECPQ2 is not sent as the only initial key share. We'll include the + // 2nd preference group too to avoid round-trips. + second_group_id = groups[1]; + assert(second_group_id != group_id); + } } - hs->key_share = SSLKeyShare::Create(group_id); CBB key_exchange; - if (!hs->key_share || + hs->key_shares[0] = SSLKeyShare::Create(group_id); + if (!hs->key_shares[0] || !CBB_add_u16(&kse_bytes, group_id) || !CBB_add_u16_length_prefixed(&kse_bytes, &key_exchange) || - !hs->key_share->Offer(&key_exchange) || + !hs->key_shares[0]->Offer(&key_exchange) || !CBB_flush(&kse_bytes)) { return false; } - // Save the contents of the extension to repeat it in the second ClientHello. + if (second_group_id != 0) { + hs->key_shares[1] = SSLKeyShare::Create(second_group_id); + if (!hs->key_shares[1] || + !CBB_add_u16(&kse_bytes, second_group_id) || + !CBB_add_u16_length_prefixed(&kse_bytes, &key_exchange) || + !hs->key_shares[1]->Offer(&key_exchange) || + !CBB_flush(&kse_bytes)) { + return false; + } + } + + // Save the contents of the extension to repeat it in the second + // ClientHello. 
if (!hs->received_hello_retry_request && !hs->key_share_bytes.CopyFrom( MakeConstSpan(CBB_data(&kse_bytes), CBB_len(&kse_bytes)))) { @@ -2210,19 +2252,24 @@ bool ssl_ext_key_share_parse_serverhello(SSL_HANDSHAKE *hs, return false; } - if (hs->key_share->GroupID() != group_id) { - *out_alert = SSL_AD_ILLEGAL_PARAMETER; - OPENSSL_PUT_ERROR(SSL, SSL_R_WRONG_CURVE); - return false; + SSLKeyShare *key_share = hs->key_shares[0].get(); + if (key_share->GroupID() != group_id) { + if (!hs->key_shares[1] || hs->key_shares[1]->GroupID() != group_id) { + *out_alert = SSL_AD_ILLEGAL_PARAMETER; + OPENSSL_PUT_ERROR(SSL, SSL_R_WRONG_CURVE); + return false; + } + key_share = hs->key_shares[1].get(); } - if (!hs->key_share->Finish(out_secret, out_alert, peer_key)) { + if (!key_share->Finish(out_secret, out_alert, peer_key)) { *out_alert = SSL_AD_INTERNAL_ERROR; return false; } hs->new_session->group_id = group_id; - hs->key_share.reset(); + hs->key_shares[0].reset(); + hs->key_shares[1].reset(); return true; } @@ -2390,6 +2437,10 @@ static bool ext_supported_groups_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) { } for (uint16_t group : tls1_get_grouplist(hs)) { + if (group == SSL_CURVE_CECPQ2 && + hs->max_version < TLS1_3_VERSION) { + continue; + } if (!CBB_add_u16(&groups_bytes, group)) { return false; } @@ -3556,7 +3607,10 @@ bool tls1_parse_peer_sigalgs(SSL_HANDSHAKE *hs, const CBS *in_sigalgs) { return true; } - return parse_u16_array(in_sigalgs, &hs->peer_sigalgs); + // In all contexts, the signature algorithms list may not be empty. (It may be + // omitted by clients in TLS 1.2, but then the entire extension is omitted.) 
+ return CBS_len(in_sigalgs) != 0 && + parse_u16_array(in_sigalgs, &hs->peer_sigalgs); } bool tls1_get_legacy_signature_algorithm(uint16_t *out, const EVP_PKEY *pkey) { diff --git a/src/ssl/test/bssl_shim.cc b/src/ssl/test/bssl_shim.cc index 675a08a0..77ed7968 100644 --- a/src/ssl/test/bssl_shim.cc +++ b/src/ssl/test/bssl_shim.cc @@ -649,7 +649,6 @@ static bool DoConnection(bssl::UniquePtr<SSL_SESSION> *out_session, SSL_set_connect_state(ssl.get()); } - int sock = Connect(config->port); if (sock == -1) { return false; @@ -837,6 +836,23 @@ static bool DoExchange(bssl::UniquePtr<SSL_SESSION> *out_session, } } + if (config->export_traffic_secrets) { + bssl::Span<const uint8_t> read_secret, write_secret; + if (!SSL_get_traffic_secrets(ssl, &read_secret, &write_secret)) { + fprintf(stderr, "failed to export traffic secrets\n"); + return false; + } + + assert(read_secret.size() <= 0xffff); + assert(write_secret.size() == read_secret.size()); + const uint16_t secret_len = read_secret.size(); + if (WriteAll(ssl, &secret_len, sizeof(secret_len)) < 0 || + WriteAll(ssl, read_secret.data(), read_secret.size()) < 0 || + WriteAll(ssl, write_secret.data(), write_secret.size()) < 0) { + return false; + } + } + if (config->tls_unique) { uint8_t tls_unique[16]; size_t tls_unique_len; diff --git a/src/ssl/test/runner/cipher_suites.go b/src/ssl/test/runner/cipher_suites.go index f4c59006..3246f0b7 100644 --- a/src/ssl/test/runner/cipher_suites.go +++ b/src/ssl/test/runner/cipher_suites.go @@ -26,7 +26,7 @@ type keyAgreement interface { // In the case that the key agreement protocol doesn't use a // ServerKeyExchange message, generateServerKeyExchange can return nil, // nil. 
- generateServerKeyExchange(*Config, *Certificate, *clientHelloMsg, *serverHelloMsg) (*serverKeyExchangeMsg, error) + generateServerKeyExchange(*Config, *Certificate, *clientHelloMsg, *serverHelloMsg, uint16) (*serverKeyExchangeMsg, error) processClientKeyExchange(*Config, *Certificate, *clientKeyExchangeMsg, uint16) ([]byte, error) // On the client side, the next two methods are called in order. diff --git a/src/ssl/test/runner/common.go b/src/ssl/test/runner/common.go index 73b8889e..d99518c9 100644 --- a/src/ssl/test/runner/common.go +++ b/src/ssl/test/runner/common.go @@ -163,6 +163,7 @@ const ( CurveP384 CurveID = 24 CurveP521 CurveID = 25 CurveX25519 CurveID = 29 + CurveCECPQ2 CurveID = 16696 ) // TLS Elliptic Curve Point Formats @@ -1645,6 +1646,18 @@ type ProtocolBugs struct { // ExpectJDK11DowngradeRandom is whether the client should expect the // server to send the JDK 11 downgrade signal. ExpectJDK11DowngradeRandom bool + + // FailIfHelloRetryRequested causes a handshake failure if a server requests a + // hello retry. + FailIfHelloRetryRequested bool + + // FailIfCECPQ2Offered will cause a server to reject a ClientHello if CECPQ2 + // is supported. + FailIfCECPQ2Offered bool + + // ExpectedKeyShares, if not nil, lists (in order) the curves that a ClientHello + // should have key shares for. 
+ ExpectedKeyShares []CurveID } func (c *Config) serverInit() { @@ -1724,7 +1737,7 @@ func (c *Config) maxVersion(isDTLS bool) uint16 { return ret } -var defaultCurvePreferences = []CurveID{CurveX25519, CurveP256, CurveP384, CurveP521} +var defaultCurvePreferences = []CurveID{CurveCECPQ2, CurveX25519, CurveP256, CurveP384, CurveP521} func (c *Config) curvePreferences() []CurveID { if c == nil || len(c.CurvePreferences) == 0 { diff --git a/src/ssl/test/runner/handshake_client.go b/src/ssl/test/runner/handshake_client.go index ab1f4dd2..5234462d 100644 --- a/src/ssl/test/runner/handshake_client.go +++ b/src/ssl/test/runner/handshake_client.go @@ -549,6 +549,9 @@ NextCipherSuite: helloRetryRequest, haveHelloRetryRequest := msg.(*helloRetryRequestMsg) var secondHelloBytes []byte if haveHelloRetryRequest { + if c.config.Bugs.FailIfHelloRetryRequested { + return errors.New("tls: unexpected HelloRetryRequest") + } // Explicitly read the ChangeCipherSpec now; it should // be attached to the first flight, not the second flight. 
if err := c.readTLS13ChangeCipherSpec(); err != nil { diff --git a/src/ssl/test/runner/handshake_messages.go b/src/ssl/test/runner/handshake_messages.go index e0867a51..823c6c8f 100644 --- a/src/ssl/test/runner/handshake_messages.go +++ b/src/ssl/test/runner/handshake_messages.go @@ -653,6 +653,23 @@ func parseSignatureAlgorithms(reader *byteReader, out *[]signatureAlgorithm, all return true } +func checkDuplicateExtensions(extensions byteReader) bool { + seen := make(map[uint16]struct{}) + for len(extensions) > 0 { + var extension uint16 + var body byteReader + if !extensions.readU16(&extension) || + !extensions.readU16LengthPrefixed(&body) { + return false + } + if _, ok := seen[extension]; ok { + return false + } + seen[extension] = struct{}{} + } + return true +} + func (m *clientHelloMsg) unmarshal(data []byte) bool { m.raw = data reader := byteReader(data[4:]) @@ -707,7 +724,7 @@ func (m *clientHelloMsg) unmarshal(data []byte) bool { } var extensions byteReader - if !reader.readU16LengthPrefixed(&extensions) || len(reader) != 0 { + if !reader.readU16LengthPrefixed(&extensions) || len(reader) != 0 || !checkDuplicateExtensions(extensions) { return false } for len(extensions) > 0 { @@ -923,6 +940,13 @@ func (m *clientHelloMsg) unmarshal(data []byte) bool { seen[algID] = struct{}{} m.compressedCertAlgs = append(m.compressedCertAlgs, algID) } + case extensionPadding: + // Padding bytes must be all zero. + for _, b := range body { + if b != 0 { + return false + } + } } if isGREASEValue(extension) { @@ -1067,7 +1091,7 @@ func (m *serverHelloMsg) unmarshal(data []byte) bool { } var extensions byteReader - if !reader.readU16LengthPrefixed(&extensions) || len(reader) != 0 { + if !reader.readU16LengthPrefixed(&extensions) || len(reader) != 0 || !checkDuplicateExtensions(extensions) { return false } @@ -1330,6 +1354,10 @@ func (m *serverExtensions) unmarshal(data byteReader, version uint16) bool { // Reset all fields. 
*m = serverExtensions{} + if !checkDuplicateExtensions(data) { + return false + } + for len(data) > 0 { var extension uint16 var body byteReader @@ -1651,7 +1679,7 @@ func (m *certificateMsg) unmarshal(data []byte) bool { } if m.hasRequestContext { var extensions byteReader - if !certs.readU16LengthPrefixed(&extensions) { + if !certs.readU16LengthPrefixed(&extensions) || !checkDuplicateExtensions(extensions) { return false } for len(extensions) > 0 { @@ -2010,7 +2038,8 @@ func (m *certificateRequestMsg) unmarshal(data []byte) bool { var extensions byteReader if !reader.readU8LengthPrefixedBytes(&m.requestContext) || !reader.readU16LengthPrefixed(&extensions) || - len(reader) != 0 { + len(reader) != 0 || + !checkDuplicateExtensions(extensions) { return false } for len(extensions) > 0 { diff --git a/src/ssl/test/runner/handshake_server.go b/src/ssl/test/runner/handshake_server.go index 6a752421..5486342a 100644 --- a/src/ssl/test/runner/handshake_server.go +++ b/src/ssl/test/runner/handshake_server.go @@ -208,6 +208,26 @@ func (hs *serverHandshakeState) readClientHello() error { } } + if config.Bugs.FailIfCECPQ2Offered { + for _, offeredCurve := range hs.clientHello.supportedCurves { + if offeredCurve == CurveCECPQ2 { + return errors.New("tls: CECPQ2 was offered") + } + } + } + + if expected := config.Bugs.ExpectedKeyShares; expected != nil { + if len(expected) != len(hs.clientHello.keyShares) { + return fmt.Errorf("tls: expected %d key shares, but found %d", len(expected), len(hs.clientHello.keyShares)) + } + + for i, group := range expected { + if found := hs.clientHello.keyShares[i].group; found != group { + return fmt.Errorf("tls: key share #%d is for group %d, not %d", i, found, group) + } + } + } + c.clientVersion = hs.clientHello.vers // Use the versions extension if supplied, otherwise use the legacy ClientHello version. 
@@ -1212,6 +1232,11 @@ func (hs *serverHandshakeState) processClientHello() (isResume bool, err error) preferredCurves := config.curvePreferences() Curves: for _, curve := range hs.clientHello.supportedCurves { + if curve == CurveCECPQ2 && c.vers < VersionTLS13 { + // CECPQ2 is TLS 1.3-only. + continue + } + for _, supported := range preferredCurves { if supported == curve { supportedCurve = true @@ -1621,7 +1646,7 @@ func (hs *serverHandshakeState) doFullHandshake() error { } keyAgreement := hs.suite.ka(c.vers) - skx, err := keyAgreement.generateServerKeyExchange(config, hs.cert, hs.clientHello, hs.hello) + skx, err := keyAgreement.generateServerKeyExchange(config, hs.cert, hs.clientHello, hs.hello, c.vers) if err != nil { c.sendAlert(alertHandshakeFailure) return err diff --git a/src/ssl/test/runner/hrss/hrss.go b/src/ssl/test/runner/hrss/hrss.go new file mode 100644 index 00000000..9f4fdd77 --- /dev/null +++ b/src/ssl/test/runner/hrss/hrss.go @@ -0,0 +1,1212 @@ +package hrss + +import ( + "crypto/hmac" + "crypto/sha256" + "crypto/subtle" + "encoding/binary" + "io" + "math/bits" +) + +const ( + PublicKeySize = modQBytes + CiphertextSize = modQBytes +) + +const ( + N = 701 + Q = 8192 + mod3Bytes = 140 + modQBytes = 1138 +) + +const ( + bitsPerWord = bits.UintSize + wordsPerPoly = (N + bitsPerWord - 1) / bitsPerWord + fullWordsPerPoly = N / bitsPerWord + bitsInLastWord = N % bitsPerWord +) + +// poly3 represents a degree-N polynomial over GF(3). Each coefficient is +// bitsliced across the |s| and |a| arrays, like this: +// +// s | a | value +// ----------------- +// 0 | 0 | 0 +// 0 | 1 | 1 +// 1 | 0 | 2 (aka -1) +// 1 | 1 | <invalid> +// +// ('s' is for sign, and 'a' is just a letter.) 
+// +// Once bitsliced as such, the following circuits can be used to implement +// addition and multiplication mod 3: +// +// (s3, a3) = (s1, a1) × (s2, a2) +// s3 = (s2 ∧ a1) ⊕ (s1 ∧ a2) +// a3 = (s1 ∧ s2) ⊕ (a1 ∧ a2) +// +// (s3, a3) = (s1, a1) + (s2, a2) +// t1 = ~(s1 ∨ a1) +// t2 = ~(s2 ∨ a2) +// s3 = (a1 ∧ a2) ⊕ (t1 ∧ s2) ⊕ (t2 ∧ s1) +// a3 = (s1 ∧ s2) ⊕ (t1 ∧ a2) ⊕ (t2 ∧ a1) +// +// Negating a value just involves swapping s and a. +type poly3 struct { + s [wordsPerPoly]uint + a [wordsPerPoly]uint +} + +func (p *poly3) trim() { + p.s[wordsPerPoly-1] &= (1 << bitsInLastWord) - 1 + p.a[wordsPerPoly-1] &= (1 << bitsInLastWord) - 1 +} + +func (p *poly3) zero() { + for i := range p.a { + p.s[i] = 0 + p.a[i] = 0 + } +} + +func (p *poly3) fromDiscrete(in *poly) { + var shift uint + s := p.s[:] + a := p.a[:] + s[0] = 0 + a[0] = 0 + + for _, v := range in { + s[0] >>= 1 + s[0] |= uint((v>>1)&1) << (bitsPerWord - 1) + a[0] >>= 1 + a[0] |= uint(v&1) << (bitsPerWord - 1) + shift++ + if shift == bitsPerWord { + s = s[1:] + a = a[1:] + s[0] = 0 + a[0] = 0 + shift = 0 + } + } + + a[0] >>= bitsPerWord - shift + s[0] >>= bitsPerWord - shift +} + +func (p *poly3) fromModQ(in *poly) int { + var shift uint + s := p.s[:] + a := p.a[:] + s[0] = 0 + a[0] = 0 + ok := 1 + + for _, v := range in { + vMod3, vOk := modQToMod3(v) + ok &= vOk + + s[0] >>= 1 + s[0] |= uint((vMod3>>1)&1) << (bitsPerWord - 1) + a[0] >>= 1 + a[0] |= uint(vMod3&1) << (bitsPerWord - 1) + shift++ + if shift == bitsPerWord { + s = s[1:] + a = a[1:] + s[0] = 0 + a[0] = 0 + shift = 0 + } + } + + a[0] >>= bitsPerWord - shift + s[0] >>= bitsPerWord - shift + + return ok +} + +func (p *poly3) fromDiscreteMod3(in *poly) { + var shift uint + s := p.s[:] + a := p.a[:] + s[0] = 0 + a[0] = 0 + + for _, v := range in { + // This duplicates the 13th bit upwards to the top of the + // uint16, essentially treating it as a sign bit and converting + // into a signed int16. 
The signed value is reduced mod 3, + // yielding {-2, -1, 0, 1, 2}. + v = uint16((int16(v<<3)>>3)%3) & 7 + + // We want to map v thus: + // {-2, -1, 0, 1, 2} -> {1, 2, 0, 1, 2}. We take the bottom + // three bits and then the constants below, when shifted by + // those three bits, perform the required mapping. + s[0] >>= 1 + s[0] |= (0xbc >> v) << (bitsPerWord - 1) + a[0] >>= 1 + a[0] |= (0x7a >> v) << (bitsPerWord - 1) + shift++ + if shift == bitsPerWord { + s = s[1:] + a = a[1:] + s[0] = 0 + a[0] = 0 + shift = 0 + } + } + + a[0] >>= bitsPerWord - shift + s[0] >>= bitsPerWord - shift +} + +func (p *poly3) marshal(out []byte) { + s := p.s[:] + a := p.a[:] + sw := s[0] + aw := a[0] + var shift int + + for i := 0; i < 700; i += 5 { + acc, scale := 0, 1 + for j := 0; j < 5; j++ { + v := int(aw&1) | int(sw&1)<<1 + acc += scale * v + scale *= 3 + + shift++ + if shift == bitsPerWord { + s = s[1:] + a = a[1:] + sw = s[0] + aw = a[0] + shift = 0 + } else { + sw >>= 1 + aw >>= 1 + } + } + + out[0] = byte(acc) + out = out[1:] + } +} + +func (p *poly) fromMod2(in *poly2) { + var shift uint + words := in[:] + word := words[0] + + for i := range p { + p[i] = uint16(word & 1) + word >>= 1 + shift++ + if shift == bitsPerWord { + words = words[1:] + word = words[0] + shift = 0 + } + } +} + +func (p *poly) fromMod3(in *poly3) { + var shift uint + s := in.s[:] + a := in.a[:] + sw := s[0] + aw := a[0] + + for i := range p { + p[i] = uint16(aw&1 | (sw&1)<<1) + aw >>= 1 + sw >>= 1 + shift++ + if shift == bitsPerWord { + a = a[1:] + s = s[1:] + aw = a[0] + sw = s[0] + shift = 0 + } + } +} + +func (p *poly) fromMod3ToModQ(in *poly3) { + var shift uint + s := in.s[:] + a := in.a[:] + sw := s[0] + aw := a[0] + + for i := range p { + p[i] = mod3ToModQ(uint16(aw&1 | (sw&1)<<1)) + aw >>= 1 + sw >>= 1 + shift++ + if shift == bitsPerWord { + a = a[1:] + s = s[1:] + aw = a[0] + sw = s[0] + shift = 0 + } + } +} + +func lsbToAll(v uint) uint { + return uint(int(v<<(bitsPerWord-1)) >> (bitsPerWord
- 1)) +} + +func (p *poly3) mulConst(ms, ma uint) { + ms = lsbToAll(ms) + ma = lsbToAll(ma) + + for i := range p.a { + p.s[i], p.a[i] = (ma&p.s[i])^(ms&p.a[i]), (ma&p.a[i])^(ms&p.s[i]) + } +} + +func cmovWords(out, in *[wordsPerPoly]uint, mov uint) { + for i := range out { + out[i] = (out[i] & ^mov) | (in[i] & mov) + } +} + +func rotWords(out, in *[wordsPerPoly]uint, bits uint) { + start := bits / bitsPerWord + n := (N - bits) / bitsPerWord + + for i := uint(0); i < n; i++ { + out[i] = in[start+i] + } + + carry := in[wordsPerPoly-1] + + for i := uint(0); i < start; i++ { + out[n+i] = carry | in[i]<<bitsInLastWord + carry = in[i] >> (bitsPerWord - bitsInLastWord) + } + + out[wordsPerPoly-1] = carry +} + +// rotBits right-rotates the bits in |in|. bits must be a non-zero power of two +// and no greater than bitsPerWord/2. +func rotBits(out, in *[wordsPerPoly]uint, bits uint) { + if (bits == 0 || (bits & (bits - 1)) != 0 || bits > bitsPerWord/2 || bitsInLastWord < bitsPerWord/2) { + panic("internal error"); + } + + carry := in[wordsPerPoly-1] << (bitsPerWord - bits) + + for i := wordsPerPoly - 2; i >= 0; i-- { + out[i] = carry | in[i]>>bits + carry = in[i] << (bitsPerWord - bits) + } + + out[wordsPerPoly-1] = carry>>(bitsPerWord-bitsInLastWord) | in[wordsPerPoly-1]>>bits +} + +func (p *poly3) rotWords(bits uint, in *poly3) { + rotWords(&p.s, &in.s, bits) + rotWords(&p.a, &in.a, bits) +} + +func (p *poly3) rotBits(bits uint, in *poly3) { + rotBits(&p.s, &in.s, bits) + rotBits(&p.a, &in.a, bits) +} + +func (p *poly3) cmov(in *poly3, mov uint) { + cmovWords(&p.s, &in.s, mov) + cmovWords(&p.a, &in.a, mov) +} + +func (p *poly3) rot(bits uint) { + if bits > N { + panic("invalid") + } + var shifted poly3 + + shift := uint(9) + for ; (1 << shift) >= bitsPerWord; shift-- { + shifted.rotWords(1<<shift, p) + p.cmov(&shifted, lsbToAll(bits>>shift)) + } + for ; shift < 9; shift-- { + shifted.rotBits(1<<shift, p) + p.cmov(&shifted, lsbToAll(bits>>shift)) + } +} + +func (p *poly3) fmadd(ms,
ma uint, in *poly3) { + ms = lsbToAll(ms) + ma = lsbToAll(ma) + + for i := range p.a { + products := (ma & in.s[i]) ^ (ms & in.a[i]) + producta := (ma & in.a[i]) ^ (ms & in.s[i]) + + ns1Ana1 := ^p.s[i] & ^p.a[i] + ns2Ana2 := ^products & ^producta + + p.s[i], p.a[i] = (p.a[i]&producta)^(ns1Ana1&products)^(p.s[i]&ns2Ana2), (p.s[i]&products)^(ns1Ana1&producta)^(p.a[i]&ns2Ana2) + } +} + +func (p *poly3) modPhiN() { + factora := uint(int(p.s[wordsPerPoly-1]<<(bitsPerWord-bitsInLastWord)) >> (bitsPerWord - 1)) + factors := uint(int(p.a[wordsPerPoly-1]<<(bitsPerWord-bitsInLastWord)) >> (bitsPerWord - 1)) + ns2Ana2 := ^factors & ^factora + + for i := range p.s { + ns1Ana1 := ^p.s[i] & ^p.a[i] + p.s[i], p.a[i] = (p.a[i]&factora)^(ns1Ana1&factors)^(p.s[i]&ns2Ana2), (p.s[i]&factors)^(ns1Ana1&factora)^(p.a[i]&ns2Ana2) + } +} + +func (p *poly3) cswap(other *poly3, swap uint) { + for i := range p.s { + sums := swap & (p.s[i] ^ other.s[i]) + p.s[i] ^= sums + other.s[i] ^= sums + + suma := swap & (p.a[i] ^ other.a[i]) + p.a[i] ^= suma + other.a[i] ^= suma + } +} + +func (p *poly3) mulx() { + carrys := (p.s[wordsPerPoly-1] >> (bitsInLastWord - 1)) & 1 + carrya := (p.a[wordsPerPoly-1] >> (bitsInLastWord - 1)) & 1 + + for i := range p.s { + outCarrys := p.s[i] >> (bitsPerWord - 1) + outCarrya := p.a[i] >> (bitsPerWord - 1) + p.s[i] <<= 1 + p.a[i] <<= 1 + p.s[i] |= carrys + p.a[i] |= carrya + carrys = outCarrys + carrya = outCarrya + } +} + +func (p *poly3) divx() { + var carrys, carrya uint + + for i := len(p.s) - 1; i >= 0; i-- { + outCarrys := p.s[i] & 1 + outCarrya := p.a[i] & 1 + p.s[i] >>= 1 + p.a[i] >>= 1 + p.s[i] |= carrys << (bitsPerWord - 1) + p.a[i] |= carrya << (bitsPerWord - 1) + carrys = outCarrys + carrya = outCarrya + } +} + +type poly2 [wordsPerPoly]uint + +func (p *poly2) fromDiscrete(in *poly) { + var shift uint + words := p[:] + words[0] = 0 + + for _, v := range in { + words[0] >>= 1 + words[0] |= uint(v&1) << (bitsPerWord - 1) + shift++ + if shift == bitsPerWord 
{ + words = words[1:] + words[0] = 0 + shift = 0 + } + } + + words[0] >>= bitsPerWord - shift +} + +func (p *poly2) setPhiN() { + for i := range p { + p[i] = ^uint(0) + } + p[wordsPerPoly-1] &= (1 << bitsInLastWord) - 1 +} + +func (p *poly2) cswap(other *poly2, swap uint) { + for i := range p { + sum := swap & (p[i] ^ other[i]) + p[i] ^= sum + other[i] ^= sum + } +} + +func (p *poly2) fmadd(m uint, in *poly2) { + m = ^(m - 1) + + for i := range p { + p[i] ^= in[i] & m + } +} + +func (p *poly2) lshift1() { + var carry uint + for i := range p { + nextCarry := p[i] >> (bitsPerWord - 1) + p[i] <<= 1 + p[i] |= carry + carry = nextCarry + } +} + +func (p *poly2) rshift1() { + var carry uint + for i := len(p) - 1; i >= 0; i-- { + nextCarry := p[i] & 1 + p[i] >>= 1 + p[i] |= carry << (bitsPerWord - 1) + carry = nextCarry + } +} + +func (p *poly2) rot(bits uint) { + if bits > N { + panic("invalid") + } + var shifted [wordsPerPoly]uint + out := (*[wordsPerPoly]uint)(p) + + shift := uint(9) + for ; (1 << shift) >= bitsPerWord; shift-- { + rotWords(&shifted, out, 1<<shift) + cmovWords(out, &shifted, lsbToAll(bits>>shift)) + } + for ; shift < 9; shift-- { + rotBits(&shifted, out, 1<<shift) + cmovWords(out, &shifted, lsbToAll(bits>>shift)) + } +} + +type poly [N]uint16 + +func (in *poly) marshal(out []byte) { + p := in[:] + + for len(p) >= 8 { + out[0] = byte(p[0]) + out[1] = byte(p[0]>>8) | byte((p[1]&0x07)<<5) + out[2] = byte(p[1] >> 3) + out[3] = byte(p[1]>>11) | byte((p[2]&0x3f)<<2) + out[4] = byte(p[2]>>6) | byte((p[3]&0x01)<<7) + out[5] = byte(p[3] >> 1) + out[6] = byte(p[3]>>9) | byte((p[4]&0x0f)<<4) + out[7] = byte(p[4] >> 4) + out[8] = byte(p[4]>>12) | byte((p[5]&0x7f)<<1) + out[9] = byte(p[5]>>7) | byte((p[6]&0x03)<<6) + out[10] = byte(p[6] >> 2) + out[11] = byte(p[6]>>10) | byte((p[7]&0x1f)<<3) + out[12] = byte(p[7] >> 5) + + p = p[8:] + out = out[13:] + } + + // There are four remaining values. 
+ out[0] = byte(p[0]) + out[1] = byte(p[0]>>8) | byte((p[1]&0x07)<<5) + out[2] = byte(p[1] >> 3) + out[3] = byte(p[1]>>11) | byte((p[2]&0x3f)<<2) + out[4] = byte(p[2]>>6) | byte((p[3]&0x01)<<7) + out[5] = byte(p[3] >> 1) + out[6] = byte(p[3] >> 9) +} + +func (out *poly) unmarshal(in []byte) bool { + p := out[:] + for i := 0; i < 87; i++ { + p[0] = uint16(in[0]) | uint16(in[1]&0x1f)<<8 + p[1] = uint16(in[1]>>5) | uint16(in[2])<<3 | uint16(in[3]&3)<<11 + p[2] = uint16(in[3]>>2) | uint16(in[4]&0x7f)<<6 + p[3] = uint16(in[4]>>7) | uint16(in[5])<<1 | uint16(in[6]&0xf)<<9 + p[4] = uint16(in[6]>>4) | uint16(in[7])<<4 | uint16(in[8]&1)<<12 + p[5] = uint16(in[8]>>1) | uint16(in[9]&0x3f)<<7 + p[6] = uint16(in[9]>>6) | uint16(in[10])<<2 | uint16(in[11]&7)<<10 + p[7] = uint16(in[11]>>3) | uint16(in[12])<<5 + + p = p[8:] + in = in[13:] + } + + // There are four coefficients left over + p[0] = uint16(in[0]) | uint16(in[1]&0x1f)<<8 + p[1] = uint16(in[1]>>5) | uint16(in[2])<<3 | uint16(in[3]&3)<<11 + p[2] = uint16(in[3]>>2) | uint16(in[4]&0x7f)<<6 + p[3] = uint16(in[4]>>7) | uint16(in[5])<<1 | uint16(in[6]&0xf)<<9 + + if in[6]&0xf0 != 0 { + return false + } + + out[N-1] = 0 + var top int + for _, v := range out { + top += int(v) + } + + out[N-1] = uint16(-top) % Q + return true +} + +func (in *poly) marshalS3(out []byte) { + p := in[:] + for len(p) >= 5 { + out[0] = byte(p[0] + p[1]*3 + p[2]*9 + p[3]*27 + p[4]*81) + out = out[1:] + p = p[5:] + } +} + +func (out *poly) unmarshalS3(in []byte) bool { + p := out[:] + for i := 0; i < 140; i++ { + c := in[0] + if c >= 243 { + return false + } + p[0] = uint16(c % 3) + p[1] = uint16((c / 3) % 3) + p[2] = uint16((c / 9) % 3) + p[3] = uint16((c / 27) % 3) + p[4] = uint16((c / 81) % 3) + + p = p[5:] + in = in[1:] + } + + out[N-1] = 0 + return true +} + +func (p *poly) modPhiN() { + for i := range p { + p[i] = (p[i] + Q - p[N-1]) % Q + } +} + +func (out *poly) shortSample(in []byte) { + // b a result + // 00 00 00 + // 00 01 01 + // 00 10 10 
+ // 00 11 11 + // 01 00 10 + // 01 01 00 + // 01 10 01 + // 01 11 11 + // 10 00 01 + // 10 01 10 + // 10 10 00 + // 10 11 11 + // 11 00 11 + // 11 01 11 + // 11 10 11 + // 11 11 11 + + // 1111 1111 1100 1001 1101 0010 1110 0100 + // f f c 9 d 2 e 4 + const lookup = uint32(0xffc9d2e4) + + p := out[:] + for i := 0; i < 87; i++ { + v := binary.LittleEndian.Uint32(in) + v2 := (v & 0x55555555) + ((v >> 1) & 0x55555555) + for j := 0; j < 8; j++ { + p[j] = uint16(lookup >> ((v2 & 15) << 1) & 3) + v2 >>= 4 + } + p = p[8:] + in = in[4:] + } + + // There are four values remaining. + v := binary.LittleEndian.Uint32(in) + v2 := (v & 0x55555555) + ((v >> 1) & 0x55555555) + for j := 0; j < 4; j++ { + p[j] = uint16(lookup >> ((v2 & 15) << 1) & 3) + v2 >>= 4 + } + + out[N-1] = 0 +} + +func (out *poly) shortSamplePlus(in []byte) { + out.shortSample(in) + + var sum uint16 + for i := 0; i < N-1; i++ { + sum += mod3ResultToModQ(out[i] * out[i+1]) + } + + scale := 1 + (1 & (sum >> 12)) + for i := 0; i < len(out); i += 2 { + out[i] = (out[i] * scale) % 3 + } +} + +func mul(out, scratch, a, b []uint16) { + const schoolbookLimit = 32 + if len(a) < schoolbookLimit { + for i := 0; i < len(a)*2; i++ { + out[i] = 0 + } + for i := range a { + for j := range b { + out[i+j] += a[i] * b[j] + } + } + return + } + + lowLen := len(a) / 2 + highLen := len(a) - lowLen + aLow, aHigh := a[:lowLen], a[lowLen:] + bLow, bHigh := b[:lowLen], b[lowLen:] + + for i := 0; i < lowLen; i++ { + out[i] = aHigh[i] + aLow[i] + } + if highLen != lowLen { + out[lowLen] = aHigh[lowLen] + } + + for i := 0; i < lowLen; i++ { + out[highLen+i] = bHigh[i] + bLow[i] + } + if highLen != lowLen { + out[highLen+lowLen] = bHigh[lowLen] + } + + mul(scratch, scratch[2*highLen:], out[:highLen], out[highLen:highLen*2]) + mul(out[lowLen*2:], scratch[2*highLen:], aHigh, bHigh) + mul(out, scratch[2*highLen:], aLow, bLow) + + for i := 0; i < lowLen*2; i++ { + scratch[i] -= out[i] + out[lowLen*2+i] + } + if lowLen != highLen { + 
scratch[lowLen*2] -= out[lowLen*4] + } + + for i := 0; i < 2*highLen; i++ { + out[lowLen+i] += scratch[i] + } +} + +func (out *poly) mul(a, b *poly) { + var prod, scratch [2 * N]uint16 + mul(prod[:], scratch[:], a[:], b[:]) + for i := range out { + out[i] = (prod[i] + prod[i+N]) % Q + } +} + +func (p3 *poly3) mulMod3(x, y *poly3) { + // (𝑥^n - 1) is a multiple of Φ(N) so we can work mod (𝑥^n - 1) here and + // reduce mod Φ(N) afterwards. + x3 := *x + y3 := *y + s := x3.s[:] + a := x3.a[:] + sw := s[0] + aw := a[0] + p3.zero() + var shift uint + for i := 0; i < N; i++ { + p3.fmadd(sw, aw, &y3) + sw >>= 1 + aw >>= 1 + shift++ + if shift == bitsPerWord { + s = s[1:] + a = a[1:] + sw = s[0] + aw = a[0] + shift = 0 + } + y3.mulx() + } + p3.modPhiN() +} + +// mod3ToModQ maps {0, 1, 2, 3} to {0, 1, Q-1, 0xffff} +// The case of n == 3 should never happen but is included so that modQToMod3 +// can easily catch invalid inputs. +func mod3ToModQ(n uint16) uint16 { + return uint16(uint64(0xffff1fff00010000) >> (16 * n)) +} + +// modQToMod3 maps {0, 1, Q-1} to {(0, 0), (0, 1), (1, 0)} and also returns an int +// which is one if the input is in range and zero otherwise. +func modQToMod3(n uint16) (uint16, int) { + result := (n&3 - (n>>1)&1) + return result, subtle.ConstantTimeEq(int32(mod3ToModQ(result)), int32(n)) +} + +// mod3ResultToModQ maps {0, 1, 2, 4} to {0, 1, Q-1, 1} +func mod3ResultToModQ(n uint16) uint16 { + return ((((uint16(0x13) >> n) & 1) - 1) & 0x1fff) | ((uint16(0x12) >> n) & 1) + //shift := (uint(0x324) >> (2 * n)) & 3 + //return uint16(uint64(0x00011fff00010000) >> (16 * shift)) +} + +// mulXMinus1 sets out to a×(𝑥 - 1) mod (𝑥^n - 1) +func (out *poly) mulXMinus1() { + // Multiplying by (𝑥 - 1) means negating each coefficient and adding in + // the value of the previous one.
+ origOut700 := out[700] + + for i := N - 1; i > 0; i-- { + out[i] = (Q - out[i] + out[i-1]) % Q + } + out[0] = (Q - out[0] + origOut700) % Q +} + +func (out *poly) lift(a *poly) { + // We wish to calculate a/(𝑥-1) mod Φ(N) over GF(3), where Φ(N) is the + // Nth cyclotomic polynomial, i.e. 1 + 𝑥 + … + 𝑥^700 (since N is prime). + + // 1/(𝑥-1) has a fairly basic structure that we can exploit to speed this up: + // + // R.<x> = PolynomialRing(GF(3)…) + // inv = R.cyclotomic_polynomial(1).inverse_mod(R.cyclotomic_polynomial(n)) + // list(inv)[:15] + // [1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2] + // + // This three-element pattern of coefficients repeats for the whole + // polynomial. + // + // Next define the overbar operator such that z̅ = z[0] + + // reverse(z[1:]). (Index zero of a polynomial here is the coefficient + // of the constant term. So index one is the coefficient of 𝑥 and so + // on.) + // + // A less odd way to define this is to see that z̅ negates the indexes, + // so z̅[0] = z[-0], z̅[1] = z[-1] and so on. + // + // The use of z̅ is that, when working mod (𝑥^701 - 1), vz[0] = <v, + // z̅>, vz[1] = <v, 𝑥z̅>, …. (Where <a, b> is the inner product: the sum + // of the point-wise products.) Although we calculated the inverse mod + // Φ(N), we can work mod (𝑥^N - 1) and reduce mod Φ(N) at the end. + // (That's because (𝑥^N - 1) is a multiple of Φ(N).) + // + // When working mod (𝑥^N - 1), multiplication by 𝑥 is a right-rotation + // of the list of coefficients. 
+ // + // Thus we can consider what the pattern of z̅, 𝑥z̅, 𝑥^2z̅, … looks like: + // + // def reverse(xs): + // suffix = list(xs[1:]) + // suffix.reverse() + // return [xs[0]] + suffix + // + // def rotate(xs): + // return [xs[-1]] + xs[:-1] + // + // zoverbar = reverse(list(inv) + [0]) + // xzoverbar = rotate(reverse(list(inv) + [0])) + // x2zoverbar = rotate(rotate(reverse(list(inv) + [0]))) + // + // zoverbar[:15] + // [1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1] + // xzoverbar[:15] + // [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0] + // x2zoverbar[:15] + // [2, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2] + // + // (For a formula for z̅, see lemma two of appendix B.) + // + // After the first three elements have been taken care of, all then have + // a repeating three-element cycle. The next value (𝑥^3z̅) involves + // three rotations of the first pattern, thus the three-element cycle + // lines up. However, the discontinuity in the first three elements + // obviously moves to a different position. Consider the difference + // between 𝑥^3z̅ and z̅: + // + // [x-y for (x,y) in zip(zoverbar, x3zoverbar)][:15] + // [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + // + // This pattern of differences is the same for all elements, although it + // obviously moves right with the rotations. + // + // From this, we reach algorithm eight of appendix B. + + // Handle the first three elements of the inner products. + out[0] = a[0] + a[2] + out[1] = a[1] + out[2] = 2*a[0] + a[2] + + // Use the repeating pattern to complete the first three inner products. + for i := 3; i < 699; i += 3 { + out[0] += 2*a[i] + a[i+2] + out[1] += a[i] + 2*a[i+1] + out[2] += a[i+1] + 2*a[i+2] + } + + // Handle the fact that the three-element pattern doesn't fill the + // polynomial exactly (since 701 isn't a multiple of three). 
+ out[2] += a[700] + out[0] += 2 * a[699] + out[1] += a[699] + 2*a[700] + + out[0] = out[0] % 3 + out[1] = out[1] % 3 + out[2] = out[2] % 3 + + // Calculate the remaining inner products by taking advantage of the + // fact that the pattern repeats every three cycles and the pattern of + // differences moves with the rotation. + for i := 3; i < N; i++ { + // Add twice something is the same as subtracting when working + // mod 3. Doing it this way avoids underflow. Underflow is bad + // because "% 3" doesn't work correctly for negative numbers + // here since underflow will wrap to 2^16-1 and 2^16 isn't a + // multiple of three. + out[i] = (out[i-3] + 2*(a[i-2]+a[i-1]+a[i])) % 3 + } + + // Reduce mod Φ(N) by subtracting a multiple of out[700] from every + // element and convert to mod Q. (See above about adding twice as + // subtraction.) + v := out[700] * 2 + for i := range out { + out[i] = mod3ToModQ((out[i] + v) % 3) + } + + out.mulXMinus1() +} + +func (a *poly) cswap(b *poly, swap uint16) { + for i := range a { + sum := swap & (a[i] ^ b[i]) + a[i] ^= sum + b[i] ^= sum + } +} + +func lt(a, b uint) uint { + if a < b { + return ^uint(0) + } + return 0 +} + +func bsMul(s1, a1, s2, a2 uint) (s3, a3 uint) { + s3 = (a1 & s2) ^ (s1 & a2) + a3 = (a1 & a2) ^ (s1 & s2) + return +} + +func (out *poly3) invertMod3(in *poly3) { + // This algorithm follows algorithm 10 in the paper. (Although note that + // the paper appears to have a bug: k should start at zero, not one.) + // The best explanation for why it works is in the "Why it works" + // section of + // https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf.
+ var k uint + degF, degG := uint(N-1), uint(N-1) + + var b, c, g poly3 + f := *in + + for i := range g.a { + g.a[i] = ^uint(0) + } + + b.a[0] = 1 + + var f0s, f0a uint + stillGoing := ^uint(0) + for i := 0; i < 2*(N-1)-1; i++ { + ss, sa := bsMul(f.s[0], f.a[0], g.s[0], g.a[0]) + ss, sa = sa&stillGoing&1, ss&stillGoing&1 + shouldSwap := ^uint(int((ss|sa)-1)>>(bitsPerWord-1)) & lt(degF, degG) + f.cswap(&g, shouldSwap) + b.cswap(&c, shouldSwap) + degF, degG = (degG&shouldSwap)|(degF & ^shouldSwap), (degF&shouldSwap)|(degG&^shouldSwap) + f.fmadd(ss, sa, &g) + b.fmadd(ss, sa, &c) + + f.divx() + f.s[wordsPerPoly-1] &= ((1 << bitsInLastWord) - 1) >> 1 + f.a[wordsPerPoly-1] &= ((1 << bitsInLastWord) - 1) >> 1 + c.mulx() + c.s[0] &= ^uint(1) + c.a[0] &= ^uint(1) + + degF-- + k += 1 & stillGoing + f0s = (stillGoing & f.s[0]) | (^stillGoing & f0s) + f0a = (stillGoing & f.a[0]) | (^stillGoing & f0a) + stillGoing = ^uint(int(degF-1) >> (bitsPerWord - 1)) + } + + k -= N & lt(N, k) + *out = b + out.rot(k) + out.mulConst(f0s, f0a) + out.modPhiN() +} + +func (out *poly) invertMod2(a *poly) { + // This algorithm follows a mix of algorithm 10 in the paper and the first + // page of the PDF linked below. (Although note that the paper appears + // to have a bug: k should start at zero, not one.) The best explanation + // for why it works is in the "Why it works" section of + // https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf.
+ var k uint + degF, degG := uint(N-1), uint(N-1) + + var f poly2 + f.fromDiscrete(a) + var b, c, g poly2 + g.setPhiN() + b[0] = 1 + + stillGoing := ^uint(0) + for i := 0; i < 2*(N-1)-1; i++ { + s := uint(f[0]&1) & stillGoing + shouldSwap := ^(s - 1) & lt(degF, degG) + f.cswap(&g, shouldSwap) + b.cswap(&c, shouldSwap) + degF, degG = (degG&shouldSwap)|(degF & ^shouldSwap), (degF&shouldSwap)|(degG&^shouldSwap) + f.fmadd(s, &g) + b.fmadd(s, &c) + + f.rshift1() + c.lshift1() + + degF-- + k += 1 & stillGoing + stillGoing = ^uint(int(degF-1) >> (bitsPerWord - 1)) + } + + k -= N & lt(N, k) + b.rot(k) + out.fromMod2(&b) +} + +func (out *poly) invert(origA *poly) { + // Inversion mod Q, which is done based on the result of inverting mod + // 2. See the NTRU paper, page three. + var a, tmp, tmp2, b poly + b.invertMod2(origA) + + // Negate a. + for i := range a { + a[i] = Q - origA[i] + } + + // We are working mod Q=2**13 and we need to iterate ceil(log_2(13)) + // times, which is four. + for i := 0; i < 4; i++ { + tmp.mul(&a, &b) + tmp[0] += 2 + tmp2.mul(&b, &tmp) + b = tmp2 + } + + *out = b +} + +type PublicKey struct { + h poly +} + +func ParsePublicKey(in []byte) (*PublicKey, bool) { + ret := new(PublicKey) + if !ret.h.unmarshal(in) { + return nil, false + } + return ret, true +} + +func (pub *PublicKey) Marshal() []byte { + ret := make([]byte, modQBytes) + pub.h.marshal(ret) + return ret +} + +func (pub *PublicKey) Encap(rand io.Reader) (ciphertext []byte, sharedKey []byte) { + var randBytes [352 + 352]byte + if _, err := io.ReadFull(rand, randBytes[:]); err != nil { + panic("rand failed") + } + + var m, r poly + m.shortSample(randBytes[:352]) + r.shortSample(randBytes[352:]) + + var mBytes, rBytes [mod3Bytes]byte + m.marshalS3(mBytes[:]) + r.marshalS3(rBytes[:]) + + ciphertext = pub.owf(&m, &r) + + h := sha256.New() + h.Write([]byte("shared key\x00")) + h.Write(mBytes[:]) + h.Write(rBytes[:]) + h.Write(ciphertext) + sharedKey = h.Sum(nil) + + return ciphertext, 
sharedKey +} + +func (pub *PublicKey) owf(m, r *poly) []byte { + for i := range r { + r[i] = mod3ToModQ(r[i]) + } + + var mq poly + mq.lift(m) + + var e poly + e.mul(r, &pub.h) + for i := range e { + e[i] = (e[i] + mq[i]) % Q + } + + ret := make([]byte, modQBytes) + e.marshal(ret[:]) + return ret +} + +type PrivateKey struct { + PublicKey + f, fp poly3 + hInv poly + hmacKey [32]byte +} + +func (priv *PrivateKey) Marshal() []byte { + var ret [2*mod3Bytes + modQBytes]byte + priv.f.marshal(ret[:]) + priv.fp.marshal(ret[mod3Bytes:]) + priv.h.marshal(ret[2*mod3Bytes:]) + return ret[:] +} + +func (priv *PrivateKey) Decap(ciphertext []byte) (sharedKey []byte, ok bool) { + if len(ciphertext) != modQBytes { + return nil, false + } + + var e poly + if !e.unmarshal(ciphertext) { + return nil, false + } + + var f poly + f.fromMod3ToModQ(&priv.f) + + var v1, m poly + v1.mul(&e, &f) + + var v13 poly3 + v13.fromDiscreteMod3(&v1) + // Note: v13 is not reduced mod phi(n). + + var m3 poly3 + m3.mulMod3(&v13, &priv.fp) + m3.modPhiN() + m.fromMod3(&m3) + + var mLift, delta poly + mLift.lift(&m) + for i := range delta { + delta[i] = (e[i] - mLift[i] + Q) % Q + } + delta.mul(&delta, &priv.hInv) + delta.modPhiN() + + var r poly3 + allOk := r.fromModQ(&delta) + + var mBytes, rBytes [mod3Bytes]byte + m.marshalS3(mBytes[:]) + r.marshal(rBytes[:]) + + var rPoly poly + rPoly.fromMod3(&r) + expectedCiphertext := priv.PublicKey.owf(&m, &rPoly) + + allOk &= subtle.ConstantTimeCompare(ciphertext, expectedCiphertext) + + hmacHash := hmac.New(sha256.New, priv.hmacKey[:]) + hmacHash.Write(ciphertext) + hmacDigest := hmacHash.Sum(nil) + + h := sha256.New() + h.Write([]byte("shared key\x00")) + h.Write(mBytes[:]) + h.Write(rBytes[:]) + h.Write(ciphertext) + sharedKey = h.Sum(nil) + + mask := uint8(allOk - 1) + for i := range sharedKey { + sharedKey[i] = (sharedKey[i] & ^mask) | (hmacDigest[i] & mask) + } + + return sharedKey, true +} + +func GenerateKey(rand io.Reader) PrivateKey { + var randBytes 
[352 + 352]byte + if _, err := io.ReadFull(rand, randBytes[:]); err != nil { + panic("rand failed") + } + + var f poly + f.shortSamplePlus(randBytes[:352]) + var priv PrivateKey + priv.f.fromDiscrete(&f) + priv.fp.invertMod3(&priv.f) + + var g poly + g.shortSamplePlus(randBytes[352:]) + + var pgPhi1 poly + for i := range g { + pgPhi1[i] = mod3ToModQ(g[i]) + } + for i := range pgPhi1 { + pgPhi1[i] = (pgPhi1[i] * 3) % Q + } + pgPhi1.mulXMinus1() + + var fModQ poly + fModQ.fromMod3ToModQ(&priv.f) + + var pfgPhi1 poly + pfgPhi1.mul(&fModQ, &pgPhi1) + + var i poly + i.invert(&pfgPhi1) + + priv.h.mul(&i, &pgPhi1) + priv.h.mul(&priv.h, &pgPhi1) + + priv.hInv.mul(&i, &fModQ) + priv.hInv.mul(&priv.hInv, &fModQ) + + return priv +} diff --git a/src/ssl/test/runner/key_agreement.go b/src/ssl/test/runner/key_agreement.go index 791325cd..f40552d9 100644 --- a/src/ssl/test/runner/key_agreement.go +++ b/src/ssl/test/runner/key_agreement.go @@ -17,6 +17,7 @@ import ( "boringssl.googlesource.com/boringssl/ssl/test/runner/curve25519" "boringssl.googlesource.com/boringssl/ssl/test/runner/ed25519" + "boringssl.googlesource.com/boringssl/ssl/test/runner/hrss" ) type keyType int @@ -37,7 +38,7 @@ type rsaKeyAgreement struct { exportKey *rsa.PrivateKey } -func (ka *rsaKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg) (*serverKeyExchangeMsg, error) { +func (ka *rsaKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg, version uint16) (*serverKeyExchangeMsg, error) { // Save the client version for comparison later. ka.clientVersion = clientHello.vers @@ -347,6 +348,90 @@ func (e *x25519ECDHCurve) finish(peerKey []byte) (preMasterSecret []byte, err er return out[:], nil } +// cecpq2Curve implements CECPQ2, which is HRSS+SXY combined with X25519. 
+type cecpq2Curve struct { + x25519PrivateKey [32]byte + hrssPrivateKey hrss.PrivateKey +} + +func (e *cecpq2Curve) offer(rand io.Reader) (publicKey []byte, err error) { + if _, err := io.ReadFull(rand, e.x25519PrivateKey[:]); err != nil { + return nil, err + } + + var x25519Public [32]byte + curve25519.ScalarBaseMult(&x25519Public, &e.x25519PrivateKey) + + e.hrssPrivateKey = hrss.GenerateKey(rand) + hrssPublic := e.hrssPrivateKey.PublicKey.Marshal() + + var ret []byte + ret = append(ret, x25519Public[:]...) + ret = append(ret, hrssPublic...) + return ret, nil +} + +func (e *cecpq2Curve) accept(rand io.Reader, peerKey []byte) (publicKey []byte, preMasterSecret []byte, err error) { + if len(peerKey) != 32+hrss.PublicKeySize { + return nil, nil, errors.New("tls: bad length CECPQ2 offer") + } + + if _, err := io.ReadFull(rand, e.x25519PrivateKey[:]); err != nil { + return nil, nil, err + } + + var x25519Shared, x25519PeerKey, x25519Public [32]byte + copy(x25519PeerKey[:], peerKey) + curve25519.ScalarBaseMult(&x25519Public, &e.x25519PrivateKey) + curve25519.ScalarMult(&x25519Shared, &e.x25519PrivateKey, &x25519PeerKey) + + // Per RFC 7748, reject the all-zero value in constant time. + var zeros [32]byte + if subtle.ConstantTimeCompare(zeros[:], x25519Shared[:]) == 1 { + return nil, nil, errors.New("tls: X25519 value with wrong order") + } + + hrssPublicKey, ok := hrss.ParsePublicKey(peerKey[32:]) + if !ok { + return nil, nil, errors.New("tls: bad CECPQ2 offer") + } + + hrssCiphertext, hrssShared := hrssPublicKey.Encap(rand) + + publicKey = append(publicKey, x25519Public[:]...) + publicKey = append(publicKey, hrssCiphertext...) + preMasterSecret = append(preMasterSecret, x25519Shared[:]...) + preMasterSecret = append(preMasterSecret, hrssShared...) 
+ + return publicKey, preMasterSecret, nil +} + +func (e *cecpq2Curve) finish(peerKey []byte) (preMasterSecret []byte, err error) { + if len(peerKey) != 32+hrss.CiphertextSize { + return nil, errors.New("tls: bad length CECPQ2 reply") + } + + var x25519Shared, x25519PeerKey [32]byte + copy(x25519PeerKey[:], peerKey) + curve25519.ScalarMult(&x25519Shared, &e.x25519PrivateKey, &x25519PeerKey) + + // Per RFC 7748, reject the all-zero value in constant time. + var zeros [32]byte + if subtle.ConstantTimeCompare(zeros[:], x25519Shared[:]) == 1 { + return nil, errors.New("tls: X25519 value with wrong order") + } + + hrssShared, ok := e.hrssPrivateKey.Decap(peerKey[32:]) + if !ok { + return nil, errors.New("tls: invalid HRSS ciphertext") + } + + preMasterSecret = append(preMasterSecret, x25519Shared[:]...) + preMasterSecret = append(preMasterSecret, hrssShared...) + + return preMasterSecret, nil +} + func curveForCurveID(id CurveID, config *Config) (ecdhCurve, bool) { switch id { case CurveP224: @@ -359,6 +444,8 @@ func curveForCurveID(id CurveID, config *Config) (ecdhCurve, bool) { return &ellipticECDHCurve{curve: elliptic.P521(), sendCompressed: config.Bugs.SendCompressedCoordinates}, true case CurveX25519: return &x25519ECDHCurve{setHighBit: config.Bugs.SetX25519HighBit}, true + case CurveCECPQ2: + return &cecpq2Curve{}, true default: return nil, false } @@ -501,12 +588,17 @@ type ecdheKeyAgreement struct { peerKey []byte } -func (ka *ecdheKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg) (*serverKeyExchangeMsg, error) { +func (ka *ecdheKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg, version uint16) (*serverKeyExchangeMsg, error) { var curveid CurveID preferredCurves := config.curvePreferences() NextCandidate: for _, candidate := range preferredCurves { + if candidate == CurveCECPQ2 && version < VersionTLS13 { + // 
CECPQ2 is TLS 1.3-only. + continue + } + for _, c := range clientHello.supportedCurves { if candidate == c { curveid = c @@ -614,7 +706,7 @@ func (ka *ecdheKeyAgreement) peerSignatureAlgorithm() signatureAlgorithm { // exchange. type nilKeyAgreement struct{} -func (ka *nilKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg) (*serverKeyExchangeMsg, error) { +func (ka *nilKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg, version uint16) (*serverKeyExchangeMsg, error) { return nil, nil } @@ -666,7 +758,7 @@ type pskKeyAgreement struct { identityHint string } -func (ka *pskKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg) (*serverKeyExchangeMsg, error) { +func (ka *pskKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg, version uint16) (*serverKeyExchangeMsg, error) { // Assemble the identity hint. bytes := make([]byte, 2+len(config.PreSharedKeyIdentity)) bytes[0] = byte(len(config.PreSharedKeyIdentity) >> 8) @@ -675,7 +767,7 @@ func (ka *pskKeyAgreement) generateServerKeyExchange(config *Config, cert *Certi // If there is one, append the base key agreement's // ServerKeyExchange. 
- baseSkx, err := ka.base.generateServerKeyExchange(config, cert, clientHello, hello) + baseSkx, err := ka.base.generateServerKeyExchange(config, cert, clientHello, hello, version) if err != nil { return nil, err } diff --git a/src/ssl/test/runner/runner.go b/src/ssl/test/runner/runner.go index fadc890f..b5cc0a79 100644 --- a/src/ssl/test/runner/runner.go +++ b/src/ssl/test/runner/runner.go @@ -22,6 +22,7 @@ import ( "crypto/x509" "crypto/x509/pkix" "encoding/base64" + "encoding/binary" "encoding/hex" "encoding/json" "encoding/pem" @@ -490,6 +491,9 @@ type testCase struct { // expectedQUICTransportParams contains the QUIC transport // parameters that are expected to be sent by the peer. expectedQUICTransportParams []byte + // exportTrafficSecrets, if true, configures the test to export the TLS 1.3 + // traffic secrets and confirms that they match. + exportTrafficSecrets bool } var testCases []testCase @@ -768,6 +772,32 @@ func doExchange(test *testCase, config *Config, conn net.Conn, isResume bool, tr } } + if test.exportTrafficSecrets { + secretLenBytes := make([]byte, 2) + if _, err := io.ReadFull(tlsConn, secretLenBytes); err != nil { + return err + } + secretLen := binary.LittleEndian.Uint16(secretLenBytes) + + theirReadSecret := make([]byte, secretLen) + theirWriteSecret := make([]byte, secretLen) + if _, err := io.ReadFull(tlsConn, theirReadSecret); err != nil { + return err + } + if _, err := io.ReadFull(tlsConn, theirWriteSecret); err != nil { + return err + } + + myReadSecret := tlsConn.in.trafficSecret + myWriteSecret := tlsConn.out.trafficSecret + if !bytes.Equal(myWriteSecret, theirReadSecret) { + return fmt.Errorf("read traffic-secret mismatch; got %x, wanted %x", theirReadSecret, myWriteSecret) + } + if !bytes.Equal(myReadSecret, theirWriteSecret) { + return fmt.Errorf("write traffic-secret mismatch; got %x, wanted %x", theirWriteSecret, myReadSecret) + } + } + if test.testTLSUnique { var peersValue [12]byte if _, err := io.ReadFull(tlsConn, 
peersValue[:]); err != nil { @@ -1123,6 +1153,10 @@ func runTest(test *testCase, shimPath string, mallocNumToFail int64) error { flags = append(flags, "-export-context", test.exportContext) } + if test.exportTrafficSecrets { + flags = append(flags, "-export-traffic-secrets") + } + if test.expectResumeRejected { flags = append(flags, "-expect-session-miss") } @@ -8862,7 +8896,7 @@ func addSignatureAlgorithmTests() { // Not all ciphers involve a signature. Advertise a list which gives all // versions a signing cipher. signingCiphers := []uint16{ - TLS_AES_128_GCM_SHA256, + TLS_AES_256_GCM_SHA384, TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256, TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256, TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA, @@ -9306,13 +9340,13 @@ func addSignatureAlgorithmTests() { expectedError: ":WRONG_SIGNATURE_TYPE:", }) - // Test that, if the list is missing, the peer falls back to SHA-1 in - // TLS 1.2, but not TLS 1.3. + // Test that, if the ClientHello list is missing, the server falls back + // to SHA-1 in TLS 1.2, but not TLS 1.3. 
testCases = append(testCases, testCase{ - name: "ClientAuth-SHA1-Fallback-RSA", + testType: serverTest, + name: "ServerAuth-SHA1-Fallback-RSA", config: Config{ MaxVersion: VersionTLS12, - ClientAuth: RequireAnyClientCert, VerifySignatureAlgorithms: []signatureAlgorithm{ signatureRSAPKCS1WithSHA1, }, @@ -9328,86 +9362,87 @@ func addSignatureAlgorithmTests() { testCases = append(testCases, testCase{ testType: serverTest, - name: "ServerAuth-SHA1-Fallback-RSA", + name: "ServerAuth-SHA1-Fallback-ECDSA", config: Config{ MaxVersion: VersionTLS12, VerifySignatureAlgorithms: []signatureAlgorithm{ - signatureRSAPKCS1WithSHA1, + signatureECDSAWithSHA1, }, Bugs: ProtocolBugs{ NoSignatureAlgorithms: true, }, }, flags: []string{ - "-cert-file", path.Join(*resourceDir, rsaCertificateFile), - "-key-file", path.Join(*resourceDir, rsaKeyFile), + "-cert-file", path.Join(*resourceDir, ecdsaP256CertificateFile), + "-key-file", path.Join(*resourceDir, ecdsaP256KeyFile), }, }) testCases = append(testCases, testCase{ - name: "ClientAuth-SHA1-Fallback-ECDSA", + testType: serverTest, + name: "ServerAuth-NoFallback-TLS13", config: Config{ - MaxVersion: VersionTLS12, - ClientAuth: RequireAnyClientCert, + MaxVersion: VersionTLS13, VerifySignatureAlgorithms: []signatureAlgorithm{ - signatureECDSAWithSHA1, + signatureRSAPKCS1WithSHA1, }, Bugs: ProtocolBugs{ NoSignatureAlgorithms: true, }, }, - flags: []string{ - "-cert-file", path.Join(*resourceDir, ecdsaP256CertificateFile), - "-key-file", path.Join(*resourceDir, ecdsaP256KeyFile), - }, + shouldFail: true, + expectedError: ":NO_COMMON_SIGNATURE_ALGORITHMS:", }) + // The CertificateRequest list, however, may never be omitted. It is a + // syntax error for it to be empty. 
testCases = append(testCases, testCase{ - testType: serverTest, - name: "ServerAuth-SHA1-Fallback-ECDSA", + name: "ClientAuth-NoFallback-RSA", config: Config{ MaxVersion: VersionTLS12, + ClientAuth: RequireAnyClientCert, VerifySignatureAlgorithms: []signatureAlgorithm{ - signatureECDSAWithSHA1, + signatureRSAPKCS1WithSHA1, }, Bugs: ProtocolBugs{ NoSignatureAlgorithms: true, }, }, flags: []string{ - "-cert-file", path.Join(*resourceDir, ecdsaP256CertificateFile), - "-key-file", path.Join(*resourceDir, ecdsaP256KeyFile), + "-cert-file", path.Join(*resourceDir, rsaCertificateFile), + "-key-file", path.Join(*resourceDir, rsaKeyFile), }, + shouldFail: true, + expectedError: ":DECODE_ERROR:", + expectedLocalError: "remote error: error decoding message", }) testCases = append(testCases, testCase{ - name: "ClientAuth-NoFallback-TLS13", + name: "ClientAuth-NoFallback-ECDSA", config: Config{ - MaxVersion: VersionTLS13, + MaxVersion: VersionTLS12, ClientAuth: RequireAnyClientCert, VerifySignatureAlgorithms: []signatureAlgorithm{ - signatureRSAPKCS1WithSHA1, + signatureECDSAWithSHA1, }, Bugs: ProtocolBugs{ NoSignatureAlgorithms: true, }, }, flags: []string{ - "-cert-file", path.Join(*resourceDir, rsaCertificateFile), - "-key-file", path.Join(*resourceDir, rsaKeyFile), + "-cert-file", path.Join(*resourceDir, ecdsaP256CertificateFile), + "-key-file", path.Join(*resourceDir, ecdsaP256KeyFile), }, - shouldFail: true, - // An empty CertificateRequest signature algorithm list is a - // syntax error in TLS 1.3. 
+ shouldFail: true, expectedError: ":DECODE_ERROR:", expectedLocalError: "remote error: error decoding message", }) testCases = append(testCases, testCase{ - testType: serverTest, - name: "ServerAuth-NoFallback-TLS13", + name: "ClientAuth-NoFallback-TLS13", config: Config{ MaxVersion: VersionTLS13, + ClientAuth: RequireAnyClientCert, VerifySignatureAlgorithms: []signatureAlgorithm{ signatureRSAPKCS1WithSHA1, }, @@ -9415,8 +9450,13 @@ func addSignatureAlgorithmTests() { NoSignatureAlgorithms: true, }, }, - shouldFail: true, - expectedError: ":NO_COMMON_SIGNATURE_ALGORITHMS:", + flags: []string{ + "-cert-file", path.Join(*resourceDir, rsaCertificateFile), + "-key-file", path.Join(*resourceDir, rsaKeyFile), + }, + shouldFail: true, + expectedError: ":DECODE_ERROR:", + expectedLocalError: "remote error: error decoding message", }) // Test that signature preferences are enforced. BoringSSL does not @@ -9613,7 +9653,7 @@ func addSignatureAlgorithmTests() { CipherSuites: []uint16{TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256}, Certificates: []Certificate{ecdsaP256Certificate}, }, - flags: []string{"-p384-only"}, + flags: []string{"-curves", strconv.Itoa(int(CurveP384))}, shouldFail: true, expectedError: ":BAD_ECC_CERT:", }) @@ -9625,7 +9665,7 @@ func addSignatureAlgorithmTests() { MaxVersion: VersionTLS13, Certificates: []Certificate{ecdsaP256Certificate}, }, - flags: []string{"-p384-only"}, + flags: []string{"-curves", strconv.Itoa(int(CurveP384))}, }) // In TLS 1.2, the ECDSA curve is not in the signature algorithm. @@ -10515,6 +10555,24 @@ func addExportKeyingMaterialTests() { }) } +func addExportTrafficSecretsTests() { + for _, cipherSuite := range []testCipherSuite{ + // Test a SHA-256 and SHA-384 based cipher suite. 
+ {"AEAD-AES128-GCM-SHA256", TLS_AES_128_GCM_SHA256}, + {"AEAD-AES256-GCM-SHA384", TLS_AES_256_GCM_SHA384}, + } { + + testCases = append(testCases, testCase{ + name: "ExportTrafficSecrets-" + cipherSuite.name, + config: Config{ + MinVersion: VersionTLS13, + CipherSuites: []uint16{cipherSuite.id}, + }, + exportTrafficSecrets: true, + }) + } +} + func addTLSUniqueTests() { for _, isClient := range []bool{false, true} { for _, isResumption := range []bool{false, true} { @@ -10705,6 +10763,7 @@ var testCurves = []struct { {"P-384", CurveP384}, {"P-521", CurveP521}, {"X25519", CurveX25519}, + {"CECPQ2", CurveCECPQ2}, } const bogusCurve = 0x1234 @@ -10712,6 +10771,10 @@ const bogusCurve = 0x1234 func addCurveTests() { for _, curve := range testCurves { for _, ver := range tlsVersions { + if curve.id == CurveCECPQ2 && ver.version < VersionTLS13 { + continue + } + suffix := curve.name + "-" + ver.name testCases = append(testCases, testCase{ @@ -10721,7 +10784,7 @@ func addCurveTests() { CipherSuites: []uint16{ TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA, - TLS_AES_128_GCM_SHA256, + TLS_AES_256_GCM_SHA384, }, CurvePreferences: []CurveID{curve.id}, }, @@ -10740,7 +10803,7 @@ func addCurveTests() { CipherSuites: []uint16{ TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA, - TLS_AES_128_GCM_SHA256, + TLS_AES_256_GCM_SHA384, }, CurvePreferences: []CurveID{curve.id}, }, @@ -10752,7 +10815,7 @@ func addCurveTests() { expectedCurveID: curve.id, }) - if curve.id != CurveX25519 { + if curve.id != CurveX25519 && curve.id != CurveCECPQ2 { testCases = append(testCases, testCase{ name: "CurveTest-Client-Compressed-" + suffix, config: Config{ @@ -10760,7 +10823,7 @@ func addCurveTests() { CipherSuites: []uint16{ TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA, - TLS_AES_128_GCM_SHA256, + TLS_AES_256_GCM_SHA384, }, CurvePreferences: []CurveID{curve.id}, Bugs: ProtocolBugs{ @@ -10780,7 +10843,7 @@ func 
addCurveTests() { CipherSuites: []uint16{ TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA, - TLS_AES_128_GCM_SHA256, + TLS_AES_256_GCM_SHA384, }, CurvePreferences: []CurveID{curve.id}, Bugs: ProtocolBugs{ @@ -10896,7 +10959,7 @@ func addCurveTests() { IgnorePeerCurvePreferences: true, }, }, - flags: []string{"-p384-only"}, + flags: []string{"-curves", strconv.Itoa(int(CurveP384))}, shouldFail: true, expectedError: ":WRONG_CURVE:", }) @@ -10912,7 +10975,7 @@ func addCurveTests() { SendCurve: CurveP256, }, }, - flags: []string{"-p384-only"}, + flags: []string{"-curves", strconv.Itoa(int(CurveP384))}, shouldFail: true, expectedError: ":WRONG_CURVE:", }) @@ -11163,6 +11226,112 @@ func addCurveTests() { }, }, }) + + // CECPQ2 should not be offered by a TLS < 1.3 client. + testCases = append(testCases, testCase{ + name: "CECPQ2NotInTLS12", + config: Config{ + Bugs: ProtocolBugs{ + FailIfCECPQ2Offered: true, + }, + }, + flags: []string{ + "-max-version", strconv.Itoa(VersionTLS12), + "-curves", strconv.Itoa(int(CurveCECPQ2)), + "-curves", strconv.Itoa(int(CurveX25519)), + }, + }) + + // CECPQ2 should not crash a TLS < 1.3 client if the server mistakenly + // selects it. + testCases = append(testCases, testCase{ + name: "CECPQ2NotAcceptedByTLS12Client", + config: Config{ + Bugs: ProtocolBugs{ + SendCurve: CurveCECPQ2, + }, + }, + flags: []string{ + "-max-version", strconv.Itoa(VersionTLS12), + "-curves", strconv.Itoa(int(CurveCECPQ2)), + "-curves", strconv.Itoa(int(CurveX25519)), + }, + shouldFail: true, + expectedError: ":WRONG_CURVE:", + }) + + // CECPQ2 should not be offered by default as a client. + testCases = append(testCases, testCase{ + name: "CECPQ2NotEnabledByDefaultInClients", + config: Config{ + MinVersion: VersionTLS13, + Bugs: ProtocolBugs{ + FailIfCECPQ2Offered: true, + }, + }, + }) + + // If CECPQ2 is offered, both X25519 and CECPQ2 should have a key-share. 
+ testCases = append(testCases, testCase{ + name: "NotJustCECPQ2KeyShare", + config: Config{ + MinVersion: VersionTLS13, + Bugs: ProtocolBugs{ + ExpectedKeyShares: []CurveID{CurveCECPQ2, CurveX25519}, + }, + }, + flags: []string{ + "-curves", strconv.Itoa(int(CurveCECPQ2)), + "-curves", strconv.Itoa(int(CurveX25519)), + "-expect-curve-id", strconv.Itoa(int(CurveCECPQ2)), + }, + }) + + // ... but only if CECPQ2 is listed first. + testCases = append(testCases, testCase{ + name: "CECPQ2KeyShareNotIncludedSecond", + config: Config{ + MinVersion: VersionTLS13, + Bugs: ProtocolBugs{ + ExpectedKeyShares: []CurveID{CurveX25519}, + }, + }, + flags: []string{ + "-curves", strconv.Itoa(int(CurveX25519)), + "-curves", strconv.Itoa(int(CurveCECPQ2)), + "-expect-curve-id", strconv.Itoa(int(CurveX25519)), + }, + }) + + // If CECPQ2 is the only configured curve, the key share is sent. + testCases = append(testCases, testCase{ + name: "JustConfiguringCECPQ2Works", + config: Config{ + MinVersion: VersionTLS13, + Bugs: ProtocolBugs{ + ExpectedKeyShares: []CurveID{CurveCECPQ2}, + }, + }, + flags: []string{ + "-curves", strconv.Itoa(int(CurveCECPQ2)), + "-expect-curve-id", strconv.Itoa(int(CurveCECPQ2)), + }, + }) + + // As a server, CECPQ2 is not yet supported by default. 
+ testCases = append(testCases, testCase{ + testType: serverTest, + name: "CECPQ2NotEnabledByDefaultForAServer", + config: Config{ + MinVersion: VersionTLS13, + CurvePreferences: []CurveID{CurveCECPQ2, CurveX25519}, + DefaultCurves: []CurveID{CurveCECPQ2}, + }, + flags: []string{ + "-server-preference", + "-expect-curve-id", strconv.Itoa(int(CurveX25519)), + }, + }) } func addTLS13RecordTests() { @@ -12700,7 +12869,7 @@ func addTLS13HandshakeTests() { }, }, tls13Variant: variant, - flags: []string{"-p384-only"}, + flags: []string{"-curves", strconv.Itoa(int(CurveP384))}, shouldFail: true, expectedError: ":WRONG_CURVE:", }) @@ -13859,6 +14028,60 @@ func addTLS13CipherPreferenceTests() { "-expect-cipher-no-aes", strconv.Itoa(int(TLS_CHACHA20_POLY1305_SHA256)), }, }) + + // Test that CECPQ2 cannot be used with TLS_AES_128_GCM_SHA256. + testCases = append(testCases, testCase{ + testType: serverTest, + name: "TLS13-CipherPreference-CECPQ2-AES128Only", + config: Config{ + MaxVersion: VersionTLS13, + CipherSuites: []uint16{ + TLS_AES_128_GCM_SHA256, + }, + }, + flags: []string{ + "-curves", strconv.Itoa(int(CurveCECPQ2)), + }, + shouldFail: true, + expectedError: ":NO_SHARED_CIPHER:", + expectedLocalError: "remote error: handshake failure", + }) + + // Test that CECPQ2 continues to honor AES vs ChaCha20 logic. 
+ testCases = append(testCases, testCase{ + testType: serverTest, + name: "TLS13-CipherPreference-CECPQ2-AES128-ChaCha20-AES256", + config: Config{ + MaxVersion: VersionTLS13, + CipherSuites: []uint16{ + TLS_AES_128_GCM_SHA256, + TLS_CHACHA20_POLY1305_SHA256, + TLS_AES_256_GCM_SHA384, + }, + }, + flags: []string{ + "-curves", strconv.Itoa(int(CurveCECPQ2)), + "-expect-cipher-aes", strconv.Itoa(int(TLS_CHACHA20_POLY1305_SHA256)), + "-expect-cipher-no-aes", strconv.Itoa(int(TLS_CHACHA20_POLY1305_SHA256)), + }, + }) + testCases = append(testCases, testCase{ + testType: serverTest, + name: "TLS13-CipherPreference-CECPQ2-AES128-AES256-ChaCha20", + config: Config{ + MaxVersion: VersionTLS13, + CipherSuites: []uint16{ + TLS_AES_128_GCM_SHA256, + TLS_AES_256_GCM_SHA384, + TLS_CHACHA20_POLY1305_SHA256, + }, + }, + flags: []string{ + "-curves", strconv.Itoa(int(CurveCECPQ2)), + "-expect-cipher-aes", strconv.Itoa(int(TLS_AES_256_GCM_SHA384)), + "-expect-cipher-no-aes", strconv.Itoa(int(TLS_CHACHA20_POLY1305_SHA256)), + }, + }) } func addPeekTests() { @@ -14680,7 +14903,7 @@ func addJDK11WorkaroundTests() { }, { // The above with a padding extension added at the end. 
- decodeHexOrPanic("010001b4030336a379aa355a22a064b4402760efae1c73977b0b4c975efc7654c35677723dde201fe3f8a2bca60418a68f72463ea19f3c241e7cbfceb347e451a62bd2417d8981005a13011302c02cc02bc030009dc02ec032009f00a3c02f009cc02dc031009e00a2c024c028003dc026c02a006b006ac00ac0140035c005c00f00390038c023c027003cc025c02900670040c009c013002fc004c00e0033003200ff01000111000000080006000003736e69000500050100000000000a0020001e0017001800190009000a000b000c000d000e001601000101010201030104000b00020100000d002800260403050306030804080508060809080a080b04010501060104020303030103020203020102020032002800260403050306030804080508060809080a080b04010501060104020303030103020203020102020011000900070200040000000000170000002b0009080304030303020301002d000201010033004700450017004104721f007464cb08a0f36e093ad178eb78d6968df20077b2dd882694a85dc4c9884caf5092db41f16cc3f8d41f59426992fa5e32cfb9ad08deee752cdd95b1a6b50015000770616464696e67"), + decodeHexOrPanic("010001b4030336a379aa355a22a064b4402760efae1c73977b0b4c975efc7654c35677723dde201fe3f8a2bca60418a68f72463ea19f3c241e7cbfceb347e451a62bd2417d8981005a13011302c02cc02bc030009dc02ec032009f00a3c02f009cc02dc031009e00a2c024c028003dc026c02a006b006ac00ac0140035c005c00f00390038c023c027003cc025c02900670040c009c013002fc004c00e0033003200ff01000111000000080006000003736e69000500050100000000000a0020001e0017001800190009000a000b000c000d000e001601000101010201030104000b00020100000d002800260403050306030804080508060809080a080b04010501060104020303030103020203020102020032002800260403050306030804080508060809080a080b04010501060104020303030103020203020102020011000900070200040000000000170000002b0009080304030303020301002d000201010033004700450017004104721f007464cb08a0f36e093ad178eb78d6968df20077b2dd882694a85dc4c9884caf5092db41f16cc3f8d41f59426992fa5e32cfb9ad08deee752cdd95b1a6b50015000700000000000000"), false, }, { @@ -14905,6 +15128,7 @@ func main() { addSignatureAlgorithmTests() addDTLSRetransmitTests() addExportKeyingMaterialTests() + addExportTrafficSecretsTests() addTLSUniqueTests() 
addCustomExtensionTests() addRSAClientKeyExchangeTests() diff --git a/src/ssl/test/test_config.cc b/src/ssl/test/test_config.cc index 7447d5ad..bed05010 100644 --- a/src/ssl/test/test_config.cc +++ b/src/ssl/test/test_config.cc @@ -104,7 +104,6 @@ const Flag<bool> kBoolFlags[] = { { "-renegotiate-ignore", &TestConfig::renegotiate_ignore }, { "-forbid-renegotiation-after-handshake", &TestConfig::forbid_renegotiation_after_handshake }, - { "-p384-only", &TestConfig::p384_only }, { "-enable-all-curves", &TestConfig::enable_all_curves }, { "-use-old-client-cert-callback", &TestConfig::use_old_client_cert_callback }, @@ -147,6 +146,8 @@ const Flag<bool> kBoolFlags[] = { { "-handshaker-resume", &TestConfig::handshaker_resume }, { "-reverify-on-resume", &TestConfig::reverify_on_resume }, { "-jdk11-workaround", &TestConfig::jdk11_workaround }, + { "-server-preference", &TestConfig::server_preference }, + { "-export-traffic-secrets", &TestConfig::export_traffic_secrets }, }; const Flag<std::string> kStringFlags[] = { @@ -220,10 +221,10 @@ const Flag<int> kIntFlags[] = { }; const Flag<std::vector<int>> kIntVectorFlags[] = { - { "-signing-prefs", &TestConfig::signing_prefs }, - { "-verify-prefs", &TestConfig::verify_prefs }, - { "-expect-peer-verify-pref", - &TestConfig::expected_peer_verify_prefs }, + {"-signing-prefs", &TestConfig::signing_prefs}, + {"-verify-prefs", &TestConfig::verify_prefs}, + {"-expect-peer-verify-pref", &TestConfig::expected_peer_verify_prefs}, + {"-curves", &TestConfig::curves}, }; bool ParseFlag(char *flag, int argc, char **argv, int *i, @@ -1294,7 +1295,6 @@ bssl::UniquePtr<SSL_CTX> TestConfig::SetupCtx(SSL_CTX *old_ctx) const { return nullptr; } - if (install_cert_compression_algs && (!SSL_CTX_add_cert_compression_alg( ssl_ctx.get(), 0xff02, @@ -1341,6 +1341,10 @@ bssl::UniquePtr<SSL_CTX> TestConfig::SetupCtx(SSL_CTX *old_ctx) const { abort(); } + if (server_preference) { + SSL_CTX_set_options(ssl_ctx.get(), SSL_OP_CIPHER_SERVER_PREFERENCE); + } + 
return ssl_ctx; } @@ -1589,16 +1593,43 @@ bssl::UniquePtr<SSL> TestConfig::NewSSL( if (!check_close_notify) { SSL_set_quiet_shutdown(ssl.get(), 1); } - if (p384_only) { - int nid = NID_secp384r1; - if (!SSL_set1_curves(ssl.get(), &nid, 1)) { - return nullptr; + if (!curves.empty()) { + std::vector<int> nids; + for (auto curve : curves) { + switch (curve) { + case SSL_CURVE_SECP224R1: + nids.push_back(NID_secp224r1); + break; + + case SSL_CURVE_SECP256R1: + nids.push_back(NID_X9_62_prime256v1); + break; + + case SSL_CURVE_SECP384R1: + nids.push_back(NID_secp384r1); + break; + + case SSL_CURVE_SECP521R1: + nids.push_back(NID_secp521r1); + break; + + case SSL_CURVE_X25519: + nids.push_back(NID_X25519); + break; + + case SSL_CURVE_CECPQ2: + nids.push_back(NID_CECPQ2); + break; + } + if (!SSL_set1_curves(ssl.get(), &nids[0], nids.size())) { + return nullptr; + } } } if (enable_all_curves) { static const int kAllCurves[] = { NID_secp224r1, NID_X9_62_prime256v1, NID_secp384r1, - NID_secp521r1, NID_X25519, + NID_secp521r1, NID_X25519, NID_CECPQ2, }; if (!SSL_set1_curves(ssl.get(), kAllCurves, OPENSSL_ARRAY_SIZE(kAllCurves))) { diff --git a/src/ssl/test/test_config.h b/src/ssl/test/test_config.h index bffe9118..0d0753e8 100644 --- a/src/ssl/test/test_config.h +++ b/src/ssl/test/test_config.h @@ -33,6 +33,7 @@ struct TestConfig { std::vector<int> signing_prefs; std::vector<int> verify_prefs; std::vector<int> expected_peer_verify_prefs; + std::vector<int> curves; std::string key_file; std::string cert_file; std::string expected_server_name; @@ -122,7 +123,6 @@ struct TestConfig { bool renegotiate_ignore = false; bool forbid_renegotiation_after_handshake = false; int expect_peer_signature_algorithm = 0; - bool p384_only = false; bool enable_all_curves = false; int expect_curve_id = 0; bool use_old_client_cert_callback = false; @@ -170,6 +170,8 @@ struct TestConfig { bool handshaker_resume = false; std::string handshaker_path; bool jdk11_workaround = false; + bool 
server_preference = false; + bool export_traffic_secrets = false; int argc; char **argv; diff --git a/src/ssl/tls13_client.cc b/src/ssl/tls13_client.cc index 0d3e8771..40913dcf 100644 --- a/src/ssl/tls13_client.cc +++ b/src/ssl/tls13_client.cc @@ -165,15 +165,17 @@ static enum ssl_hs_wait_t do_read_hello_retry_request(SSL_HANDSHAKE *hs) { return ssl_hs_error; } - // Check that the HelloRetryRequest does not request the key share that - // was provided in the initial ClientHello. - if (hs->key_share->GroupID() == group_id) { + // Check that the HelloRetryRequest does not request a key share that was + // provided in the initial ClientHello. + if (hs->key_shares[0]->GroupID() == group_id || + (hs->key_shares[1] && hs->key_shares[1]->GroupID() == group_id)) { ssl_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_ILLEGAL_PARAMETER); OPENSSL_PUT_ERROR(SSL, SSL_R_WRONG_CURVE); return ssl_hs_error; } - hs->key_share.reset(); + hs->key_shares[0].reset(); + hs->key_shares[1].reset(); hs->retry_group = group_id; } @@ -506,7 +508,6 @@ static enum ssl_hs_wait_t do_read_certificate_request(SSL_HANDSHAKE *hs) { !have_sigalgs || !CBS_get_u16_length_prefixed(&sigalgs, &supported_signature_algorithms) || - CBS_len(&supported_signature_algorithms) == 0 || !tls1_parse_peer_sigalgs(hs, &supported_signature_algorithms)) { ssl_send_alert(ssl, SSL3_AL_FATAL, alert); OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR); diff --git a/src/ssl/tls13_server.cc b/src/ssl/tls13_server.cc index b4c4ca5a..7073b575 100644 --- a/src/ssl/tls13_server.cc +++ b/src/ssl/tls13_server.cc @@ -96,33 +96,39 @@ static int ssl_ext_supported_versions_add_serverhello(SSL_HANDSHAKE *hs, } static const SSL_CIPHER *choose_tls13_cipher( - const SSL *ssl, const SSL_CLIENT_HELLO *client_hello) { + const SSL *ssl, const SSL_CLIENT_HELLO *client_hello, uint16_t group_id) { if (client_hello->cipher_suites_len % 2 != 0) { - return NULL; + return nullptr; } CBS cipher_suites; CBS_init(&cipher_suites, client_hello->cipher_suites, 
client_hello->cipher_suites_len); - const int aes_is_fine = EVP_has_aes_hardware(); + const bool aes_is_fine = EVP_has_aes_hardware(); + const bool require_256_bit = group_id == SSL_CURVE_CECPQ2; const uint16_t version = ssl_protocol_version(ssl); - const SSL_CIPHER *best = NULL; + const SSL_CIPHER *best = nullptr; while (CBS_len(&cipher_suites) > 0) { uint16_t cipher_suite; if (!CBS_get_u16(&cipher_suites, &cipher_suite)) { - return NULL; + return nullptr; } // Limit to TLS 1.3 ciphers we know about. const SSL_CIPHER *candidate = SSL_get_cipher_by_value(cipher_suite); - if (candidate == NULL || + if (candidate == nullptr || SSL_CIPHER_get_min_version(candidate) > version || SSL_CIPHER_get_max_version(candidate) < version) { continue; } + // Post-quantum key exchanges should be paired with 256-bit ciphers. + if (require_256_bit && candidate->algorithm_enc == SSL_AES128GCM) { + continue; + } + // TLS 1.3 removes legacy ciphers, so honor the client order, but prefer // ChaCha20 if we do not have AES hardware. if (aes_is_fine) { @@ -133,7 +139,7 @@ static const SSL_CIPHER *choose_tls13_cipher( return candidate; } - if (best == NULL) { + if (best == nullptr) { best = candidate; } } @@ -240,8 +246,15 @@ static enum ssl_hs_wait_t do_select_parameters(SSL_HANDSHAKE *hs) { client_hello.session_id_len); hs->session_id_len = client_hello.session_id_len; + uint16_t group_id; + if (!tls1_get_shared_group(hs, &group_id)) { + OPENSSL_PUT_ERROR(SSL, SSL_R_NO_SHARED_GROUP); + ssl_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE); + return ssl_hs_error; + } + // Negotiate the cipher suite. 
- hs->new_cipher = choose_tls13_cipher(ssl, &client_hello); + hs->new_cipher = choose_tls13_cipher(ssl, &client_hello, group_id); if (hs->new_cipher == NULL) { OPENSSL_PUT_ERROR(SSL, SSL_R_NO_SHARED_CIPHER); ssl_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE); diff --git a/src/tool/speed.cc b/src/tool/speed.cc index 2175baa2..975fb531 100644 --- a/src/tool/speed.cc +++ b/src/tool/speed.cc @@ -32,6 +32,7 @@ #include <openssl/ecdsa.h> #include <openssl/ec_key.h> #include <openssl/evp.h> +#include <openssl/hrss.h> #include <openssl/nid.h> #include <openssl/rand.h> #include <openssl/rsa.h> @@ -744,6 +745,61 @@ static bool SpeedScrypt(const std::string &selected) { return true; } +static bool SpeedHRSS(const std::string &selected) { + if (!selected.empty() && selected != "HRSS") { + return true; + } + + TimeResults results; + + if (!TimeFunction(&results, []() -> bool { + struct HRSS_public_key pub; + struct HRSS_private_key priv; + uint8_t entropy[HRSS_GENERATE_KEY_BYTES]; + RAND_bytes(entropy, sizeof(entropy)); + HRSS_generate_key(&pub, &priv, entropy); + return true; + })) { + fprintf(stderr, "Failed to time HRSS_generate_key.\n"); + return false; + } + + results.Print("HRSS generate"); + + struct HRSS_public_key pub; + struct HRSS_private_key priv; + uint8_t key_entropy[HRSS_GENERATE_KEY_BYTES]; + RAND_bytes(key_entropy, sizeof(key_entropy)); + HRSS_generate_key(&pub, &priv, key_entropy); + + uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES]; + if (!TimeFunction(&results, [&pub, &ciphertext]() -> bool { + uint8_t entropy[HRSS_ENCAP_BYTES]; + uint8_t shared_key[HRSS_KEY_BYTES]; + RAND_bytes(entropy, sizeof(entropy)); + HRSS_encap(ciphertext, shared_key, &pub, entropy); + return true; + })) { + fprintf(stderr, "Failed to time HRSS_encap.\n"); + return false; + } + + results.Print("HRSS encap"); + + if (!TimeFunction(&results, [&pub, &priv, &ciphertext]() -> bool { + uint8_t shared_key[HRSS_KEY_BYTES]; + HRSS_decap(shared_key, &pub, &priv, ciphertext, 
sizeof(ciphertext)); + return true; + })) { + fprintf(stderr, "Failed to time HRSS_encap.\n"); + return false; + } + + results.Print("HRSS decap"); + + return true; +} + static const struct argument kArguments[] = { { "-filter", kOptionalArgument, @@ -817,7 +873,8 @@ bool Speed(const std::vector<std::string> &args) { !Speed25519(selected) || !SpeedSPAKE2(selected) || !SpeedScrypt(selected) || - !SpeedRSAKeyGen(selected)) { + !SpeedRSAKeyGen(selected) || + !SpeedHRSS(selected)) { return false; } diff --git a/src/util/generate_build_files.py b/src/util/generate_build_files.py index 9c635dcf..2a6fe3f2 100644 --- a/src/util/generate_build_files.py +++ b/src/util/generate_build_files.py @@ -44,6 +44,9 @@ NON_PERL_FILES = { 'src/crypto/curve25519/asm/x25519-asm-arm.S', 'src/crypto/poly1305/poly1305_arm_asm.S', ], + ('linux', 'x86_64'): [ + 'src/crypto/hrss/asm/poly_rq_mul.S', + ], } PREFIX = None diff --git a/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm b/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm index 7dc0c5ac..923c9fa9 100644 --- a/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm +++ b/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm @@ -1609,1020 +1609,6 @@ $L$ctr_enc_epilogue: DB 0F3h,0C3h ;repret -global bsaes_xts_encrypt - -ALIGN 16 -bsaes_xts_encrypt: - - mov rax,rsp -$L$xts_enc_prologue: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - lea rsp,[((-72))+rsp] - - mov r10,QWORD[160+rsp] - mov r11,QWORD[168+rsp] - lea rsp,[((-160))+rsp] - movaps XMMWORD[64+rsp],xmm6 - movaps XMMWORD[80+rsp],xmm7 - movaps XMMWORD[96+rsp],xmm8 - movaps XMMWORD[112+rsp],xmm9 - movaps XMMWORD[128+rsp],xmm10 - movaps XMMWORD[144+rsp],xmm11 - movaps XMMWORD[160+rsp],xmm12 - movaps XMMWORD[176+rsp],xmm13 - movaps XMMWORD[192+rsp],xmm14 - movaps XMMWORD[208+rsp],xmm15 -$L$xts_enc_body: - mov rbp,rsp - - mov r12,rcx - mov r13,rdx - mov r14,r8 - mov r15,r9 - - lea rcx,[r11] - lea rdx,[32+rbp] - lea r8,[r10] - call aes_nohw_encrypt - - mov eax,DWORD[240+r15] - mov 
rbx,r14 - - mov edx,eax - shl rax,7 - sub rax,96 - sub rsp,rax - - mov rax,rsp - mov rcx,r15 - mov r10d,edx - call _bsaes_key_convert - pxor xmm7,xmm6 - movdqa XMMWORD[rax],xmm7 - - and r14,-16 - sub rsp,0x80 - movdqa xmm6,XMMWORD[32+rbp] - - pxor xmm14,xmm14 - movdqa xmm12,XMMWORD[$L$xts_magic] - pcmpgtd xmm14,xmm6 - - sub r14,0x80 - jc NEAR $L$xts_enc_short - jmp NEAR $L$xts_enc_loop - -ALIGN 16 -$L$xts_enc_loop: - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm15,xmm6 - movdqa XMMWORD[rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm0,xmm6 - movdqa XMMWORD[16+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm7,XMMWORD[r12] - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm1,xmm6 - movdqa XMMWORD[32+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm8,XMMWORD[16+r12] - pxor xmm15,xmm7 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm2,xmm6 - movdqa XMMWORD[48+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm0,xmm8 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm3,xmm6 - movdqa XMMWORD[64+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm1,xmm9 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm4,xmm6 - movdqa XMMWORD[80+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm11,XMMWORD[64+r12] - pxor xmm2,xmm10 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm5,xmm6 - movdqa XMMWORD[96+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm12,XMMWORD[80+r12] - pxor xmm3,xmm11 - movdqu xmm13,XMMWORD[96+r12] - pxor xmm4,xmm12 - movdqu xmm14,XMMWORD[112+r12] - lea r12,[128+r12] - movdqa 
XMMWORD[112+rsp],xmm6 - pxor xmm5,xmm13 - lea rax,[128+rsp] - pxor xmm6,xmm14 - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm3 - pxor xmm2,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm5 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm2 - pxor xmm1,XMMWORD[96+rsp] - movdqu XMMWORD[80+r13],xmm6 - pxor xmm4,XMMWORD[112+rsp] - movdqu XMMWORD[96+r13],xmm1 - movdqu XMMWORD[112+r13],xmm4 - lea r13,[128+r13] - - movdqa xmm6,XMMWORD[112+rsp] - pxor xmm14,xmm14 - movdqa xmm12,XMMWORD[$L$xts_magic] - pcmpgtd xmm14,xmm6 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - - sub r14,0x80 - jnc NEAR $L$xts_enc_loop - -$L$xts_enc_short: - add r14,0x80 - jz NEAR $L$xts_enc_done - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm15,xmm6 - movdqa XMMWORD[rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm0,xmm6 - movdqa XMMWORD[16+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm7,XMMWORD[r12] - cmp r14,16 - je NEAR $L$xts_enc_1 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm1,xmm6 - movdqa XMMWORD[32+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm8,XMMWORD[16+r12] - cmp r14,32 - je NEAR $L$xts_enc_2 - pxor xmm15,xmm7 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm2,xmm6 - movdqa XMMWORD[48+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm9,XMMWORD[32+r12] - cmp r14,48 - je NEAR $L$xts_enc_3 - pxor xmm0,xmm8 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm3,xmm6 - movdqa XMMWORD[64+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 
- pxor xmm6,xmm13 - movdqu xmm10,XMMWORD[48+r12] - cmp r14,64 - je NEAR $L$xts_enc_4 - pxor xmm1,xmm9 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm4,xmm6 - movdqa XMMWORD[80+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm11,XMMWORD[64+r12] - cmp r14,80 - je NEAR $L$xts_enc_5 - pxor xmm2,xmm10 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm5,xmm6 - movdqa XMMWORD[96+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm12,XMMWORD[80+r12] - cmp r14,96 - je NEAR $L$xts_enc_6 - pxor xmm3,xmm11 - movdqu xmm13,XMMWORD[96+r12] - pxor xmm4,xmm12 - movdqa XMMWORD[112+rsp],xmm6 - lea r12,[112+r12] - pxor xmm5,xmm13 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm3 - pxor xmm2,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm5 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm2 - pxor xmm1,XMMWORD[96+rsp] - movdqu XMMWORD[80+r13],xmm6 - movdqu XMMWORD[96+r13],xmm1 - lea r13,[112+r13] - - movdqa xmm6,XMMWORD[112+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_6: - pxor xmm3,xmm11 - lea r12,[96+r12] - pxor xmm4,xmm12 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm3 - pxor xmm2,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm5 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm2 - movdqu XMMWORD[80+r13],xmm6 - lea r13,[96+r13] - - movdqa xmm6,XMMWORD[96+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_5: - pxor xmm2,xmm10 - lea r12,[80+r12] - pxor xmm3,xmm11 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor 
xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm3 - pxor xmm2,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm5 - movdqu XMMWORD[64+r13],xmm2 - lea r13,[80+r13] - - movdqa xmm6,XMMWORD[80+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_4: - pxor xmm1,xmm9 - lea r12,[64+r12] - pxor xmm2,xmm10 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm5,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm3 - movdqu XMMWORD[48+r13],xmm5 - lea r13,[64+r13] - - movdqa xmm6,XMMWORD[64+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_3: - pxor xmm0,xmm8 - lea r12,[48+r12] - pxor xmm1,xmm9 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm3,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm3 - lea r13,[48+r13] - - movdqa xmm6,XMMWORD[48+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_2: - pxor xmm15,xmm7 - lea r12,[32+r12] - pxor xmm0,xmm8 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_encrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - lea r13,[32+r13] - - movdqa xmm6,XMMWORD[32+rsp] - jmp NEAR $L$xts_enc_done -ALIGN 16 -$L$xts_enc_1: - pxor xmm7,xmm15 - lea r12,[16+r12] - movdqa XMMWORD[32+rbp],xmm7 - lea rcx,[32+rbp] - lea rdx,[32+rbp] - lea r8,[r15] - call aes_nohw_encrypt - pxor xmm15,XMMWORD[32+rbp] - - - - - - movdqu XMMWORD[r13],xmm15 - lea r13,[16+r13] - - movdqa xmm6,XMMWORD[16+rsp] - -$L$xts_enc_done: - and ebx,15 - jz NEAR $L$xts_enc_ret - mov rdx,r13 - -$L$xts_enc_steal: - movzx eax,BYTE[r12] - movzx ecx,BYTE[((-16))+rdx] - lea r12,[1+r12] - mov BYTE[((-16))+rdx],al 
- mov BYTE[rdx],cl - lea rdx,[1+rdx] - sub ebx,1 - jnz NEAR $L$xts_enc_steal - - movdqu xmm15,XMMWORD[((-16))+r13] - lea rcx,[32+rbp] - pxor xmm15,xmm6 - lea rdx,[32+rbp] - movdqa XMMWORD[32+rbp],xmm15 - lea r8,[r15] - call aes_nohw_encrypt - pxor xmm6,XMMWORD[32+rbp] - movdqu XMMWORD[(-16)+r13],xmm6 - -$L$xts_enc_ret: - lea rax,[rsp] - pxor xmm0,xmm0 -$L$xts_enc_bzero: - movdqa XMMWORD[rax],xmm0 - movdqa XMMWORD[16+rax],xmm0 - lea rax,[32+rax] - cmp rbp,rax - ja NEAR $L$xts_enc_bzero - - lea rax,[120+rbp] - - movaps xmm6,XMMWORD[64+rbp] - movaps xmm7,XMMWORD[80+rbp] - movaps xmm8,XMMWORD[96+rbp] - movaps xmm9,XMMWORD[112+rbp] - movaps xmm10,XMMWORD[128+rbp] - movaps xmm11,XMMWORD[144+rbp] - movaps xmm12,XMMWORD[160+rbp] - movaps xmm13,XMMWORD[176+rbp] - movaps xmm14,XMMWORD[192+rbp] - movaps xmm15,XMMWORD[208+rbp] - lea rax,[160+rax] -$L$xts_enc_tail: - mov r15,QWORD[((-48))+rax] - - mov r14,QWORD[((-40))+rax] - - mov r13,QWORD[((-32))+rax] - - mov r12,QWORD[((-24))+rax] - - mov rbx,QWORD[((-16))+rax] - - mov rbp,QWORD[((-8))+rax] - - lea rsp,[rax] - -$L$xts_enc_epilogue: - DB 0F3h,0C3h ;repret - - - -global bsaes_xts_decrypt - -ALIGN 16 -bsaes_xts_decrypt: - - mov rax,rsp -$L$xts_dec_prologue: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - lea rsp,[((-72))+rsp] - - mov r10,QWORD[160+rsp] - mov r11,QWORD[168+rsp] - lea rsp,[((-160))+rsp] - movaps XMMWORD[64+rsp],xmm6 - movaps XMMWORD[80+rsp],xmm7 - movaps XMMWORD[96+rsp],xmm8 - movaps XMMWORD[112+rsp],xmm9 - movaps XMMWORD[128+rsp],xmm10 - movaps XMMWORD[144+rsp],xmm11 - movaps XMMWORD[160+rsp],xmm12 - movaps XMMWORD[176+rsp],xmm13 - movaps XMMWORD[192+rsp],xmm14 - movaps XMMWORD[208+rsp],xmm15 -$L$xts_dec_body: - mov rbp,rsp - mov r12,rcx - mov r13,rdx - mov r14,r8 - mov r15,r9 - - lea rcx,[r11] - lea rdx,[32+rbp] - lea r8,[r10] - call aes_nohw_encrypt - - mov eax,DWORD[240+r15] - mov rbx,r14 - - mov edx,eax - shl rax,7 - sub rax,96 - sub rsp,rax - - mov rax,rsp - mov rcx,r15 - 
mov r10d,edx - call _bsaes_key_convert - pxor xmm7,XMMWORD[rsp] - movdqa XMMWORD[rax],xmm6 - movdqa XMMWORD[rsp],xmm7 - - xor eax,eax - and r14,-16 - test ebx,15 - setnz al - shl rax,4 - sub r14,rax - - sub rsp,0x80 - movdqa xmm6,XMMWORD[32+rbp] - - pxor xmm14,xmm14 - movdqa xmm12,XMMWORD[$L$xts_magic] - pcmpgtd xmm14,xmm6 - - sub r14,0x80 - jc NEAR $L$xts_dec_short - jmp NEAR $L$xts_dec_loop - -ALIGN 16 -$L$xts_dec_loop: - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm15,xmm6 - movdqa XMMWORD[rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm0,xmm6 - movdqa XMMWORD[16+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm7,XMMWORD[r12] - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm1,xmm6 - movdqa XMMWORD[32+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm8,XMMWORD[16+r12] - pxor xmm15,xmm7 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm2,xmm6 - movdqa XMMWORD[48+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm9,XMMWORD[32+r12] - pxor xmm0,xmm8 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm3,xmm6 - movdqa XMMWORD[64+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm10,XMMWORD[48+r12] - pxor xmm1,xmm9 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm4,xmm6 - movdqa XMMWORD[80+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm11,XMMWORD[64+r12] - pxor xmm2,xmm10 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm5,xmm6 - movdqa XMMWORD[96+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm12,XMMWORD[80+r12] - pxor xmm3,xmm11 - movdqu xmm13,XMMWORD[96+r12] - pxor xmm4,xmm12 - movdqu xmm14,XMMWORD[112+r12] - lea r12,[128+r12] - movdqa 
XMMWORD[112+rsp],xmm6 - pxor xmm5,xmm13 - lea rax,[128+rsp] - pxor xmm6,xmm14 - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm3,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm5 - pxor xmm1,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm3 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm1 - pxor xmm2,XMMWORD[96+rsp] - movdqu XMMWORD[80+r13],xmm6 - pxor xmm4,XMMWORD[112+rsp] - movdqu XMMWORD[96+r13],xmm2 - movdqu XMMWORD[112+r13],xmm4 - lea r13,[128+r13] - - movdqa xmm6,XMMWORD[112+rsp] - pxor xmm14,xmm14 - movdqa xmm12,XMMWORD[$L$xts_magic] - pcmpgtd xmm14,xmm6 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - - sub r14,0x80 - jnc NEAR $L$xts_dec_loop - -$L$xts_dec_short: - add r14,0x80 - jz NEAR $L$xts_dec_done - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm15,xmm6 - movdqa XMMWORD[rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm0,xmm6 - movdqa XMMWORD[16+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm7,XMMWORD[r12] - cmp r14,16 - je NEAR $L$xts_dec_1 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm1,xmm6 - movdqa XMMWORD[32+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm8,XMMWORD[16+r12] - cmp r14,32 - je NEAR $L$xts_dec_2 - pxor xmm15,xmm7 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm2,xmm6 - movdqa XMMWORD[48+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm9,XMMWORD[32+r12] - cmp r14,48 - je NEAR $L$xts_dec_3 - pxor xmm0,xmm8 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm3,xmm6 - movdqa XMMWORD[64+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 
- pxor xmm6,xmm13 - movdqu xmm10,XMMWORD[48+r12] - cmp r14,64 - je NEAR $L$xts_dec_4 - pxor xmm1,xmm9 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm4,xmm6 - movdqa XMMWORD[80+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm11,XMMWORD[64+r12] - cmp r14,80 - je NEAR $L$xts_dec_5 - pxor xmm2,xmm10 - pshufd xmm13,xmm14,0x13 - pxor xmm14,xmm14 - movdqa xmm5,xmm6 - movdqa XMMWORD[96+rsp],xmm6 - paddq xmm6,xmm6 - pand xmm13,xmm12 - pcmpgtd xmm14,xmm6 - pxor xmm6,xmm13 - movdqu xmm12,XMMWORD[80+r12] - cmp r14,96 - je NEAR $L$xts_dec_6 - pxor xmm3,xmm11 - movdqu xmm13,XMMWORD[96+r12] - pxor xmm4,xmm12 - movdqa XMMWORD[112+rsp],xmm6 - lea r12,[112+r12] - pxor xmm5,xmm13 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm3,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm5 - pxor xmm1,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm3 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm1 - pxor xmm2,XMMWORD[96+rsp] - movdqu XMMWORD[80+r13],xmm6 - movdqu XMMWORD[96+r13],xmm2 - lea r13,[112+r13] - - movdqa xmm6,XMMWORD[112+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_6: - pxor xmm3,xmm11 - lea r12,[96+r12] - pxor xmm4,xmm12 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm3,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm5 - pxor xmm1,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm3 - pxor xmm6,XMMWORD[80+rsp] - movdqu XMMWORD[64+r13],xmm1 - movdqu XMMWORD[80+r13],xmm6 - lea r13,[96+r13] - - movdqa xmm6,XMMWORD[96+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_5: - pxor xmm2,xmm10 - lea r12,[80+r12] - pxor xmm3,xmm11 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor 
xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm3,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm5 - pxor xmm1,XMMWORD[64+rsp] - movdqu XMMWORD[48+r13],xmm3 - movdqu XMMWORD[64+r13],xmm1 - lea r13,[80+r13] - - movdqa xmm6,XMMWORD[80+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_4: - pxor xmm1,xmm9 - lea r12,[64+r12] - pxor xmm2,xmm10 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - pxor xmm3,XMMWORD[48+rsp] - movdqu XMMWORD[32+r13],xmm5 - movdqu XMMWORD[48+r13],xmm3 - lea r13,[64+r13] - - movdqa xmm6,XMMWORD[64+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_3: - pxor xmm0,xmm8 - lea r12,[48+r12] - pxor xmm1,xmm9 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - pxor xmm5,XMMWORD[32+rsp] - movdqu XMMWORD[16+r13],xmm0 - movdqu XMMWORD[32+r13],xmm5 - lea r13,[48+r13] - - movdqa xmm6,XMMWORD[48+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_2: - pxor xmm15,xmm7 - lea r12,[32+r12] - pxor xmm0,xmm8 - lea rax,[128+rsp] - mov r10d,edx - - call _bsaes_decrypt8 - - pxor xmm15,XMMWORD[rsp] - pxor xmm0,XMMWORD[16+rsp] - movdqu XMMWORD[r13],xmm15 - movdqu XMMWORD[16+r13],xmm0 - lea r13,[32+r13] - - movdqa xmm6,XMMWORD[32+rsp] - jmp NEAR $L$xts_dec_done -ALIGN 16 -$L$xts_dec_1: - pxor xmm7,xmm15 - lea r12,[16+r12] - movdqa XMMWORD[32+rbp],xmm7 - lea rcx,[32+rbp] - lea rdx,[32+rbp] - lea r8,[r15] - call aes_nohw_decrypt - pxor xmm15,XMMWORD[32+rbp] - - - - - - movdqu XMMWORD[r13],xmm15 - lea r13,[16+r13] - - movdqa xmm6,XMMWORD[16+rsp] - -$L$xts_dec_done: - and ebx,15 - jz NEAR $L$xts_dec_ret - - pxor xmm14,xmm14 - movdqa xmm12,XMMWORD[$L$xts_magic] - pcmpgtd xmm14,xmm6 - pshufd xmm13,xmm14,0x13 - movdqa xmm5,xmm6 - paddq 
xmm6,xmm6 - pand xmm13,xmm12 - movdqu xmm15,XMMWORD[r12] - pxor xmm6,xmm13 - - lea rcx,[32+rbp] - pxor xmm15,xmm6 - lea rdx,[32+rbp] - movdqa XMMWORD[32+rbp],xmm15 - lea r8,[r15] - call aes_nohw_decrypt - pxor xmm6,XMMWORD[32+rbp] - mov rdx,r13 - movdqu XMMWORD[r13],xmm6 - -$L$xts_dec_steal: - movzx eax,BYTE[16+r12] - movzx ecx,BYTE[rdx] - lea r12,[1+r12] - mov BYTE[rdx],al - mov BYTE[16+rdx],cl - lea rdx,[1+rdx] - sub ebx,1 - jnz NEAR $L$xts_dec_steal - - movdqu xmm15,XMMWORD[r13] - lea rcx,[32+rbp] - pxor xmm15,xmm5 - lea rdx,[32+rbp] - movdqa XMMWORD[32+rbp],xmm15 - lea r8,[r15] - call aes_nohw_decrypt - pxor xmm5,XMMWORD[32+rbp] - movdqu XMMWORD[r13],xmm5 - -$L$xts_dec_ret: - lea rax,[rsp] - pxor xmm0,xmm0 -$L$xts_dec_bzero: - movdqa XMMWORD[rax],xmm0 - movdqa XMMWORD[16+rax],xmm0 - lea rax,[32+rax] - cmp rbp,rax - ja NEAR $L$xts_dec_bzero - - lea rax,[120+rbp] - - movaps xmm6,XMMWORD[64+rbp] - movaps xmm7,XMMWORD[80+rbp] - movaps xmm8,XMMWORD[96+rbp] - movaps xmm9,XMMWORD[112+rbp] - movaps xmm10,XMMWORD[128+rbp] - movaps xmm11,XMMWORD[144+rbp] - movaps xmm12,XMMWORD[160+rbp] - movaps xmm13,XMMWORD[176+rbp] - movaps xmm14,XMMWORD[192+rbp] - movaps xmm15,XMMWORD[208+rbp] - lea rax,[160+rax] -$L$xts_dec_tail: - mov r15,QWORD[((-48))+rax] - - mov r14,QWORD[((-40))+rax] - - mov r13,QWORD[((-32))+rax] - - mov r12,QWORD[((-24))+rax] - - mov rbx,QWORD[((-16))+rax] - - mov rbp,QWORD[((-8))+rax] - - lea rsp,[rax] - -$L$xts_dec_epilogue: - DB 0F3h,0C3h ;repret - - ALIGN 64 _bsaes_const: @@ -2786,14 +1772,6 @@ ALIGN 4 DD $L$ctr_enc_epilogue wrt ..imagebase DD $L$ctr_enc_info wrt ..imagebase - DD $L$xts_enc_prologue wrt ..imagebase - DD $L$xts_enc_epilogue wrt ..imagebase - DD $L$xts_enc_info wrt ..imagebase - - DD $L$xts_dec_prologue wrt ..imagebase - DD $L$xts_dec_epilogue wrt ..imagebase - DD $L$xts_dec_info wrt ..imagebase - section .xdata rdata align=8 ALIGN 8 $L$cbc_dec_info: @@ -2808,15 +1786,3 @@ DB 9,0,0,0 DD $L$ctr_enc_body wrt ..imagebase,$L$ctr_enc_epilogue wrt 
..imagebase DD $L$ctr_enc_tail wrt ..imagebase DD 0 -$L$xts_enc_info: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$xts_enc_body wrt ..imagebase,$L$xts_enc_epilogue wrt ..imagebase - DD $L$xts_enc_tail wrt ..imagebase - DD 0 -$L$xts_dec_info: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$xts_dec_body wrt ..imagebase,$L$xts_dec_epilogue wrt ..imagebase - DD $L$xts_dec_tail wrt ..imagebase - DD 0 diff --git a/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm b/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm index ea6c4f17..33dc2c2e 100644 --- a/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm +++ b/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm @@ -31,8 +31,6 @@ $L$SEH_begin_sha512_block_data_order: mov r9d,DWORD[r11] mov r10d,DWORD[4+r11] mov r11d,DWORD[8+r11] - test r10d,2048 - jnz NEAR $L$xop_shortcut and r9d,1073741824 and r10d,268435968 or r10d,r9d @@ -1839,1130 +1837,6 @@ DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 DB 111,114,103,62,0 ALIGN 64 -sha512_block_data_order_xop: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_sha512_block_data_order_xop: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - -$L$xop_shortcut: - mov rax,rsp - - push rbx - - push rbp - - push r12 - - push r13 - - push r14 - - push r15 - - shl rdx,4 - sub rsp,256 - lea rdx,[rdx*8+rsi] - and rsp,-64 - mov QWORD[((128+0))+rsp],rdi - mov QWORD[((128+8))+rsp],rsi - mov QWORD[((128+16))+rsp],rdx - mov QWORD[152+rsp],rax - - movaps XMMWORD[(128+32)+rsp],xmm6 - movaps XMMWORD[(128+48)+rsp],xmm7 - movaps XMMWORD[(128+64)+rsp],xmm8 - movaps XMMWORD[(128+80)+rsp],xmm9 - movaps XMMWORD[(128+96)+rsp],xmm10 - movaps XMMWORD[(128+112)+rsp],xmm11 -$L$prologue_xop: - - vzeroupper - mov rax,QWORD[rdi] - mov rbx,QWORD[8+rdi] - mov rcx,QWORD[16+rdi] - mov rdx,QWORD[24+rdi] - mov r8,QWORD[32+rdi] - mov r9,QWORD[40+rdi] - mov r10,QWORD[48+rdi] - mov r11,QWORD[56+rdi] - jmp NEAR $L$loop_xop -ALIGN 16 -$L$loop_xop: - vmovdqa 
xmm11,XMMWORD[((K512+1280))] - vmovdqu xmm0,XMMWORD[rsi] - lea rbp,[((K512+128))] - vmovdqu xmm1,XMMWORD[16+rsi] - vmovdqu xmm2,XMMWORD[32+rsi] - vpshufb xmm0,xmm0,xmm11 - vmovdqu xmm3,XMMWORD[48+rsi] - vpshufb xmm1,xmm1,xmm11 - vmovdqu xmm4,XMMWORD[64+rsi] - vpshufb xmm2,xmm2,xmm11 - vmovdqu xmm5,XMMWORD[80+rsi] - vpshufb xmm3,xmm3,xmm11 - vmovdqu xmm6,XMMWORD[96+rsi] - vpshufb xmm4,xmm4,xmm11 - vmovdqu xmm7,XMMWORD[112+rsi] - vpshufb xmm5,xmm5,xmm11 - vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp] - vpshufb xmm6,xmm6,xmm11 - vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp] - vpshufb xmm7,xmm7,xmm11 - vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] - vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp] - vmovdqa XMMWORD[rsp],xmm8 - vpaddq xmm8,xmm4,XMMWORD[rbp] - vmovdqa XMMWORD[16+rsp],xmm9 - vpaddq xmm9,xmm5,XMMWORD[32+rbp] - vmovdqa XMMWORD[32+rsp],xmm10 - vpaddq xmm10,xmm6,XMMWORD[64+rbp] - vmovdqa XMMWORD[48+rsp],xmm11 - vpaddq xmm11,xmm7,XMMWORD[96+rbp] - vmovdqa XMMWORD[64+rsp],xmm8 - mov r14,rax - vmovdqa XMMWORD[80+rsp],xmm9 - mov rdi,rbx - vmovdqa XMMWORD[96+rsp],xmm10 - xor rdi,rcx - vmovdqa XMMWORD[112+rsp],xmm11 - mov r13,r8 - jmp NEAR $L$xop_00_47 - -ALIGN 16 -$L$xop_00_47: - add rbp,256 - vpalignr xmm8,xmm1,xmm0,8 - ror r13,23 - mov rax,r14 - vpalignr xmm11,xmm5,xmm4,8 - mov r12,r9 - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,r8 - xor r12,r10 - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,rax - vpaddq xmm0,xmm0,xmm11 - and r12,r8 - xor r13,r8 - add r11,QWORD[rsp] - mov r15,rax -DB 143,72,120,195,209,7 - xor r12,r10 - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,rbx - add r11,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,223,3 - xor r14,rax - add r11,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rbx - ror r14,28 - vpsrlq xmm10,xmm7,6 - add rdx,r11 - add r11,rdi - vpaddq xmm0,xmm0,xmm8 - mov r13,rdx - add r14,r11 -DB 143,72,120,195,203,42 - ror r13,23 - mov r11,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,r8 - ror r14,5 - xor r13,rdx - xor r12,r9 - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,r11 - and 
r12,rdx - xor r13,rdx - vpaddq xmm0,xmm0,xmm11 - add r10,QWORD[8+rsp] - mov rdi,r11 - xor r12,r9 - ror r14,6 - vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp] - xor rdi,rax - add r10,r12 - ror r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - ror r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - vmovdqa XMMWORD[rsp],xmm10 - vpalignr xmm8,xmm2,xmm1,8 - ror r13,23 - mov r10,r14 - vpalignr xmm11,xmm6,xmm5,8 - mov r12,rdx - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,rcx - xor r12,r8 - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,r10 - vpaddq xmm1,xmm1,xmm11 - and r12,rcx - xor r13,rcx - add r9,QWORD[16+rsp] - mov r15,r10 -DB 143,72,120,195,209,7 - xor r12,r8 - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,r11 - add r9,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,216,3 - xor r14,r10 - add r9,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r11 - ror r14,28 - vpsrlq xmm10,xmm0,6 - add rbx,r9 - add r9,rdi - vpaddq xmm1,xmm1,xmm8 - mov r13,rbx - add r14,r9 -DB 143,72,120,195,203,42 - ror r13,23 - mov r9,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,rcx - ror r14,5 - xor r13,rbx - xor r12,rdx - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - vpaddq xmm1,xmm1,xmm11 - add r8,QWORD[24+rsp] - mov rdi,r9 - xor r12,rdx - ror r14,6 - vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp] - xor rdi,r10 - add r8,r12 - ror r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - ror r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - vmovdqa XMMWORD[16+rsp],xmm10 - vpalignr xmm8,xmm3,xmm2,8 - ror r13,23 - mov r8,r14 - vpalignr xmm11,xmm7,xmm6,8 - mov r12,rbx - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,rax - xor r12,rcx - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,r8 - vpaddq xmm2,xmm2,xmm11 - and r12,rax - xor r13,rax - add rdx,QWORD[32+rsp] - mov r15,r8 -DB 143,72,120,195,209,7 - xor r12,rcx - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,r9 - add rdx,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,217,3 - xor r14,r8 - add rdx,r13 - vpxor 
xmm8,xmm8,xmm10 - xor rdi,r9 - ror r14,28 - vpsrlq xmm10,xmm1,6 - add r11,rdx - add rdx,rdi - vpaddq xmm2,xmm2,xmm8 - mov r13,r11 - add r14,rdx -DB 143,72,120,195,203,42 - ror r13,23 - mov rdx,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,rax - ror r14,5 - xor r13,r11 - xor r12,rbx - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - vpaddq xmm2,xmm2,xmm11 - add rcx,QWORD[40+rsp] - mov rdi,rdx - xor r12,rbx - ror r14,6 - vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] - xor rdi,r8 - add rcx,r12 - ror r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - ror r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - vmovdqa XMMWORD[32+rsp],xmm10 - vpalignr xmm8,xmm4,xmm3,8 - ror r13,23 - mov rcx,r14 - vpalignr xmm11,xmm0,xmm7,8 - mov r12,r11 - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,r10 - xor r12,rax - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,rcx - vpaddq xmm3,xmm3,xmm11 - and r12,r10 - xor r13,r10 - add rbx,QWORD[48+rsp] - mov r15,rcx -DB 143,72,120,195,209,7 - xor r12,rax - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,rdx - add rbx,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,218,3 - xor r14,rcx - add rbx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rdx - ror r14,28 - vpsrlq xmm10,xmm2,6 - add r9,rbx - add rbx,rdi - vpaddq xmm3,xmm3,xmm8 - mov r13,r9 - add r14,rbx -DB 143,72,120,195,203,42 - ror r13,23 - mov rbx,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,r10 - ror r14,5 - xor r13,r9 - xor r12,r11 - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - vpaddq xmm3,xmm3,xmm11 - add rax,QWORD[56+rsp] - mov rdi,rbx - xor r12,r11 - ror r14,6 - vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp] - xor rdi,rcx - add rax,r12 - ror r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - ror r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - vmovdqa XMMWORD[48+rsp],xmm10 - vpalignr xmm8,xmm5,xmm4,8 - ror r13,23 - mov rax,r14 - vpalignr xmm11,xmm1,xmm0,8 - mov r12,r9 - ror r14,5 -DB 143,72,120,195,200,56 - xor 
r13,r8 - xor r12,r10 - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,rax - vpaddq xmm4,xmm4,xmm11 - and r12,r8 - xor r13,r8 - add r11,QWORD[64+rsp] - mov r15,rax -DB 143,72,120,195,209,7 - xor r12,r10 - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,rbx - add r11,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,219,3 - xor r14,rax - add r11,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rbx - ror r14,28 - vpsrlq xmm10,xmm3,6 - add rdx,r11 - add r11,rdi - vpaddq xmm4,xmm4,xmm8 - mov r13,rdx - add r14,r11 -DB 143,72,120,195,203,42 - ror r13,23 - mov r11,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,r8 - ror r14,5 - xor r13,rdx - xor r12,r9 - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,r11 - and r12,rdx - xor r13,rdx - vpaddq xmm4,xmm4,xmm11 - add r10,QWORD[72+rsp] - mov rdi,r11 - xor r12,r9 - ror r14,6 - vpaddq xmm10,xmm4,XMMWORD[rbp] - xor rdi,rax - add r10,r12 - ror r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - ror r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - vmovdqa XMMWORD[64+rsp],xmm10 - vpalignr xmm8,xmm6,xmm5,8 - ror r13,23 - mov r10,r14 - vpalignr xmm11,xmm2,xmm1,8 - mov r12,rdx - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,rcx - xor r12,r8 - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,r10 - vpaddq xmm5,xmm5,xmm11 - and r12,rcx - xor r13,rcx - add r9,QWORD[80+rsp] - mov r15,r10 -DB 143,72,120,195,209,7 - xor r12,r8 - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,r11 - add r9,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,220,3 - xor r14,r10 - add r9,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r11 - ror r14,28 - vpsrlq xmm10,xmm4,6 - add rbx,r9 - add r9,rdi - vpaddq xmm5,xmm5,xmm8 - mov r13,rbx - add r14,r9 -DB 143,72,120,195,203,42 - ror r13,23 - mov r9,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,rcx - ror r14,5 - xor r13,rbx - xor r12,rdx - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - vpaddq xmm5,xmm5,xmm11 - add r8,QWORD[88+rsp] - mov rdi,r9 - xor r12,rdx - ror r14,6 - vpaddq xmm10,xmm5,XMMWORD[32+rbp] - xor rdi,r10 
- add r8,r12 - ror r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - ror r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - vmovdqa XMMWORD[80+rsp],xmm10 - vpalignr xmm8,xmm7,xmm6,8 - ror r13,23 - mov r8,r14 - vpalignr xmm11,xmm3,xmm2,8 - mov r12,rbx - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,rax - xor r12,rcx - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,r8 - vpaddq xmm6,xmm6,xmm11 - and r12,rax - xor r13,rax - add rdx,QWORD[96+rsp] - mov r15,r8 -DB 143,72,120,195,209,7 - xor r12,rcx - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,r9 - add rdx,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,221,3 - xor r14,r8 - add rdx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,r9 - ror r14,28 - vpsrlq xmm10,xmm5,6 - add r11,rdx - add rdx,rdi - vpaddq xmm6,xmm6,xmm8 - mov r13,r11 - add r14,rdx -DB 143,72,120,195,203,42 - ror r13,23 - mov rdx,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,rax - ror r14,5 - xor r13,r11 - xor r12,rbx - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - vpaddq xmm6,xmm6,xmm11 - add rcx,QWORD[104+rsp] - mov rdi,rdx - xor r12,rbx - ror r14,6 - vpaddq xmm10,xmm6,XMMWORD[64+rbp] - xor rdi,r8 - add rcx,r12 - ror r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - ror r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - vmovdqa XMMWORD[96+rsp],xmm10 - vpalignr xmm8,xmm0,xmm7,8 - ror r13,23 - mov rcx,r14 - vpalignr xmm11,xmm4,xmm3,8 - mov r12,r11 - ror r14,5 -DB 143,72,120,195,200,56 - xor r13,r10 - xor r12,rax - vpsrlq xmm8,xmm8,7 - ror r13,4 - xor r14,rcx - vpaddq xmm7,xmm7,xmm11 - and r12,r10 - xor r13,r10 - add rbx,QWORD[112+rsp] - mov r15,rcx -DB 143,72,120,195,209,7 - xor r12,rax - ror r14,6 - vpxor xmm8,xmm8,xmm9 - xor r15,rdx - add rbx,r12 - ror r13,14 - and rdi,r15 -DB 143,104,120,195,222,3 - xor r14,rcx - add rbx,r13 - vpxor xmm8,xmm8,xmm10 - xor rdi,rdx - ror r14,28 - vpsrlq xmm10,xmm6,6 - add r9,rbx - add rbx,rdi - vpaddq xmm7,xmm7,xmm8 - mov r13,r9 - add r14,rbx -DB 
143,72,120,195,203,42 - ror r13,23 - mov rbx,r14 - vpxor xmm11,xmm11,xmm10 - mov r12,r10 - ror r14,5 - xor r13,r9 - xor r12,r11 - vpxor xmm11,xmm11,xmm9 - ror r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - vpaddq xmm7,xmm7,xmm11 - add rax,QWORD[120+rsp] - mov rdi,rbx - xor r12,r11 - ror r14,6 - vpaddq xmm10,xmm7,XMMWORD[96+rbp] - xor rdi,rcx - add rax,r12 - ror r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - ror r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - vmovdqa XMMWORD[112+rsp],xmm10 - cmp BYTE[135+rbp],0 - jne NEAR $L$xop_00_47 - ror r13,23 - mov rax,r14 - mov r12,r9 - ror r14,5 - xor r13,r8 - xor r12,r10 - ror r13,4 - xor r14,rax - and r12,r8 - xor r13,r8 - add r11,QWORD[rsp] - mov r15,rax - xor r12,r10 - ror r14,6 - xor r15,rbx - add r11,r12 - ror r13,14 - and rdi,r15 - xor r14,rax - add r11,r13 - xor rdi,rbx - ror r14,28 - add rdx,r11 - add r11,rdi - mov r13,rdx - add r14,r11 - ror r13,23 - mov r11,r14 - mov r12,r8 - ror r14,5 - xor r13,rdx - xor r12,r9 - ror r13,4 - xor r14,r11 - and r12,rdx - xor r13,rdx - add r10,QWORD[8+rsp] - mov rdi,r11 - xor r12,r9 - ror r14,6 - xor rdi,rax - add r10,r12 - ror r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - ror r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - ror r13,23 - mov r10,r14 - mov r12,rdx - ror r14,5 - xor r13,rcx - xor r12,r8 - ror r13,4 - xor r14,r10 - and r12,rcx - xor r13,rcx - add r9,QWORD[16+rsp] - mov r15,r10 - xor r12,r8 - ror r14,6 - xor r15,r11 - add r9,r12 - ror r13,14 - and rdi,r15 - xor r14,r10 - add r9,r13 - xor rdi,r11 - ror r14,28 - add rbx,r9 - add r9,rdi - mov r13,rbx - add r14,r9 - ror r13,23 - mov r9,r14 - mov r12,rcx - ror r14,5 - xor r13,rbx - xor r12,rdx - ror r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - add r8,QWORD[24+rsp] - mov rdi,r9 - xor r12,rdx - ror r14,6 - xor rdi,r10 - add r8,r12 - ror r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - ror r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add 
r14,r8 - ror r13,23 - mov r8,r14 - mov r12,rbx - ror r14,5 - xor r13,rax - xor r12,rcx - ror r13,4 - xor r14,r8 - and r12,rax - xor r13,rax - add rdx,QWORD[32+rsp] - mov r15,r8 - xor r12,rcx - ror r14,6 - xor r15,r9 - add rdx,r12 - ror r13,14 - and rdi,r15 - xor r14,r8 - add rdx,r13 - xor rdi,r9 - ror r14,28 - add r11,rdx - add rdx,rdi - mov r13,r11 - add r14,rdx - ror r13,23 - mov rdx,r14 - mov r12,rax - ror r14,5 - xor r13,r11 - xor r12,rbx - ror r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - add rcx,QWORD[40+rsp] - mov rdi,rdx - xor r12,rbx - ror r14,6 - xor rdi,r8 - add rcx,r12 - ror r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - ror r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - ror r13,23 - mov rcx,r14 - mov r12,r11 - ror r14,5 - xor r13,r10 - xor r12,rax - ror r13,4 - xor r14,rcx - and r12,r10 - xor r13,r10 - add rbx,QWORD[48+rsp] - mov r15,rcx - xor r12,rax - ror r14,6 - xor r15,rdx - add rbx,r12 - ror r13,14 - and rdi,r15 - xor r14,rcx - add rbx,r13 - xor rdi,rdx - ror r14,28 - add r9,rbx - add rbx,rdi - mov r13,r9 - add r14,rbx - ror r13,23 - mov rbx,r14 - mov r12,r10 - ror r14,5 - xor r13,r9 - xor r12,r11 - ror r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - add rax,QWORD[56+rsp] - mov rdi,rbx - xor r12,r11 - ror r14,6 - xor rdi,rcx - add rax,r12 - ror r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - ror r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - ror r13,23 - mov rax,r14 - mov r12,r9 - ror r14,5 - xor r13,r8 - xor r12,r10 - ror r13,4 - xor r14,rax - and r12,r8 - xor r13,r8 - add r11,QWORD[64+rsp] - mov r15,rax - xor r12,r10 - ror r14,6 - xor r15,rbx - add r11,r12 - ror r13,14 - and rdi,r15 - xor r14,rax - add r11,r13 - xor rdi,rbx - ror r14,28 - add rdx,r11 - add r11,rdi - mov r13,rdx - add r14,r11 - ror r13,23 - mov r11,r14 - mov r12,r8 - ror r14,5 - xor r13,rdx - xor r12,r9 - ror r13,4 - xor r14,r11 - and r12,rdx - xor r13,rdx - add r10,QWORD[72+rsp] - mov rdi,r11 - xor r12,r9 - 
ror r14,6 - xor rdi,rax - add r10,r12 - ror r13,14 - and r15,rdi - xor r14,r11 - add r10,r13 - xor r15,rax - ror r14,28 - add rcx,r10 - add r10,r15 - mov r13,rcx - add r14,r10 - ror r13,23 - mov r10,r14 - mov r12,rdx - ror r14,5 - xor r13,rcx - xor r12,r8 - ror r13,4 - xor r14,r10 - and r12,rcx - xor r13,rcx - add r9,QWORD[80+rsp] - mov r15,r10 - xor r12,r8 - ror r14,6 - xor r15,r11 - add r9,r12 - ror r13,14 - and rdi,r15 - xor r14,r10 - add r9,r13 - xor rdi,r11 - ror r14,28 - add rbx,r9 - add r9,rdi - mov r13,rbx - add r14,r9 - ror r13,23 - mov r9,r14 - mov r12,rcx - ror r14,5 - xor r13,rbx - xor r12,rdx - ror r13,4 - xor r14,r9 - and r12,rbx - xor r13,rbx - add r8,QWORD[88+rsp] - mov rdi,r9 - xor r12,rdx - ror r14,6 - xor rdi,r10 - add r8,r12 - ror r13,14 - and r15,rdi - xor r14,r9 - add r8,r13 - xor r15,r10 - ror r14,28 - add rax,r8 - add r8,r15 - mov r13,rax - add r14,r8 - ror r13,23 - mov r8,r14 - mov r12,rbx - ror r14,5 - xor r13,rax - xor r12,rcx - ror r13,4 - xor r14,r8 - and r12,rax - xor r13,rax - add rdx,QWORD[96+rsp] - mov r15,r8 - xor r12,rcx - ror r14,6 - xor r15,r9 - add rdx,r12 - ror r13,14 - and rdi,r15 - xor r14,r8 - add rdx,r13 - xor rdi,r9 - ror r14,28 - add r11,rdx - add rdx,rdi - mov r13,r11 - add r14,rdx - ror r13,23 - mov rdx,r14 - mov r12,rax - ror r14,5 - xor r13,r11 - xor r12,rbx - ror r13,4 - xor r14,rdx - and r12,r11 - xor r13,r11 - add rcx,QWORD[104+rsp] - mov rdi,rdx - xor r12,rbx - ror r14,6 - xor rdi,r8 - add rcx,r12 - ror r13,14 - and r15,rdi - xor r14,rdx - add rcx,r13 - xor r15,r8 - ror r14,28 - add r10,rcx - add rcx,r15 - mov r13,r10 - add r14,rcx - ror r13,23 - mov rcx,r14 - mov r12,r11 - ror r14,5 - xor r13,r10 - xor r12,rax - ror r13,4 - xor r14,rcx - and r12,r10 - xor r13,r10 - add rbx,QWORD[112+rsp] - mov r15,rcx - xor r12,rax - ror r14,6 - xor r15,rdx - add rbx,r12 - ror r13,14 - and rdi,r15 - xor r14,rcx - add rbx,r13 - xor rdi,rdx - ror r14,28 - add r9,rbx - add rbx,rdi - mov r13,r9 - add r14,rbx - ror r13,23 - mov 
rbx,r14 - mov r12,r10 - ror r14,5 - xor r13,r9 - xor r12,r11 - ror r13,4 - xor r14,rbx - and r12,r9 - xor r13,r9 - add rax,QWORD[120+rsp] - mov rdi,rbx - xor r12,r11 - ror r14,6 - xor rdi,rcx - add rax,r12 - ror r13,14 - and r15,rdi - xor r14,rbx - add rax,r13 - xor r15,rcx - ror r14,28 - add r8,rax - add rax,r15 - mov r13,r8 - add r14,rax - mov rdi,QWORD[((128+0))+rsp] - mov rax,r14 - - add rax,QWORD[rdi] - lea rsi,[128+rsi] - add rbx,QWORD[8+rdi] - add rcx,QWORD[16+rdi] - add rdx,QWORD[24+rdi] - add r8,QWORD[32+rdi] - add r9,QWORD[40+rdi] - add r10,QWORD[48+rdi] - add r11,QWORD[56+rdi] - - cmp rsi,QWORD[((128+16))+rsp] - - mov QWORD[rdi],rax - mov QWORD[8+rdi],rbx - mov QWORD[16+rdi],rcx - mov QWORD[24+rdi],rdx - mov QWORD[32+rdi],r8 - mov QWORD[40+rdi],r9 - mov QWORD[48+rdi],r10 - mov QWORD[56+rdi],r11 - jb NEAR $L$loop_xop - - mov rsi,QWORD[152+rsp] - - vzeroupper - movaps xmm6,XMMWORD[((128+32))+rsp] - movaps xmm7,XMMWORD[((128+48))+rsp] - movaps xmm8,XMMWORD[((128+64))+rsp] - movaps xmm9,XMMWORD[((128+80))+rsp] - movaps xmm10,XMMWORD[((128+96))+rsp] - movaps xmm11,XMMWORD[((128+112))+rsp] - mov r15,QWORD[((-48))+rsi] - - mov r14,QWORD[((-40))+rsi] - - mov r13,QWORD[((-32))+rsi] - - mov r12,QWORD[((-24))+rsi] - - mov rbp,QWORD[((-16))+rsi] - - mov rbx,QWORD[((-8))+rsi] - - lea rsp,[rsi] - -$L$epilogue_xop: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret - -$L$SEH_end_sha512_block_data_order_xop: - -ALIGN 64 sha512_block_data_order_avx: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi @@ -4250,9 +3124,6 @@ ALIGN 4 DD $L$SEH_begin_sha512_block_data_order wrt ..imagebase DD $L$SEH_end_sha512_block_data_order wrt ..imagebase DD $L$SEH_info_sha512_block_data_order wrt ..imagebase - DD $L$SEH_begin_sha512_block_data_order_xop wrt ..imagebase - DD $L$SEH_end_sha512_block_data_order_xop wrt ..imagebase - DD $L$SEH_info_sha512_block_data_order_xop wrt ..imagebase DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase 
DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase DD $L$SEH_info_sha512_block_data_order_avx wrt ..imagebase @@ -4262,10 +3133,6 @@ $L$SEH_info_sha512_block_data_order: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase -$L$SEH_info_sha512_block_data_order_xop: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$prologue_xop wrt ..imagebase,$L$epilogue_xop wrt ..imagebase $L$SEH_info_sha512_block_data_order_avx: DB 9,0,0,0 DD se_handler wrt ..imagebase |