Diffstat (limited to 'src/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl')
-rw-r--r--  src/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl  299
1 file changed, 299 insertions, 0 deletions
diff --git a/src/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl b/src/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl
index 47d9972f..9429344b 100644
--- a/src/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl
+++ b/src/crypto/fipsmodule/aes/asm/vpaes-x86_64.pl
@@ -176,6 +176,181 @@ _vpaes_encrypt_core:
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
##
+## _vpaes_encrypt_core_2x
+##
+## AES-encrypt %xmm0 and %xmm6 in parallel.
+##
+## Inputs:
+## %xmm0 and %xmm6 = input
+## %xmm9 and %xmm10 as in _vpaes_preheat
+## (%rdx) = scheduled keys
+##
+## Output in %xmm0 and %xmm6
+## Clobbers %xmm1-%xmm5, %xmm7, %xmm8, %xmm11-%xmm13, %r9, %r10, %r11, %rax
+## Preserves %xmm14 and %xmm15
+##
+## This function stitches two parallel instances of _vpaes_encrypt_core. x86_64
+## provides 16 XMM registers. _vpaes_encrypt_core computes over six registers
+## (%xmm0-%xmm5) and additionally uses seven registers with preloaded constants
+## from _vpaes_preheat (%xmm9-%xmm15). Two instances do not quite fit, so some
+## of the %xmm9-%xmm15 constants are reloaded from memory rather than kept in
+## registers. We keep %xmm9 and %xmm10 in registers because those values are
+## used several times in a row; the remaining constants are needed only once
+## per round and are read from memory each time. This leaves two registers
+## (%xmm14 and %xmm15) preserved for the caller.
+##
+## Thus, of the two _vpaes_encrypt_core instances, the first uses %xmm0-%xmm5
+## as before. The second uses %xmm6-%xmm8 and %xmm11-%xmm13: add 6 to %xmm2 and
+## below, and add 8 to %xmm3 and up. Instructions belonging to the second
+## instance are indented by one space. A rough C analogy of this stitching is
+## sketched after the function body.
+##
+##
+.type _vpaes_encrypt_core_2x,\@abi-omnipotent
+.align 16
+_vpaes_encrypt_core_2x:
+.cfi_startproc
+ mov %rdx, %r9
+ mov \$16, %r11
+ mov 240(%rdx),%eax
+ movdqa %xmm9, %xmm1
+ movdqa %xmm9, %xmm7
+ movdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ movdqa %xmm2, %xmm8
+ pandn %xmm0, %xmm1
+ pandn %xmm6, %xmm7
+ movdqu (%r9), %xmm5 # round0 key
+ # Also use %xmm5 in the second instance.
+ psrld \$4, %xmm1
+ psrld \$4, %xmm7
+ pand %xmm9, %xmm0
+ pand %xmm9, %xmm6
+ pshufb %xmm0, %xmm2
+ pshufb %xmm6, %xmm8
+ movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
+ movdqa %xmm0, %xmm6
+ pshufb %xmm1, %xmm0
+ pshufb %xmm7, %xmm6
+ pxor %xmm5, %xmm2
+ pxor %xmm5, %xmm8
+ add \$16, %r9
+ pxor %xmm2, %xmm0
+ pxor %xmm8, %xmm6
+ lea .Lk_mc_backward(%rip),%r10
+ jmp .Lenc2x_entry
+
+.align 16
+.Lenc2x_loop:
+ # middle of middle round
+ movdqa .Lk_sb1(%rip), %xmm4 # 4 : sb1u
+ movdqa .Lk_sb1+16(%rip),%xmm0 # 0 : sb1t
+ movdqa %xmm4, %xmm12
+ movdqa %xmm0, %xmm6
+ pshufb %xmm2, %xmm4 # 4 = sb1u
+ pshufb %xmm8, %xmm12
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pshufb %xmm11, %xmm6
+ pxor %xmm5, %xmm4 # 4 = sb1u + k
+ pxor %xmm5, %xmm12
+ movdqa .Lk_sb2(%rip), %xmm5 # 4 : sb2u
+ movdqa %xmm5, %xmm13
+ pxor %xmm4, %xmm0 # 0 = A
+ pxor %xmm12, %xmm6
+ movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ # Also use %xmm1 in the second instance.
+ pshufb %xmm2, %xmm5 # 4 = sb2u
+ pshufb %xmm8, %xmm13
+ movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
+ # Also use %xmm4 in the second instance.
+ movdqa .Lk_sb2+16(%rip), %xmm2 # 2 : sb2t
+ movdqa %xmm2, %xmm8
+ pshufb %xmm3, %xmm2 # 2 = sb2t
+ pshufb %xmm11, %xmm8
+ movdqa %xmm0, %xmm3 # 3 = A
+ movdqa %xmm6, %xmm11
+ pxor %xmm5, %xmm2 # 2 = 2A
+ pxor %xmm13, %xmm8
+ pshufb %xmm1, %xmm0 # 0 = B
+ pshufb %xmm1, %xmm6
+ add \$16, %r9 # next key
+ pxor %xmm2, %xmm0 # 0 = 2A+B
+ pxor %xmm8, %xmm6
+ pshufb %xmm4, %xmm3 # 3 = D
+ pshufb %xmm4, %xmm11
+ add \$16, %r11 # next mc
+ pxor %xmm0, %xmm3 # 3 = 2A+B+D
+ pxor %xmm6, %xmm11
+ pshufb %xmm1, %xmm0 # 0 = 2B+C
+ pshufb %xmm1, %xmm6
+ and \$0x30, %r11 # ... mod 4
+ sub \$1,%rax # nr--
+ pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
+ pxor %xmm11, %xmm6
+
+.Lenc2x_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ movdqa %xmm9, %xmm7
+ movdqa .Lk_inv+16(%rip), %xmm5 # 2 : a/k
+ movdqa %xmm5, %xmm13
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ pandn %xmm6, %xmm7
+ psrld \$4, %xmm1 # 1 = i
+ psrld \$4, %xmm7
+ pand %xmm9, %xmm0 # 0 = k
+ pand %xmm9, %xmm6
+ pshufb %xmm0, %xmm5 # 2 = a/k
+ pshufb %xmm6, %xmm13
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ movdqa %xmm10, %xmm11
+ pxor %xmm1, %xmm0 # 0 = j
+ pxor %xmm7, %xmm6
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pshufb %xmm7, %xmm11
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ movdqa %xmm10, %xmm12
+ pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
+ pxor %xmm13, %xmm11
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pshufb %xmm6, %xmm12
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ movdqa %xmm10, %xmm8
+ pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
+ pxor %xmm13, %xmm12
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pshufb %xmm11, %xmm8
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ movdqa %xmm10, %xmm11
+ pxor %xmm0, %xmm2 # 2 = io
+ pxor %xmm6, %xmm8
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pshufb %xmm12, %xmm11
+ movdqu (%r9), %xmm5
+ # Also use %xmm5 in the second instance.
+ pxor %xmm1, %xmm3 # 3 = jo
+ pxor %xmm7, %xmm11
+ jnz .Lenc2x_loop
+
+ # middle of last round
+ movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ movdqa %xmm4, %xmm12
+ movdqa %xmm0, %xmm6
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pshufb %xmm8, %xmm12
+ pxor %xmm5, %xmm4 # 4 = sb1u + k
+ pxor %xmm5, %xmm12
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pshufb %xmm11, %xmm6
+ movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
+ # Also use %xmm1 in the second instance.
+ pxor %xmm4, %xmm0 # 0 = A
+ pxor %xmm12, %xmm6
+ pshufb %xmm1, %xmm0
+ pshufb %xmm1, %xmm6
+ ret
+.cfi_endproc
+.size _vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x
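The comment block above describes the stitching in terms of XMM registers. As a loose analogy only (this C sketch is illustrative and not part of the change), each loop iteration below advances two independent computations so the processor can overlap their dependency chains, which is the same reason the assembly interleaves one AES state held in %xmm0-%xmm5 with a second state held in %xmm6-%xmm8 and %xmm11-%xmm13:

#include <stdint.h>

/* Two independent toy "round" chains advanced in lockstep. Neither chain
 * depends on the other, so their instructions can execute in parallel. */
static void stitched_rounds(uint32_t *a, uint32_t *b, int rounds) {
  for (int r = 0; r < rounds; r++) {
    uint32_t ta = (*a << 7) | (*a >> 25); /* instance A: rotate left by 7 */
    uint32_t tb = (*b << 7) | (*b >> 25); /* instance B: same step, interleaved */
    *a = ta ^ 0x9e3779b9u;                /* instance A: mix in a constant */
    *b = tb ^ 0x9e3779b9u;                /* instance B: same step, interleaved */
  }
}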
+
+##
## Decryption core
##
## Same API as encryption core.
@@ -984,6 +1159,111 @@ $code.=<<___;
.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
}
+{
+my ($inp,$out,$blocks,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx","%r8");
+# void vpaes_ctr32_encrypt_blocks(const uint8_t *inp, uint8_t *out,
+# size_t blocks, const AES_KEY *key,
+# const uint8_t ivp[16]);
+$code.=<<___;
+.globl ${PREFIX}_ctr32_encrypt_blocks
+.type ${PREFIX}_ctr32_encrypt_blocks,\@function,5
+.align 16
+${PREFIX}_ctr32_encrypt_blocks:
+.cfi_startproc
+ # _vpaes_encrypt_core and _vpaes_encrypt_core_2x expect the key in %rdx.
+ xchg $key, $blocks
+___
+($blocks,$key)=($key,$blocks);
+$code.=<<___;
+ test $blocks, $blocks
+ jz .Lctr32_abort
+___
+$code.=<<___ if ($win64);
+ lea -0xb8(%rsp),%rsp
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
+.Lctr32_body:
+___
+$code.=<<___;
+ movdqu ($ivp), %xmm0 # Load IV.
+ movdqa .Lctr_add_one(%rip), %xmm8
+ sub $inp, $out # This allows only incrementing $inp.
+ call _vpaes_preheat
+ movdqa %xmm0, %xmm6
+ pshufb .Lrev_ctr(%rip), %xmm6
+
+ test \$1, $blocks
+ jz .Lctr32_prep_loop
+
+ # Handle one block so the remaining block count is even for
+ # _vpaes_encrypt_core_2x.
+ movdqu ($inp), %xmm7 # Load input.
+ call _vpaes_encrypt_core
+ pxor %xmm7, %xmm0
+ paddd %xmm8, %xmm6
+ movdqu %xmm0, ($out,$inp)
+ sub \$1, $blocks
+ lea 16($inp), $inp
+ jz .Lctr32_done
+
+.Lctr32_prep_loop:
+ # _vpaes_encrypt_core_2x leaves only %xmm14 and %xmm15 as spare
+ # registers. We maintain two byte-swapped counters in them.
+ movdqa %xmm6, %xmm14
+ movdqa %xmm6, %xmm15
+ paddd %xmm8, %xmm15
+
+.Lctr32_loop:
+ movdqa .Lrev_ctr(%rip), %xmm1 # Set up counters.
+ movdqa %xmm14, %xmm0
+ movdqa %xmm15, %xmm6
+ pshufb %xmm1, %xmm0
+ pshufb %xmm1, %xmm6
+ call _vpaes_encrypt_core_2x
+ movdqu ($inp), %xmm1 # Load input.
+ movdqu 16($inp), %xmm2
+ movdqa .Lctr_add_two(%rip), %xmm3
+ pxor %xmm1, %xmm0 # XOR input.
+ pxor %xmm2, %xmm6
+ paddd %xmm3, %xmm14 # Increment counters.
+ paddd %xmm3, %xmm15
+ movdqu %xmm0, ($out,$inp) # Write output.
+ movdqu %xmm6, 16($out,$inp)
+ sub \$2, $blocks # Advance loop.
+ lea 32($inp), $inp
+ jnz .Lctr32_loop
+
+.Lctr32_done:
+___
+$code.=<<___ if ($win64);
+ movaps 0x10(%rsp),%xmm6
+ movaps 0x20(%rsp),%xmm7
+ movaps 0x30(%rsp),%xmm8
+ movaps 0x40(%rsp),%xmm9
+ movaps 0x50(%rsp),%xmm10
+ movaps 0x60(%rsp),%xmm11
+ movaps 0x70(%rsp),%xmm12
+ movaps 0x80(%rsp),%xmm13
+ movaps 0x90(%rsp),%xmm14
+ movaps 0xa0(%rsp),%xmm15
+ lea 0xb8(%rsp),%rsp
+.Lctr32_epilogue:
+___
+$code.=<<___;
+.Lctr32_abort:
+ ret
+.cfi_endproc
+.size ${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks
+___
+}
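As a reference for readers, here is a hedged C model of what vpaes_ctr32_encrypt_blocks computes; it is a sketch only, not BoringSSL code. The last four IV bytes hold a 32-bit big-endian counter which wraps modulo 2^32 without carrying into the rest of the IV. The model processes one block at a time for clarity, whereas the assembly above peels off one block when the count is odd and then encrypts two counter blocks per iteration with _vpaes_encrypt_core_2x. The aes_encrypt_block callback is a hypothetical stand-in for a single AES block encryption under the scheduled key.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical single-block AES encryption under an already-scheduled key. */
typedef void (*aes_block_fn)(const uint8_t in[16], uint8_t out[16],
                             const void *key);

static void ctr32_model(const uint8_t *in, uint8_t *out, size_t blocks,
                        const void *key, const uint8_t ivp[16],
                        aes_block_fn aes_encrypt_block) {
  uint8_t iv[16], keystream[16];
  memcpy(iv, ivp, 16);
  while (blocks > 0) {
    aes_encrypt_block(iv, keystream, key); /* E_k(counter block) */
    for (int i = 0; i < 16; i++) {
      out[i] = in[i] ^ keystream[i];       /* XOR the keystream into the input */
    }
    /* Increment the big-endian counter in iv[12..15]; the carry never
     * propagates into iv[0..11]. */
    for (int i = 15; i >= 12; i--) {
      if (++iv[i] != 0) {
        break;
      }
    }
    in += 16;
    out += 16;
    blocks--;
  }
}

A caller would typically schedule the key with vpaes_set_encrypt_key before invoking vpaes_ctr32_encrypt_blocks; as in the assembly, a block count of zero is a no-op.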
$code.=<<___;
##
## _aes_preheat
@@ -1107,6 +1387,17 @@ _vpaes_consts:
.Lk_dsbo: # decryption sbox final output
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+
+# .Lrev_ctr is a permutation which byte-swaps the counter portion of the IV.
+.Lrev_ctr:
+ .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
+# .Lctr_add_* may be added to a byte-swapped xmm register to increment the
+# counter. The register must be byte-swapped again to form the actual input.
+.Lctr_add_one:
+ .quad 0x0000000000000000, 0x0000000100000000
+.Lctr_add_two:
+ .quad 0x0000000000000000, 0x0000000200000000
+
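The three constants above let the counter arithmetic stay entirely in XMM registers: pshufb with .Lrev_ctr turns the big-endian counter in the last four IV bytes into a host-order dword, paddd with .Lctr_add_one or .Lctr_add_two increments it, and a second pshufb with .Lrev_ctr (the mask is its own inverse) restores the byte order before encryption. A scalar C equivalent, offered as a sketch only (ctr32_add is a hypothetical helper, not part of this file):

#include <stdint.h>

/* Treat iv[12..15] as a 32-bit big-endian counter, add n, and write it back.
 * The addition wraps modulo 2^32, matching paddd on a single dword lane. */
static void ctr32_add(uint8_t iv[16], uint32_t n) {
  uint32_t ctr = ((uint32_t)iv[12] << 24) | ((uint32_t)iv[13] << 16) |
                 ((uint32_t)iv[14] << 8) | (uint32_t)iv[15];
  ctr += n;
  iv[12] = (uint8_t)(ctr >> 24);
  iv[13] = (uint8_t)(ctr >> 16);
  iv[14] = (uint8_t)(ctr >> 8);
  iv[15] = (uint8_t)ctr;
}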
.asciz "Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
.align 64
.size _vpaes_consts,.-_vpaes_consts
@@ -1222,6 +1513,10 @@ se_handler:
.rva .LSEH_end_${PREFIX}_cbc_encrypt
.rva .LSEH_info_${PREFIX}_cbc_encrypt
+ .rva .LSEH_begin_${PREFIX}_ctr32_encrypt_blocks
+ .rva .LSEH_end_${PREFIX}_ctr32_encrypt_blocks
+ .rva .LSEH_info_${PREFIX}_ctr32_encrypt_blocks
+
.section .xdata
.align 8
.LSEH_info_${PREFIX}_set_encrypt_key:
@@ -1244,6 +1539,10 @@ se_handler:
.byte 9,0,0,0
.rva se_handler
.rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
+.LSEH_info_${PREFIX}_ctr32_encrypt_blocks:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
___
}