Diffstat (limited to 'src/third_party/sike/asm/fp-x86_64.pl')
-rwxr-xr-x | src/third_party/sike/asm/fp-x86_64.pl | 1626
1 file changed, 0 insertions, 1626 deletions
diff --git a/src/third_party/sike/asm/fp-x86_64.pl b/src/third_party/sike/asm/fp-x86_64.pl deleted file mode 100755 index cffde1a8..00000000 --- a/src/third_party/sike/asm/fp-x86_64.pl +++ /dev/null @@ -1,1626 +0,0 @@ -#! /usr/bin/env perl -# -# April 2019 -# -# Abstract: field arithmetic in x64 assembly for SIDH/p434 - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; -*STDOUT=*OUT; - -$PREFIX="sike"; -$bmi2_adx = 1; - -$code.=<<___; -.text - -# p434 x 2 -.Lp434x2: -.quad 0xFFFFFFFFFFFFFFFE -.quad 0xFFFFFFFFFFFFFFFF -.quad 0xFB82ECF5C5FFFFFF -.quad 0xF78CB8F062B15D47 -.quad 0xD9F8BFAD038A40AC -.quad 0x0004683E4E2EE688 - -# p434 + 1 -.Lp434p1: -.quad 0xFDC1767AE3000000 -.quad 0x7BC65C783158AEA3 -.quad 0x6CFC5FD681C52056 -.quad 0x0002341F27177344 - -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P -___ - -# Jump to alternative implemenatation provided as an -# argument in case CPU supports ADOX/ADCX and MULX instructions. -sub alt_impl { - $jmp_func = shift; - - $body=<<___; - lea OPENSSL_ia32cap_P(%rip), %rcx - mov 8(%rcx), %rcx - and \$0x80100, %ecx - cmp \$0x80100, %ecx - je $jmp_func - -___ - return $body -} - -# Performs schoolbook multiplication of 2 192-bit numbers. Uses -# MULX instruction. Result is stored in 192 bits pointed by $DST. -sub mul192 { - my ($idxM0,$M0,$idxM1,$M1,$idxDST,$DST,$T0,$T1,$T2,$T3,$T4,$T5,$T6)=@_; - my ($ML0,$ML8,$ML16)=map("$idxM0+$_($M0)",(0,8,16)); - my ($MR0,$MR8,$MR16)=map("$idxM1+$_($M1)",(0,8,16)); - my ($D0,$D1,$D2,$D3,$D4,$D5)=map("$idxDST+$_($DST)",(0,8,16,24,32,40)); - - $body=<<___; - mov $ML0, %rdx - mulx $MR0, $T1, $T0 # T0:T1 = A0*B0 - mov $T1, $D0 # DST0 - mulx $MR8, $T2, $T1 # T1:T2 = A0*B1 - xor %rax, %rax - adox $T2, $T0 - mulx $MR16,$T3, $T2 # T2:T3 = A0*B2 - adox $T3, $T1 - - mov $ML8, %rdx - mulx $MR0, $T4, $T3 # T3:T4 = A1*B0 - adox %rax, $T2 - xor %rax, %rax - - mulx $MR8, $T6, $T5 # T6:T7 = A1*B1 - adox $T0, $T4 - mov $T4, $D1 # DST1 - adcx $T6, $T3 - - mulx $MR16,$T0, $T6 # T6:T0 = A1*B2 - adox $T1, $T3 - adcx $T0, $T5 - adcx %rax, $T6 - adox $T2, $T5 - - mov $ML16,%rdx - mulx $MR0, $T0, $T1 # T1:T0 = A2*B0 - adox %rax, $T6 - xor %rax, %rax - - mulx $MR8, $T2, $T4 # T4:T2 = A2*B1 - adox $T3, $T0 - mov $T0, $D2 # DST2 - adcx $T5, $T1 - - mulx $MR16,$T3, $T0 # T0:T3 = A2*B2 - adcx $T6, $T4 - adcx %rax, $T0 - adox $T2, $T1 - adox $T4, $T3 - adox %rax, $T0 - mov $T1, $D3 # DST3 - mov $T3, $D4 # DST4 - mov $T0, $D5 # DST5 - -___ - return $body; -} - -# Performs schoolbook multiplication of 2 256-bit numbers. Uses -# MULX instruction. Result is stored in 256 bits pointed by $DST. 
-sub mul256 { - my ($idxM0,$M0,$idxM1,$M1,$idxDST,$DST,$T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7,$T8,$T9)=@_; - my ($ML0,$ML8,$ML16,$ML24)=map("$idxM0+$_($M0)",(0,8,16,24)); - my ($MR0,$MR8,$MR16,$MR24)=map("$idxM1+$_($M1)",(0,8,16,24)); - my ($D0,$D1,$D2,$D3,$D4,$D5,$D6,$D7)=map("$idxDST+$_($DST)",(0,8,16,24,32,40,48,56)); - - $body=<<___; - mov $ML0, %rdx - mulx $MR0, $T1, $T0 # T0:T1 = A0*B0 - mov $T1, $D0 # DST0_final - mulx $MR8, $T2, $T1 # T1:T2 = A0*B1 - xor %rax, %rax - adox $T2, $T0 - mulx $MR16,$T3, $T2 # T2:T3 = A0*B2 - adox $T3, $T1 - mulx $MR24,$T4, $T3 # T3:T4 = A0*B3 - adox $T4, $T2 - - mov $ML8, %rdx - mulx $MR0, $T4, $T5 # T5:T4 = A1*B0 - adox %rax, $T3 - xor %rax, %rax - mulx $MR8, $T7, $T6 # T6:T7 = A1*B1 - adox $T0, $T4 - mov $T4, $D1 # DST1_final - adcx $T7, $T5 - mulx $MR16,$T8, $T7 # T7:T8 = A1*B2 - adcx $T8, $T6 - adox $T1, $T5 - mulx $MR24,$T9, $T8 # T8:T9 = A1*B3 - adcx $T9, $T7 - adcx %rax, $T8 - adox $T2, $T6 - - mov $ML16,%rdx - mulx $MR0, $T0, $T1 # T1:T0 = A2*B0 - adox $T3, $T7 - adox %rax, $T8 - xor %rax, %rax - mulx $MR8, $T3, $T2 # T2:T3 = A2*B1 - adox $T5, $T0 - mov $T0, $D2 # DST2_final - adcx $T3, $T1 - mulx $MR16,$T4, $T3 # T3:T4 = A2*B2 - adcx $T4, $T2 - adox $T6, $T1 - mulx $MR24,$T9, $T4 # T3:T4 = A2*B3 - adcx $T9, $T3 - adcx %rax, $T4 - - adox $T7, $T2 - adox $T8, $T3 - adox %rax, $T4 - - mov $ML24,%rdx - mulx $MR0, $T0, $T5 # T5:T0 = A3*B0 - xor %rax, %rax - mulx $MR8, $T7, $T6 # T6:T7 = A3*B1 - adcx $T7, $T5 - adox $T0, $T1 - mulx $MR16, $T8, $T7 # T7:T8 = A3*B2 - adcx $T8, $T6 - adox $T5, $T2 - mulx $MR24, $T9, $T8 # T8:T9 = A3*B3 - adcx $T9, $T7 - adcx %rax, $T8 - adox $T6, $T3 - adox $T7, $T4 - adox %rax, $T8 - mov $T1, $D3 # DST3_final - mov $T2, $D4 # DST4_final - mov $T3, $D5 # DST5_final - mov $T4, $D6 # DST6_final - mov $T8, $D7 # DST7_final - -___ - return $body; -} - -# Performs schoolbook multiplication of 64-bit with 256-bit -# number. -sub mul64x256 { - my ($idxM0,$M0,$M1,$T0,$T1,$T2,$T3,$T4,$T5)=@_; - my $body.=<<___; - mov $idxM0($M0), $T5 - - xor $T2, $T2 - mov 0+$M1, %rax - mul $T5 - mov %rax, $T0 # C0 - mov %rdx, $T1 - - xor $T3, $T3 - mov 8+$M1, %rax - mul $T5 - add %rax, $T1 # C1 - adc %rdx, $T2 - - xor $T4, $T4 - mov 16+$M1, %rax - mul $T5 - add %rax, $T2 # C2 - adc %rdx, $T3 - - mov 24+$M1, %rax - mul $T5 - add %rax, $T3 # C3 - adc %rdx, $T4 # C4 -___ - return $body; -} - -# Performs schoolbook multiplication of 64-bit with 256-bit -# number. Uses MULX and ADOX instructions. -sub mulx64x256 { - my ($idxM0,$M0,$M1,$T0,$T1,$T2,$T3,$T4,$T5)=@_; - my $body.=<<___; - xor %rax, %rax - mov $idxM0($M0), %rdx - mulx 0+$M1, $T0, $T1 # T0 <- C0 - mulx 8+$M1, $T4, $T2 - mulx 16+$M1, $T5, $T3 - - adox $T4, $T1 # T1 <- C1 - adox $T5, $T2 # T2 <- C2 - - mulx 24+$M1, $T5, $T4 - adox $T5, $T3 # T3 <- C3 - adox %rax, $T4 # T4 <- C4 -___ - return $body; -} - -# Performs schoolbook multiplication of 128-bit with 256-bit -# number. 
Destroys RAX and RDX -sub mul128x256 { - my ($idxMA,$MA,$MB,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1)=@_; - my ($MA0,$MA8)=map("$idxMA+$_($MA)", (0,8)); - my $body.=<<___; - # A0 x B0 - mov $MA0, $T0 - mov 0+$MB, %rax - mul $T0 - xor $C2, $C2 - mov %rax, $C0 # c0 - mov %rdx, $C1 - - # A0 x B1 - mov 8+$MB, %rax - mul $T0 - xor $C3, $C3 - add %rax, $C1 - adc %rdx, $C2 - - # A1 x B0 - mov $MA8, $T1 - mov 0+$MB, %rax - mul $T1 - add %rax, $C1 - adc %rdx, $C2 - adc \$0x0, $C3 - - # A0 x B2 - xor $C4, $C4 - mov 16+$MB, %rax - mul $T0 - add %rax, $C2 - adc %rdx, $C3 - adc \$0x0, $C4 - - # A1 x B1 - mov 8+$MB, %rax - mul $T1 - add %rax, $C2 # c2 - adc %rdx, $C3 - adc \$0x0, $C4 - - # A0 x B3 - mov 24+$MB, %rax - mul $T0 - xor $C5, $C5 - add %rax, $C3 - adc %rdx, $C4 - adc \$0x0, $C5 - - # A1 x B2 - mov 16+$MB, %rax - mul $T1 - add %rax, $C3 # c3 - adc %rdx, $C4 - adc \$0x0, $C5 - - # A1 x B3 - mov 24+$MB, %rax - mul $T1 - add %rax, $C4 - adc %rdx, $C5 - -___ - return $body; -} - -# Performs schoolbook multiplication of 128-bit with 256-bit -# number. Uses MULX, ADOX, ADCX instruction. -sub mulx128x256 { - my ($idxM0,$M0,$M1,$T0,$T1,$T2,$T3,$T4,$T5,$T6)=@_; - my ($MUL0,$MUL8)=map("$idxM0+$_($M0)", (0,8)); - my $body.=<<___; - xor %rax, %rax - mov $MUL0, %rdx - mulx 0+$M1, $T0, $T1 # T0 <- C0 - mulx 8+$M1, $T4, $T2 - mulx 16+$M1, $T5, $T3 - - adox $T4, $T1 # T1: interm1 - adox $T5, $T2 # T2: interm2 - - mulx 24+$M1, $T5, $T4 - adox $T5, $T3 # T3: interm3 - adox %rax, $T4 # T4: interm4 - - xor %rax, %rax - mov $MUL8, %rdx - mulx 0+$M1, $T5, $T6 - adcx $T5, $T1 # T1 <- C1 - adcx $T6, $T2 - - mulx 8+$M1, $T6, $T5 - adcx $T5, $T3 - adox $T6, $T2 # T2 <- C2 - - mulx 16+$M1, $T6, $T5 - adcx $T5, $T4 - adox $T6, $T3 # T3 <- C3 - - mulx 24+$M1, $T6, $T5 - adcx %rax, $T5 - adox $T6, $T4 # T4 <- C4 - adox %rax, $T5 # T5 <- C5 -___ - return $body; -} - -# Compute z = x + y (mod p). 
-# Operation: c [rdx] = a [rdi] + b [rsi] -$code.=<<___; -.globl ${PREFIX}_fpadd -.type ${PREFIX}_fpadd,\@function,3 -${PREFIX}_fpadd: -.cfi_startproc - push %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset r12, -16 - push %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset r13, -24 - push %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset r14, -32 - - xor %rax, %rax - - mov 0x0(%rdi), %r8 - add 0x0(%rsi), %r8 - mov 0x8(%rdi), %r9 - adc 0x8(%rsi), %r9 - mov 0x10(%rdi), %r10 - adc 0x10(%rsi), %r10 - mov 0x18(%rdi), %r11 - adc 0x18(%rsi), %r11 - mov 0x20(%rdi), %r12 - adc 0x20(%rsi), %r12 - mov 0x28(%rdi), %r13 - adc 0x28(%rsi), %r13 - mov 0x30(%rdi), %r14 - adc 0x30(%rsi), %r14 - - mov .Lp434x2(%rip), %rcx - sub %rcx, %r8 - mov 0x8+.Lp434x2(%rip), %rcx - sbb %rcx, %r9 - sbb %rcx, %r10 - mov 0x10+.Lp434x2(%rip), %rcx - sbb %rcx, %r11 - mov 0x18+.Lp434x2(%rip), %rcx - sbb %rcx, %r12 - mov 0x20+.Lp434x2(%rip), %rcx - sbb %rcx, %r13 - mov 0x28+.Lp434x2(%rip), %rcx - sbb %rcx, %r14 - - sbb \$0, %rax - - mov .Lp434x2(%rip), %rdi - and %rax, %rdi - mov 0x8+.Lp434x2(%rip), %rsi - and %rax, %rsi - mov 0x10+.Lp434x2(%rip), %rcx - and %rax, %rcx - - add %rdi, %r8 - mov %r8, 0x0(%rdx) - adc %rsi, %r9 - mov %r9, 0x8(%rdx) - adc %rsi, %r10 - mov %r10, 0x10(%rdx) - adc %rcx, %r11 - mov %r11, 0x18(%rdx) - - setc %cl - mov 0x18+.Lp434x2(%rip), %r8 - and %rax, %r8 - mov 0x20+.Lp434x2(%rip), %r9 - and %rax, %r9 - mov 0x28+.Lp434x2(%rip), %r10 - and %rax, %r10 - bt \$0, %rcx - - adc %r8, %r12 - mov %r12, 0x20(%rdx) - adc %r9, %r13 - mov %r13, 0x28(%rdx) - adc %r10, %r14 - mov %r14, 0x30(%rdx) - - pop %r14 -.cfi_adjust_cfa_offset -8 - pop %r13 -.cfi_adjust_cfa_offset -8 - pop %r12 -.cfi_adjust_cfa_offset -8 - ret -.cfi_endproc -___ - -# Loads data to XMM0 and XMM1 and -# conditionaly swaps depending on XMM3 -sub cswap_block16() { - my $idx = shift; - $idx *= 16; - (" - movdqu $idx(%rdi), %xmm0 - movdqu $idx(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - pxor %xmm0, %xmm2 - pand %xmm3, %xmm2 - pxor %xmm2, %xmm0 - pxor %xmm2, %xmm1 - movdqu %xmm0, $idx(%rdi) - movdqu %xmm1, $idx(%rsi) - "); -} - -# Conditionally swaps bits in x and y in constant time. -# mask indicates bits to be swapped (set bits are swapped) -# Operation: [rdi] <-> [rsi] if rdx==1 -sub sike_cswap { - # P[0] with Q[0] - foreach ( 0.. 6){$BLOCKS.=eval "&cswap_block16($_)";} - # P[1] with Q[1] - foreach ( 7..13){$BLOCKS.=eval "&cswap_block16($_)";} - - my $body =<<___; -.globl ${PREFIX}_cswap_asm -.type ${PREFIX}_cswap_asm,\@function,3 -${PREFIX}_cswap_asm: - # Fill XMM3. After this step first half of XMM3 is - # just zeros and second half is whatever in RDX - mov %rdx, %xmm3 - - # Copy lower double word everywhere else. So that - # XMM3=RDX|RDX. As RDX has either all bits set - # or non result will be that XMM3 has also either - # all bits set or non of them. 
68 = 01000100b - pshufd \$68, %xmm3, %xmm3 - $BLOCKS - ret -___ - ($body) -} -$code.=&sike_cswap(); - - -# Field subtraction -# Operation: c [rdx] = a [rdi] - b [rsi] -$code.=<<___; -.globl ${PREFIX}_fpsub -.type ${PREFIX}_fpsub,\@function,3 -${PREFIX}_fpsub: -.cfi_startproc - push %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset r12, -16 - push %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset r13, -24 - push %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset r14, -32 - - xor %rax, %rax - - mov 0x0(%rdi), %r8 - sub 0x0(%rsi), %r8 - mov 0x8(%rdi), %r9 - sbb 0x8(%rsi), %r9 - mov 0x10(%rdi), %r10 - sbb 0x10(%rsi), %r10 - mov 0x18(%rdi), %r11 - sbb 0x18(%rsi), %r11 - mov 0x20(%rdi), %r12 - sbb 0x20(%rsi), %r12 - mov 0x28(%rdi), %r13 - sbb 0x28(%rsi), %r13 - mov 0x30(%rdi), %r14 - sbb 0x30(%rsi), %r14 - - sbb \$0x0, %rax - - mov .Lp434x2(%rip), %rdi - and %rax, %rdi - mov 0x08+.Lp434x2(%rip), %rsi - and %rax, %rsi - mov 0x10+.Lp434x2(%rip), %rcx - and %rax, %rcx - - add %rdi, %r8 - mov %r8, 0x0(%rdx) - adc %rsi, %r9 - mov %r9, 0x8(%rdx) - adc %rsi, %r10 - mov %r10, 0x10(%rdx) - adc %rcx, %r11 - mov %r11, 0x18(%rdx) - - setc %cl - mov 0x18+.Lp434x2(%rip), %r8 - and %rax, %r8 - mov 0x20+.Lp434x2(%rip), %r9 - and %rax, %r9 - mov 0x28+.Lp434x2(%rip), %r10 - and %rax, %r10 - bt \$0x0, %rcx - - adc %r8, %r12 - adc %r9, %r13 - adc %r10, %r14 - mov %r12, 0x20(%rdx) - mov %r13, 0x28(%rdx) - mov %r14, 0x30(%rdx) - - pop %r14 -.cfi_adjust_cfa_offset -8 - pop %r13 -.cfi_adjust_cfa_offset -8 - pop %r12 -.cfi_adjust_cfa_offset -8 - ret -.cfi_endproc -___ - -# 434-bit multiprecision addition -# Operation: c [rdx] = a [rdi] + b [rsi] -$code.=<<___; -.globl ${PREFIX}_mpadd_asm -.type ${PREFIX}_mpadd_asm,\@function,3 -${PREFIX}_mpadd_asm: -.cfi_startproc - mov 0x0(%rdi), %r8; - mov 0x8(%rdi), %r9 - mov 0x10(%rdi), %r10 - mov 0x18(%rdi), %r11 - mov 0x20(%rdi), %rcx - add 0x0(%rsi), %r8 - adc 0x8(%rsi), %r9 - adc 0x10(%rsi), %r10 - adc 0x18(%rsi), %r11 - adc 0x20(%rsi), %rcx - mov %r8, 0x0(%rdx) - mov %r9, 0x8(%rdx) - mov %r10, 0x10(%rdx) - mov %r11, 0x18(%rdx) - mov %rcx, 0x20(%rdx) - - mov 0x28(%rdi), %r8 - mov 0x30(%rdi), %r9 - adc 0x28(%rsi), %r8 - adc 0x30(%rsi), %r9 - mov %r8, 0x28(%rdx) - mov %r9, 0x30(%rdx) - ret -.cfi_endproc -___ - -# 2x434-bit multiprecision subtraction -# Operation: c [rdx] = a [rdi] - b [rsi]. 
-# Returns borrow mask -$code.=<<___; -.globl ${PREFIX}_mpsubx2_asm -.type ${PREFIX}_mpsubx2_asm,\@function,3 -${PREFIX}_mpsubx2_asm: -.cfi_startproc - xor %rax, %rax - - mov 0x0(%rdi), %r8 - mov 0x8(%rdi), %r9 - mov 0x10(%rdi), %r10 - mov 0x18(%rdi), %r11 - mov 0x20(%rdi), %rcx - sub 0x0(%rsi), %r8 - sbb 0x8(%rsi), %r9 - sbb 0x10(%rsi), %r10 - sbb 0x18(%rsi), %r11 - sbb 0x20(%rsi), %rcx - mov %r8, 0x0(%rdx) - mov %r9, 0x8(%rdx) - mov %r10, 0x10(%rdx) - mov %r11, 0x18(%rdx) - mov %rcx, 0x20(%rdx) - - mov 0x28(%rdi), %r8 - mov 0x30(%rdi), %r9 - mov 0x38(%rdi), %r10 - mov 0x40(%rdi), %r11 - mov 0x48(%rdi), %rcx - sbb 0x28(%rsi), %r8 - sbb 0x30(%rsi), %r9 - sbb 0x38(%rsi), %r10 - sbb 0x40(%rsi), %r11 - sbb 0x48(%rsi), %rcx - mov %r8, 0x28(%rdx) - mov %r9, 0x30(%rdx) - mov %r10, 0x38(%rdx) - mov %r11, 0x40(%rdx) - mov %rcx, 0x48(%rdx) - - mov 0x50(%rdi), %r8 - mov 0x58(%rdi), %r9 - mov 0x60(%rdi), %r10 - mov 0x68(%rdi), %r11 - sbb 0x50(%rsi), %r8 - sbb 0x58(%rsi), %r9 - sbb 0x60(%rsi), %r10 - sbb 0x68(%rsi), %r11 - sbb \$0x0, %rax - mov %r8, 0x50(%rdx) - mov %r9, 0x58(%rdx) - mov %r10, 0x60(%rdx) - mov %r11, 0x68(%rdx) - ret -.cfi_endproc -___ - - -# Double 2x434-bit multiprecision subtraction -# Operation: c [rdx] = c [rdx] - a [rdi] - b [rsi] -$code.=<<___; -.globl ${PREFIX}_mpdblsubx2_asm -.type ${PREFIX}_mpdblsubx2_asm,\@function,3 -${PREFIX}_mpdblsubx2_asm: -.cfi_startproc - push %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset r12, -16 - push %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset r13, -24 - - xor %rax, %rax - - # ci:low = c:low - a:low - mov 0x0(%rdx), %r8 - mov 0x8(%rdx), %r9 - mov 0x10(%rdx), %r10 - mov 0x18(%rdx), %r11 - mov 0x20(%rdx), %r12 - mov 0x28(%rdx), %r13 - mov 0x30(%rdx), %rcx - sub 0x0(%rdi), %r8 - sbb 0x8(%rdi), %r9 - sbb 0x10(%rdi), %r10 - sbb 0x18(%rdi), %r11 - sbb 0x20(%rdi), %r12 - sbb 0x28(%rdi), %r13 - sbb 0x30(%rdi), %rcx - adc \$0x0, %rax - - # c:low = ci:low - b:low - sub 0x0(%rsi), %r8 - sbb 0x8(%rsi), %r9 - sbb 0x10(%rsi), %r10 - sbb 0x18(%rsi), %r11 - sbb 0x20(%rsi), %r12 - sbb 0x28(%rsi), %r13 - sbb 0x30(%rsi), %rcx - adc \$0x0, %rax - - # store c:low - mov %r8, 0x0(%rdx) - mov %r9, 0x8(%rdx) - mov %r10, 0x10(%rdx) - mov %r11, 0x18(%rdx) - mov %r12, 0x20(%rdx) - mov %r13, 0x28(%rdx) - mov %rcx, 0x30(%rdx) - - # ci:high = c:high - a:high - mov 0x38(%rdx), %r8 - mov 0x40(%rdx), %r9 - mov 0x48(%rdx), %r10 - mov 0x50(%rdx), %r11 - mov 0x58(%rdx), %r12 - mov 0x60(%rdx), %r13 - mov 0x68(%rdx), %rcx - - sub %rax, %r8 - sbb 0x38(%rdi), %r8 - sbb 0x40(%rdi), %r9 - sbb 0x48(%rdi), %r10 - sbb 0x50(%rdi), %r11 - sbb 0x58(%rdi), %r12 - sbb 0x60(%rdi), %r13 - sbb 0x68(%rdi), %rcx - - # c:high = ci:high - b:high - sub 0x38(%rsi), %r8 - sbb 0x40(%rsi), %r9 - sbb 0x48(%rsi), %r10 - sbb 0x50(%rsi), %r11 - sbb 0x58(%rsi), %r12 - sbb 0x60(%rsi), %r13 - sbb 0x68(%rsi), %rcx - - # store c:high - mov %r8, 0x38(%rdx) - mov %r9, 0x40(%rdx) - mov %r10, 0x48(%rdx) - mov %r11, 0x50(%rdx) - mov %r12, 0x58(%rdx) - mov %r13, 0x60(%rdx) - mov %rcx, 0x68(%rdx) - - pop %r13 -.cfi_adjust_cfa_offset -8 - pop %r12 -.cfi_adjust_cfa_offset -8 - ret -.cfi_endproc - -___ - -sub redc_common { - my ($mul01, $mul23, $mul45, $mul67)=@_; - my $body=<<___; - $mul01 - xor %rcx, %rcx - add 0x18(%rdi), %r8 - adc 0x20(%rdi), %r9 - adc 0x28(%rdi), %r10 - adc 0x30(%rdi), %r11 - adc 0x38(%rdi), %r12 - adc 0x40(%rdi), %r13 - adc 0x48(%rdi), %rcx - mov %r8, 0x18(%rdi) - mov %r9, 0x20(%rdi) - mov %r10, 0x28(%rdi) - mov %r11, 0x30(%rdi) - mov %r12, 0x38(%rdi) - mov %r13, 0x40(%rdi) - mov %rcx, 0x48(%rdi) - mov 
0x50(%rdi), %r8 - mov 0x58(%rdi), %r9 - mov 0x60(%rdi), %r10 - mov 0x68(%rdi), %r11 - adc \$0x0, %r8 - adc \$0x0, %r9 - adc \$0x0, %r10 - adc \$0x0, %r11 - mov %r8, 0x50(%rdi) - mov %r9, 0x58(%rdi) - mov %r10, 0x60(%rdi) - mov %r11, 0x68(%rdi) - - $mul23 - xor %rcx, %rcx - add 0x28(%rdi), %r8 - adc 0x30(%rdi), %r9 - adc 0x38(%rdi), %r10 - adc 0x40(%rdi), %r11 - adc 0x48(%rdi), %r12 - adc 0x50(%rdi), %r13 - adc 0x58(%rdi), %rcx - mov %r8, 0x28(%rdi) - mov %r9, 0x30(%rdi) - mov %r10, 0x38(%rdi) - mov %r11, 0x40(%rdi) - mov %r12, 0x48(%rdi) - mov %r13, 0x50(%rdi) - mov %rcx, 0x58(%rdi) - mov 0x60(%rdi), %r8 - mov 0x68(%rdi), %r9 - adc \$0x0, %r8 - adc \$0x0, %r9 - mov %r8, 0x60(%rdi) - mov %r9, 0x68(%rdi) - - $mul45 - xor %rcx, %rcx - add 0x38(%rdi), %r8 - adc 0x40(%rdi), %r9 - adc 0x48(%rdi), %r10 - adc 0x50(%rdi), %r11 - adc 0x58(%rdi), %r12 - adc 0x60(%rdi), %r13 - adc 0x68(%rdi), %rcx - mov %r8, 0x0(%rsi) # C0 - mov %r9, 0x8(%rsi) # C1 - mov %r10, 0x48(%rdi) - mov %r11, 0x50(%rdi) - mov %r12, 0x58(%rdi) - mov %r13, 0x60(%rdi) - mov %rcx, 0x68(%rdi) - - $mul67 - add 0x48(%rdi), %r8 - adc 0x50(%rdi), %r9 - adc 0x58(%rdi), %r10 - adc 0x60(%rdi), %r11 - adc 0x68(%rdi), %r12 - mov %r8, 0x10(%rsi) # C2 - mov %r9, 0x18(%rsi) # C3 - mov %r10, 0x20(%rsi) # C4 - mov %r11, 0x28(%rsi) # C5 - mov %r12, 0x30(%rsi) # C6 -___ - return $body; -} - -# Optimized Montgomery reduction for CPUs, based on method described -# in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015. -# Operation: c [rsi] = a [rdi] -# NOTE: a=c is not allowed -sub sike_rdc { - my $jump_redc_bdw=&alt_impl(".Lrdc_bdw") if ($bmi2_adx); - # a[0-1] x .Lp434p1 --> result: r8:r13 - my $mulx1=&mulx128x256( 0,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)),"%rcx"); - # a[2-3] x .Lp434p1 --> result: r8:r13 - my $mulx2=&mulx128x256(16,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)),"%rcx"); - # a[4-5] x .Lp434p1 --> result: r8:r13 - my $mulx3=&mulx128x256(32,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13)),"%rcx"); - # a[6-7] x .Lp434p1 --> result: r8:r13 - my $mulx4=&mulx64x256( 48,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13))); - - # a[0-1] x .Lp434p1 --> result: r8:r13 - my $mul1=&mul128x256( 0,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..14)),"%rcx"); - # a[2-3] x .Lp434p1 --> result: r8:r13 - my $mul2=&mul128x256(16,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..14)),"%rcx"); - # a[4-5] x .Lp434p1 --> result: r8:r13 - my $mul3=&mul128x256(32,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..14)),"%rcx"); - # a[6-7] x .Lp434p1 --> result: r8:r13 - my $mul4=&mul64x256( 48,"%rdi",".Lp434p1(%rip)",map("%r$_",(8..13))); - - my $redc_mul=&redc_common($mul1, $mul2, $mul3, $mul4); - my $redc_bdw=&redc_common($mulx1, $mulx2, $mulx3, $mulx4) if ($bmi2_adx); - - # REDC for Broadwell CPUs - my $code=<<___; - .Lrdc_bdw: - .cfi_startproc - # sike_fprdc has already pushed r12--15 by this point. 
- .cfi_adjust_cfa_offset 32 - .cfi_offset r12, -16 - .cfi_offset r13, -24 - .cfi_offset r14, -32 - .cfi_offset r15, -40 - - $redc_bdw - - pop %r15 - .cfi_adjust_cfa_offset -8 - .cfi_same_value r15 - pop %r14 - .cfi_adjust_cfa_offset -8 - .cfi_same_value r14 - pop %r13 - .cfi_adjust_cfa_offset -8 - .cfi_same_value r13 - pop %r12 - .cfi_adjust_cfa_offset -8 - .cfi_same_value r12 - ret - .cfi_endproc -___ - - # REDC for CPUs older than Broadwell - $code.=<<___; - .globl ${PREFIX}_fprdc - .type ${PREFIX}_fprdc,\@function,3 - ${PREFIX}_fprdc: - .cfi_startproc - push %r12 - .cfi_adjust_cfa_offset 8 - .cfi_offset r12, -16 - push %r13 - .cfi_adjust_cfa_offset 8 - .cfi_offset r13, -24 - push %r14 - .cfi_adjust_cfa_offset 8 - .cfi_offset r14, -32 - push %r15 - .cfi_adjust_cfa_offset 8 - .cfi_offset r15, -40 - - # Jump to optimized implementation if - # CPU supports ADCX/ADOX/MULX - $jump_redc_bdw - # Otherwise use generic implementation - $redc_mul - - pop %r15 - .cfi_adjust_cfa_offset -8 - pop %r14 - .cfi_adjust_cfa_offset -8 - pop %r13 - .cfi_adjust_cfa_offset -8 - pop %r12 - .cfi_adjust_cfa_offset -8 - ret - .cfi_endproc -___ - return $code; -} -$code.=&sike_rdc(); - -# 434-bit multiplication using Karatsuba (one level), -# schoolbook (one level). Uses MULX/ADOX/ADCX instructions -# available on Broadwell micro-architectures and newer. -sub mul_bdw { - # [rsp] <- (AH+AL) x (BH+BL) - my $mul256_low=&mul256(0,"%rsp",32,"%rsp",0,"%rsp",map("%r$_",(8..15)),"%rbx","%rbp"); - # [rcx] <- AL x BL - my $mul256_albl=&mul256(0,"%rdi",0,"%rsi",0,"%rcx",map("%r$_",(8..15)),"%rbx","%rbp"); - # [rcx+64] <- AH x BH - my $mul192_ahbh=&mul192(32,"%rdi",32,"%rsi",64,"%rcx",map("%r$_",(8..14))); - - $body=<<___; - - mov %rdx, %rcx - xor %rax, %rax - - # r8-r11 <- AH + AL, rax <- mask - mov 0x0(%rdi), %r8 - mov 0x8(%rdi), %r9 - mov 0x10(%rdi), %r10 - mov 0x18(%rdi), %r11 - - push %rbx - .cfi_adjust_cfa_offset 8 - .cfi_offset rbx, -48 - push %rbp - .cfi_offset rbp, -56 - .cfi_adjust_cfa_offset 8 - sub \$96, %rsp - .cfi_adjust_cfa_offset 96 - - add 0x20(%rdi), %r8 - adc 0x28(%rdi), %r9 - adc 0x30(%rdi), %r10 - adc \$0x0, %r11 - sbb \$0x0, %rax - mov %r8, 0x0(%rsp) - mov %r9, 0x8(%rsp) - mov %r10, 0x10(%rsp) - mov %r11, 0x18(%rsp) - - # r12-r15 <- BH + BL, rbx <- mask - xor %rbx, %rbx - mov 0x0(%rsi), %r12 - mov 0x8(%rsi), %r13 - mov 0x10(%rsi), %r14 - mov 0x18(%rsi), %r15 - add 0x20(%rsi), %r12 - adc 0x28(%rsi), %r13 - adc 0x30(%rsi), %r14 - adc \$0x0, %r15 - sbb \$0x0, %rbx - mov %r12, 0x20(%rsp) - mov %r13, 0x28(%rsp) - mov %r14, 0x30(%rsp) - mov %r15, 0x38(%rsp) - - # r12-r15 <- masked (BH + BL) - and %rax, %r12 - and %rax, %r13 - and %rax, %r14 - and %rax, %r15 - - # r8-r11 <- masked (AH + AL) - and %rbx, %r8 - and %rbx, %r9 - and %rbx, %r10 - and %rbx, %r11 - - # r8-r11 <- masked (AH + AL) + masked (BH + BL) - add %r12, %r8 - adc %r13, %r9 - adc %r14, %r10 - adc %r15, %r11 - mov %r8, 0x40(%rsp) - mov %r9, 0x48(%rsp) - mov %r10, 0x50(%rsp) - mov %r11, 0x58(%rsp) - - # [rsp] <- CM = (AH+AL) x (BH+BL) - $mul256_low - # [rcx] <- CL = AL x BL (Result c0-c3) - $mul256_albl - # [rcx+64] <- CH = AH x BH - $mul192_ahbh - - # r8-r11 <- (AH+AL) x (BH+BL), final step - mov 0x40(%rsp), %r8 - mov 0x48(%rsp), %r9 - mov 0x50(%rsp), %r10 - mov 0x58(%rsp), %r11 - - mov 0x20(%rsp), %rax - add %rax, %r8 - mov 0x28(%rsp), %rax - adc %rax, %r9 - mov 0x30(%rsp), %rax - adc %rax, %r10 - mov 0x38(%rsp), %rax - adc %rax, %r11 - - # [rsp], x3-x5 <- (AH+AL) x (BH+BL) - ALxBL - mov 0x0(%rsp), %r12 - mov 0x8(%rsp), %r13 - mov 0x10(%rsp), %r14 
- mov 0x18(%rsp), %r15 - sub 0x0(%rcx), %r12 - sbb 0x8(%rcx), %r13 - sbb 0x10(%rcx), %r14 - sbb 0x18(%rcx), %r15 - sbb 0x20(%rcx), %r8 - sbb 0x28(%rcx), %r9 - sbb 0x30(%rcx), %r10 - sbb 0x38(%rcx), %r11 - - # r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH - sub 0x40(%rcx), %r12 - sbb 0x48(%rcx), %r13 - sbb 0x50(%rcx), %r14 - sbb 0x58(%rcx), %r15 - sbb 0x60(%rcx), %r8 - sbb 0x68(%rcx), %r9 - sbb \$0x0, %r10 - sbb \$0x0, %r11 - - add 0x20(%rcx), %r12 - mov %r12, 0x20(%rcx) # Result C4-C7 - adc 0x28(%rcx), %r13 - mov %r13, 0x28(%rcx) - adc 0x30(%rcx), %r14 - mov %r14, 0x30(%rcx) - adc 0x38(%rcx), %r15 - mov %r15, 0x38(%rcx) - adc 0x40(%rcx), %r8 - mov %r8, 0x40(%rcx) # Result C8-C15 - adc 0x48(%rcx), %r9 - mov %r9, 0x48(%rcx) - adc 0x50(%rcx), %r10 - mov %r10, 0x50(%rcx) - adc 0x58(%rcx), %r11 - mov %r11, 0x58(%rcx) - mov 0x60(%rcx), %r12 - adc \$0x0, %r12 - mov %r12, 0x60(%rcx) - mov 0x68(%rcx), %r13 - adc \$0x0, %r13 - mov %r13, 0x68(%rcx) - - add \$96, %rsp - .cfi_adjust_cfa_offset -96 - pop %rbp - .cfi_adjust_cfa_offset -8 - .cfi_same_value rbp - pop %rbx - .cfi_adjust_cfa_offset -8 - .cfi_same_value rbx -___ - return $body; -} - -# 434-bit multiplication using Karatsuba (one level), -# schoolbook (one level). -sub mul { - my $code=<<___; - mov %rdx, %rcx - - sub \$112, %rsp # Allocating space in stack - .cfi_adjust_cfa_offset 112 - - # rcx[0-3] <- AH+AL - xor %rax, %rax - mov 0x20(%rdi), %r8 - mov 0x28(%rdi), %r9 - mov 0x30(%rdi), %r10 - xor %r11, %r11 - add 0x0(%rdi), %r8 - adc 0x8(%rdi), %r9 - adc 0x10(%rdi), %r10 - adc 0x18(%rdi), %r11 - # store AH+AL mask - sbb \$0, %rax - mov %rax, 0x40(%rsp) - # store AH+AL in 0-0x18(rcx) - mov %r8, 0x0(%rcx) - mov %r9, 0x8(%rcx) - mov %r10, 0x10(%rcx) - mov %r11, 0x18(%rcx) - - # r12-r15 <- BH+BL - xor %rdx, %rdx - mov 0x20(%rsi), %r12 - mov 0x28(%rsi), %r13 - mov 0x30(%rsi), %r14 - xor %r15, %r15 - add 0x0(%rsi), %r12 - adc 0x8(%rsi), %r13 - adc 0x10(%rsi), %r14 - adc 0x18(%rsi), %r15 - sbb \$0x0, %rdx - # store BH+BL mask - mov %rdx, 0x48(%rsp) - - # (rsp[0-0x38]) <- (AH+AL)*(BH+BL) - mov (%rcx), %rax - mul %r12 - mov %rax, (%rsp) # c0 - mov %rdx, %r8 - - xor %r9, %r9 - mov (%rcx), %rax - mul %r13 - add %rax, %r8 - adc %rdx, %r9 - - xor %r10, %r10 - mov 0x8(%rcx), %rax - mul %r12 - add %rax, %r8 - mov %r8, 0x8(%rsp) # c1 - adc %rdx, %r9 - adc \$0x0,%r10 - - xor %r8, %r8 - mov (%rcx), %rax - mul %r14 - add %rax, %r9 - adc %rdx, %r10 - adc \$0x0,%r8 - - mov 0x10(%rcx), %rax - mul %r12 - add %rax, %r9 - adc %rdx, %r10 - adc \$0x0,%r8 - - mov 0x8(%rcx), %rax - mul %r13 - add %rax, %r9 - mov %r9, 0x10(%rsp) # c2 - adc %rdx, %r10 - adc \$0x0, %r8 - - xor %r9, %r9 - mov (%rcx),%rax - mul %r15 - add %rax, %r10 - adc %rdx, %r8 - adc \$0x0,%r9 - - mov 0x18(%rcx), %rax - mul %r12 - add %rax, %r10 - adc %rdx, %r8 - adc \$0x0,%r9 - - mov 0x8(%rcx), %rax - mul %r14 - add %rax, %r10 - adc %rdx, %r8 - adc \$0x0,%r9 - - mov 0x10(%rcx), %rax - mul %r13 - add %rax, %r10 - mov %r10, 0x18(%rsp) # c3 - adc %rdx, %r8 - adc \$0x0, %r9 - - xor %r10, %r10 - mov 0x8(%rcx), %rax - mul %r15 - add %rax, %r8 - adc %rdx, %r9 - adc \$0x0,%r10 - - mov 0x18(%rcx), %rax - mul %r13 - add %rax, %r8 - adc %rdx, %r9 - adc \$0x0,%r10 - - mov 0x10(%rcx), %rax - mul %r14 - add %rax, %r8 # c4 - mov %r8, 0x20(%rsp) - adc %rdx, %r9 - adc \$0x0,%r10 - - xor %r11, %r11 - mov 0x10(%rcx), %rax - mul %r15 - add %rax, %r9 - adc %rdx, %r10 - adc \$0x0,%r11 - - mov 0x18(%rcx), %rax - mul %r14 - add %rax, %r9 # c5 - mov %r9, 0x28(%rsp) - adc %rdx, %r10 - adc \$0x0,%r11 - - mov 0x18(%rcx), %rax - mul 
%r15 - add %rax, %r10 # c6 - mov %r10, 0x30(%rsp) - adc %rdx, %r11 # c7 - mov %r11, 0x38(%rsp) - - # r12-r15 <- masked (BH + BL) - mov 0x40(%rsp), %rax - and %rax, %r12 - and %rax, %r13 - and %rax, %r14 - and %rax, %r15 - - # r8-r11 <- masked (AH + AL) - mov 0x48(%rsp),%rax - mov 0x00(rcx), %r8 - and %rax, %r8 - mov 0x08(rcx), %r9 - and %rax, %r9 - mov 0x10(rcx), %r10 - and %rax, %r10 - mov 0x18(rcx), %r11 - and %rax, %r11 - - # r12-r15 <- masked (AH + AL) + masked (BH + BL) - add %r8, %r12 - adc %r9, %r13 - adc %r10, %r14 - adc %r11, %r15 - - # rsp[0x20-0x38] <- (AH+AL) x (BH+BL) high - mov 0x20(%rsp), %rax - add %rax, %r12 - mov 0x28(%rsp), %rax - adc %rax, %r13 - mov 0x30(%rsp), %rax - adc %rax, %r14 - mov 0x38(%rsp), %rax - adc %rax, %r15 - mov %r12, 0x50(%rsp) - mov %r13, 0x58(%rsp) - mov %r14, 0x60(%rsp) - mov %r15, 0x68(%rsp) - - # [rcx] <- CL = AL x BL - mov (%rdi), %r11 - mov (%rsi), %rax - mul %r11 - xor %r9, %r9 - mov %rax, (%rcx) # c0 - mov %rdx, %r8 - - mov 0x10(%rdi), %r14 - mov 0x8(%rsi), %rax - mul %r11 - xor %r10, %r10 - add %rax, %r8 - adc %rdx, %r9 - - mov 0x8(%rdi), %r12 - mov (%rsi), %rax - mul %r12 - add %rax, %r8 - mov %r8, 0x8(%rcx) # c1 - adc %rdx, %r9 - adc \$0x0,%r10 - - xor %r8, %r8 - mov 0x10(%rsi), %rax - mul %r11 - add %rax, %r9 - adc %rdx, %r10 - adc \$0x0,%r8 - - mov (%rsi),%r13 - mov %r14, %rax - mul %r13 - add %rax, %r9 - adc %rdx, %r10 - adc \$0x0,%r8 - - mov 0x8(%rsi), %rax - mul %r12 - add %rax, %r9 - mov %r9, 0x10(%rcx) # c2 - adc %rdx, %r10 - adc \$0x0,%r8 - - xor %r9, %r9 - mov 0x18(%rsi), %rax - mul %r11 - mov 0x18(%rdi), %r15 - add %rax, %r10 - adc %rdx, %r8 - adc \$0x0,%r9 - - mov %r15, %rax - mul %r13 - add %rax, %r10 - adc %rdx, %r8 - adc \$0x0,%r9 - - mov 0x10(%rsi), %rax - mul %r12 - add %rax, %r10 - adc %rdx, %r8 - adc \$0x0,%r9 - - mov 0x8(%rsi), %rax - mul %r14 - add %rax, %r10 - mov %r10, 0x18(%rcx) # c3 - adc %rdx, %r8 - adc \$0x0,%r9 - - xor %r10, %r10 - mov 0x18(%rsi), %rax - mul %r12 - add %rax, %r8 - adc %rdx, %r9 - adc \$0x0,%r10 - - mov 0x8(%rsi), %rax - mul %r15 - add %rax, %r8 - adc %rdx, %r9 - adc \$0x0,%r10 - - mov 0x10(%rsi), %rax - mul %r14 - add %rax, %r8 - mov %r8, 0x20(%rcx) # c4 - adc %rdx, %r9 - adc \$0x0,%r10 - - xor %r8, %r8 - mov 0x18(%rsi), %rax - mul %r14 - add %rax, %r9 - adc %rdx, %r10 - adc \$0x0,%r8 - - mov 0x10(%rsi), %rax - mul %r15 - add %rax, %r9 - mov %r9, 0x28(%rcx) # c5 - adc %rdx, %r10 - adc \$0x0,%r8 - - mov 0x18(%rsi), %rax - mul %r15 - add %rax, %r10 - mov %r10, 0x30(%rcx) # c6 - adc %rdx, %r8 - mov %r8, 0x38(%rcx) # c7 - - # rcx[0x40-0x68] <- AH*BH - # multiplies 2 192-bit numbers A,B - mov 0x20(%rdi), %r11 - mov 0x20(%rsi), %rax - mul %r11 - xor %r9, %r9 - mov %rax, 0x40(%rcx) # c0 - mov %rdx, %r8 - - mov 0x30(%rdi), %r14 - mov 0x28(%rsi), %rax - mul %r11 - xor %r10, %r10 - add %rax, %r8 - adc %rdx, %r9 - - mov 0x28(%rdi), %r12 - mov 0x20(%rsi), %rax - mul %r12 - add %rax, %r8 - mov %r8, 0x48(%rcx) # c1 - adc %rdx, %r9 - adc \$0x0,%r10 - - xor %r8, %r8 - mov 0x30(%rsi), %rax - mul %r11 - add %rax, %r9 - adc %rdx, %r10 - adc \$0x0,%r8 - - mov 0x20(%rsi), %r13 - mov %r14, %rax - mul %r13 - add %rax, %r9 - adc %rdx, %r10 - adc \$0x0,%r8 - - mov 0x28(%rsi), %rax - mul %r12 - add %rax, %r9 - mov %r9, 0x50(%rcx) # c2 - adc %rdx, %r10 - adc \$0x0,%r8 - - mov 0x30(%rsi), %rax - mul %r12 - xor %r12, %r12 - add %rax, %r10 - adc %rdx, %r8 - adc \$0x0,%r12 - - mov 0x28(%rsi), %rax - mul %r14 - add %rax, %r10 - adc %rdx, %r8 - adc \$0x0,%r12 - mov %r10, 0x58(%rcx) # c3 - - mov 0x30(%rsi), %rax - mul %r14 - add 
%rax, %r8 - adc \$0x0,%r12 - mov %r8, 0x60(%rcx) # c4 - - add %r12, %rdx # c5 - - # [r8-r15] <- (AH+AL)x(BH+BL) - ALxBL - mov 0x0(%rsp), %r8 - sub 0x0(%rcx), %r8 - mov 0x8(%rsp), %r9 - sbb 0x8(%rcx), %r9 - mov 0x10(%rsp), %r10 - sbb 0x10(%rcx), %r10 - mov 0x18(%rsp), %r11 - sbb 0x18(%rcx), %r11 - mov 0x50(%rsp), %r12 - sbb 0x20(%rcx), %r12 - mov 0x58(%rsp), %r13 - sbb 0x28(%rcx), %r13 - mov 0x60(%rsp), %r14 - sbb 0x30(%rcx), %r14 - mov 0x68(%rsp), %r15 - sbb 0x38(%rcx), %r15 - - # [r8-r15] <- (AH+AL) x (BH+BL) - ALxBL - AHxBH - mov 0x40(%rcx), %rax - sub %rax, %r8 - mov 0x48(%rcx), %rax - sbb %rax, %r9 - mov 0x50(%rcx), %rax - sbb %rax, %r10 - mov 0x58(%rcx), %rax - sbb %rax, %r11 - mov 0x60(%rcx), %rax - sbb %rax, %r12 - sbb %rdx, %r13 - sbb \$0x0,%r14 - sbb \$0x0,%r15 - - # Final result - add 0x20(%rcx), %r8 - mov %r8, 0x20(%rcx) # Result C4-C7 - adc 0x28(%rcx), %r9 - mov %r9, 0x28(%rcx) - adc 0x30(%rcx), %r10 - mov %r10, 0x30(%rcx) - adc 0x38(%rcx), %r11 - mov %r11, 0x38(%rcx) - adc 0x40(%rcx), %r12 - mov %r12, 0x40(%rcx) # Result C8-C13 - adc 0x48(%rcx), %r13 - mov %r13, 0x48(%rcx) - adc 0x50(%rcx), %r14 - mov %r14, 0x50(%rcx) - adc 0x58(%rcx), %r15 - mov %r15, 0x58(%rcx) - mov 0x60(%rcx), %r12 - adc \$0x0, %r12 - mov %r12, 0x60(%rcx) - adc \$0x0, %rdx - mov %rdx, 0x68(%rcx) - - add \$112, %rsp # Restoring space in stack - .cfi_adjust_cfa_offset -112 -___ - - return $code; -} - -# Integer multiplication based on Karatsuba method -# Operation: c [rdx] = a [rdi] * b [rsi] -# NOTE: a=c or b=c are not allowed -sub sike_mul { - my $jump_mul_bdw=&alt_impl(".Lmul_bdw") if ($bmi2_adx); - # MUL for Broadwell CPUs - my $mul_bdw=&mul_bdw() if ($bmi2_adx); - # MUL for CPUs older than Broadwell - my $mul=&mul(); - - my $body=<<___; - .Lmul_bdw: - .cfi_startproc - # sike_mpmul has already pushed r12--15 by this point. - .cfi_adjust_cfa_offset 32 - .cfi_offset r12, -16 - .cfi_offset r13, -24 - .cfi_offset r14, -32 - .cfi_offset r15, -40 - - $mul_bdw - - pop %r15 - .cfi_adjust_cfa_offset -8 - .cfi_same_value r15 - pop %r14 - .cfi_adjust_cfa_offset -8 - .cfi_same_value r14 - pop %r13 - .cfi_adjust_cfa_offset -8 - .cfi_same_value r13 - pop %r12 - .cfi_adjust_cfa_offset -8 - .cfi_same_value r12 - ret - .cfi_endproc - - .globl ${PREFIX}_mpmul - .type ${PREFIX}_mpmul,\@function,3 - ${PREFIX}_mpmul: - .cfi_startproc - push %r12 - .cfi_adjust_cfa_offset 8 - .cfi_offset r12, -16 - push %r13 - .cfi_adjust_cfa_offset 8 - .cfi_offset r13, -24 - push %r14 - .cfi_adjust_cfa_offset 8 - .cfi_offset r14, -32 - push %r15 - .cfi_adjust_cfa_offset 8 - .cfi_offset r15, -40 - - # Jump to optimized implementation if - # CPU supports ADCX/ADOX/MULX - $jump_mul_bdw - # Otherwise use generic implementation - $mul - - pop %r15 - .cfi_adjust_cfa_offset -8 - pop %r14 - .cfi_adjust_cfa_offset -8 - pop %r13 - .cfi_adjust_cfa_offset -8 - pop %r12 - .cfi_adjust_cfa_offset -8 - ret - .cfi_endproc - -___ - return $body; -} - -$code.=&sike_mul(); - -foreach (split("\n",$code)) { - s/\`([^\`]*)\`/eval($1)/ge; - print $_,"\n"; -} - -close STDOUT; |
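
For reference, the constant-time conditional-swap idea documented above for sike_cswap_asm (turn the secret selector into an all-zeros or all-ones mask, then XOR-swap under that mask) can be sketched in portable C. This is an illustrative sketch only, not code from the deleted file: the limb count, the felm_t type, and the cswap name are assumptions, the real routine swaps whole point coordinates (fourteen 16-byte blocks) through SSE registers rather than a single field element, and in the assembly the caller already supplies the 64-bit mask in RDX (broadcast into XMM3 by pshufd), whereas the sketch derives the mask from a 0/1 bit.

    #include <stdint.h>

    #define NWORDS 7  /* assumed: 7 x 64-bit limbs hold one 434-bit element */

    typedef uint64_t felm_t[NWORDS];

    /* Swap p and q when bit == 1, leave them unchanged when bit == 0.
     * The mask is either all-ones or all-zeros, so the instruction flow
     * and memory accesses are identical for both values of the secret bit. */
    static void cswap(felm_t p, felm_t q, uint64_t bit)
    {
        uint64_t mask = 0 - bit;               /* 0x00...0 or 0xFF...F */
        for (int i = 0; i < NWORDS; i++) {
            uint64_t t = mask & (p[i] ^ q[i]);
            p[i] ^= t;
            q[i] ^= t;
        }
    }

The same branch-free masking pattern appears in sike_fpadd and sike_fpsub above, where the borrow produced by a trial subtraction of 2*p434 is stretched into a mask that decides, without any data-dependent branch, whether the modulus is added back.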