Diffstat (limited to 'src/third_party/sike/asm/fp-armv8.pl')
-rw-r--r-- | src/third_party/sike/asm/fp-armv8.pl | 915
1 file changed, 915 insertions, 0 deletions
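The added file starts by emitting two read-only tables, .Lp434x2 (2·p434) and .Lp434p1 (p434 + 1), which the field routines below reload via adrp/add. As a quick sanity check of how those tables relate to the prime, here is a minimal Python sketch; it assumes p434 is the SIKEp434 prime 2^216 · 3^137 − 1 and mirrors the layout the assembly relies on: .Lp434p1 stores only the four non-zero high limbs of p + 1 (its low 192 bits are zero), and .Lp434x2 stores six limbs of 2·p434, with the all-ones middle limb reused by the code instead of being stored twice.

    # Hypothetical self-check of the rodata tables; the prime is assumed to be
    # the SIKEp434 prime p = 2^216 * 3^137 - 1.
    P434 = 2**216 * 3**137 - 1

    def words_to_int(words):
        """Little-endian 64-bit limbs -> integer."""
        return sum(w << (64 * i) for i, w in enumerate(words))

    # .Lp434p1: the four non-zero high limbs of p + 1, starting at limb index 3.
    P434P1_TABLE = [0xFDC1767AE3000000, 0x7BC65C783158AEA3,
                    0x6CFC5FD681C52056, 0x0002341F27177344]
    assert words_to_int(P434P1_TABLE) << (64 * 3) == P434 + 1

    # .Lp434x2: six stored limbs of 2*p; limb 2 equals limb 1 (all ones), so
    # sike_fpadd/fpsub reuse that register rather than loading a seventh limb.
    P434X2_TABLE = [0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF,
                    0xFB82ECF5C5FFFFFF, 0xF78CB8F062B15D47,
                    0xD9F8BFAD038A40AC, 0x0004683E4E2EE688]
    P434X2_FULL = P434X2_TABLE[:2] + [P434X2_TABLE[1]] + P434X2_TABLE[2:]
    assert words_to_int(P434X2_FULL) == 2 * P434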
diff --git a/src/third_party/sike/asm/fp-armv8.pl b/src/third_party/sike/asm/fp-armv8.pl new file mode 100644 index 00000000..ce19d809 --- /dev/null +++ b/src/third_party/sike/asm/fp-armv8.pl @@ -0,0 +1,915 @@ +#! /usr/bin/env perl +# +# April 2019 +# +# Abstract: field arithmetic in aarch64 assembly for SIDH/p434 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../crypto/perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +$PREFIX="sike"; + +$code.=<<___; +.section .rodata + +# p434 x 2 +.Lp434x2: + .quad 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF + .quad 0xFB82ECF5C5FFFFFF, 0xF78CB8F062B15D47 + .quad 0xD9F8BFAD038A40AC, 0x0004683E4E2EE688 + +# p434 + 1 +.Lp434p1: + .quad 0xFDC1767AE3000000, 0x7BC65C783158AEA3 + .quad 0x6CFC5FD681C52056, 0x0002341F27177344 + +.text +___ + +# Computes C0-C2 = A0 * (B0-B1) +# Inputs remain intact +sub mul64x128 { + my ($A0,$B0,$B1,$C0,$C1,$C2,$T0,$T1)=@_; + my $body=<<___; + mul $T1, $A0, $B0 + umulh $B0, $A0, $B0 + adds $C0, $C0, $C2 + adc $C1, $C1, xzr + + mul $T0, $A0, $B1 + umulh $B1, $A0, $B1 + adds $C0, $C0, $T1 + adcs $C1, $C1, $B0 + adc $C2, xzr, xzr + + adds $C1, $C1, $T0 + adc $C2, $C2, $B1 +___ + return $body; +} + +# Computes C0-C4 = A0 * (B0-B3) +# Inputs remain intact +sub mul64x256 { + my ($A0,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$T0,$T1,$T2)=@_; + my $body=<<___; + mul $C0, $A0, $B0 // C0 + umulh $T0, $A0, $B0 + + mul $C1, $A0, $B1 + umulh $T1, $A0, $B1 + adds $C1, $C1, $T0 // C1 + adc $T0, xzr, xzr + + mul $C2, $A0, $B2 + umulh $T2, $A0, $B2 + adds $T1, $T0, $T1 + adcs $C2, $C2, $T1 // C2 + adc $T0, xzr, xzr + + mul $C3, $A0, $B3 + umulh $C4, $A0, $B3 + adds $T2, $T0, $T2 + adcs $C3, $C3, $T2 // C3 + adc $C4, $C4, xzr // C4 +___ + return $body; +} + +# Computes C0-C4 = (A0-A1) * (B0-B3) +# Inputs remain intact +sub mul128x256 { + my ($A0,$A1,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_; + my $body=<<___; + mul $C0, $A0, $B0 // C0 + umulh $C3, $A0, $B0 + + mul $C1, $A0, $B1 + umulh $C2, $A0, $B1 + + mul $T0, $A1, $B0 + umulh $T1, $A1, $B0 + adds $C1, $C1, $C3 + adc $C2, $C2, xzr + + mul $T2, $A0, $B2 + umulh $T3, $A0, $B2 + adds $C1, $C1, $T0 // C1 + adcs $C2, $C2, $T1 + adc $C3, xzr, xzr + + mul $T0, $A1, $B1 + umulh $T1, $A1, $B1 + adds $C2, $C2, $T2 + adcs $C3, $C3, $T3 + adc $C4, xzr, xzr + + mul $T2, $A0, $B3 + umulh $T3, $A0, $B3 + adds $C2, $C2, $T0 // C2 + adcs $C3, $C3, $T1 + adc $C4, $C4, xzr + + mul $T0, $A1, $B2 + umulh $T1, $A1, $B2 + adds $C3, $C3, $T2 + adcs $C4, $C4, $T3 + adc $C5, xzr, xzr + + mul $T2, $A1, $B3 + umulh $T3, $A1, $B3 + adds $C3, $C3, $T0 // C3 + adcs $C4, $C4, $T1 + adc $C5, $C5, xzr + adds $C4, $C4, $T2 // C4 + adc $C5, $C5, $T3 // C5 + +___ + return $body; +} + +# Computes C0-C5 = (A0-A2) * (B0-B2) +# Inputs remain intact +sub mul192 { + my ($A0,$A1,$A2,$B0,$B1,$B2,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_; + my $body=<<___; + + // A0 * B0 + mul $C0, $A0, $B0 // C0 + umulh $C3, $A0, $B0 + + // A0 * B1 + mul $C1, $A0, $B1 + umulh $C2, $A0, $B1 + + // A1 * B0 + mul $T0, $A1, $B0 + umulh $T1, $A1, $B0 + adds $C1, $C1, $C3 + adc $C2, $C2, xzr + + // A0 * B2 + mul $T2, $A0, $B2 + umulh $T3, $A0, $B2 + adds $C1, $C1, $T0 // C1 + adcs $C2, $C2, $T1 + adc $C3, xzr, xzr + + // A2 * B0 + mul $T0, $A2, $B0 + umulh $C4, $A2, $B0 + adds $C2, $C2, 
$T2 + adcs $C3, $C3, $C4 + adc $C4, xzr, xzr + + // A1 * B1 + mul $T2, $A1, $B1 + umulh $T1, $A1, $B1 + adds $C2, $C2, $T0 + adcs $C3, $C3, $T3 + adc $C4, $C4, xzr + + // A1 * B2 + mul $T0, $A1, $B2 + umulh $T3, $A1, $B2 + adds $C2, $C2, $T2 // C2 + adcs $C3, $C3, $T1 + adc $C4, $C4, xzr + + // A2 * B1 + mul $T2, $A2, $B1 + umulh $T1, $A2, $B1 + adds $C3, $C3, $T0 + adcs $C4, $C4, $T3 + adc $C5, xzr, xzr + + // A2 * B2 + mul $T0, $A2, $B2 + umulh $T3, $A2, $B2 + adds $C3, $C3, $T2 // C3 + adcs $C4, $C4, $T1 + adc $C5, $C5, xzr + + adds $C4, $C4, $T0 // C4 + adc $C5, $C5, $T3 // C5 +___ + return $body; +} +sub mul256_karatsuba { + my ($M,$A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$C6,$C7,$T0,$T1)=@_; + # (AH+AL) x (BH+BL), low part + my $mul_low=&mul64x128($A1, $C6, $T1, $C3, $C4, $C5, $C7, $A0); + # AL x BL + my $mul_albl=&mul64x128($A1, $B0, $B1, $C1, $T1, $C7, $C6, $A0); + # AH x BH + my $mul_ahbh=&mul64x128($A3, $B2, $B3, $A1, $C6, $B0, $B1, $A2); + my $body=<<___; + // A0-A1 <- AH + AL, T0 <- mask + adds $A0, $A0, $A2 + adcs $A1, $A1, $A3 + adc $T0, xzr, xzr + + // C6, T1 <- BH + BL, C7 <- mask + adds $C6, $B0, $B2 + adcs $T1, $B1, $B3 + adc $C7, xzr, xzr + + // C0-C1 <- masked (BH + BL) + sub $C2, xzr, $T0 + sub $C3, xzr, $C7 + and $C0, $C6, $C2 + and $C1, $T1, $C2 + + // C4-C5 <- masked (AH + AL), T0 <- combined carry + and $C4, $A0, $C3 + and $C5, $A1, $C3 + mul $C2, $A0, $C6 + mul $C3, $A0, $T1 + and $T0, $T0, $C7 + + // C0-C1, T0 <- (AH+AL) x (BH+BL), part 1 + adds $C0, $C4, $C0 + umulh $C4, $A0, $T1 + adcs $C1, $C5, $C1 + umulh $C5, $A0, $C6 + adc $T0, $T0, xzr + + // C2-C5 <- (AH+AL) x (BH+BL), low part + $mul_low + ldp $A0, $A1, [$M,#0] + + // C2-C5, T0 <- (AH+AL) x (BH+BL), final part + adds $C4, $C0, $C4 + umulh $C7, $A0, $B0 + umulh $T1, $A0, $B1 + adcs $C5, $C1, $C5 + mul $C0, $A0, $B0 + mul $C1, $A0, $B1 + adc $T0, $T0, xzr + + // C0-C1, T1, C7 <- AL x BL + $mul_albl + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL + mul $A0, $A2, $B2 + umulh $B0, $A2, $B2 + subs $C2, $C2, $C0 + sbcs $C3, $C3, $C1 + sbcs $C4, $C4, $T1 + mul $A1, $A2, $B3 + umulh $C6, $A2, $B3 + sbcs $C5, $C5, $C7 + sbc $T0, $T0, xzr + + // A0, A1, C6, B0 <- AH x BH + $mul_ahbh + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + subs $C2, $C2, $A0 + sbcs $C3, $C3, $A1 + sbcs $C4, $C4, $C6 + sbcs $C5, $C5, $B0 + sbc $T0, $T0, xzr + + adds $C2, $C2, $T1 + adcs $C3, $C3, $C7 + adcs $C4, $C4, $A0 + adcs $C5, $C5, $A1 + adcs $C6, $T0, $C6 + adc $C7, $B0, xzr +___ + return $body; +} + +# 512-bit integer multiplication using Karatsuba (two levels), +# Comba (lower level). +# Operation: c [x2] = a [x0] * b [x1] +sub mul { + # (AH+AL) x (BH+BL), low part + my $mul_kc_low=&mul256_karatsuba( + "x2", # M0 + "x3","x4","x5","x6", # A0-A3 + "x10","x11","x12","x13", # B0-B3 + "x8","x9","x19","x20","x21","x22","x23","x24", # C0-C7 + "x25","x26"); # TMP + # AL x BL + my $mul_albl=&mul256_karatsuba( + "x0", # M0f + "x3","x4","x5","x6", # A0-A3 + "x10","x11","x12","x13", # B0-B3 + "x21","x22","x23","x24","x25","x26","x27","x28",# C0-C7 + "x8","x9"); # TMP + # AH x BH + my $mul_ahbh=&mul192( + "x3","x4","x5", # A0-A2 + "x10","x11","x12", # B0-B2 + "x21","x22","x23","x24","x25","x26", # C0-C5 + "x8","x9","x27","x28"); # TMP + + my $body=<<___; + .global ${PREFIX}_mpmul + .align 4 + ${PREFIX}_mpmul: + stp x29, x30, [sp,#-96]! 
+ add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + + ldp x3, x4, [x0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x10, x11, [x1,#0] + ldp x12, x13, [x1,#16] + ldp x14, x15, [x1,#32] + ldr x16, [x1,#48] + + // x3-x7 <- AH + AL, x7 <- carry + adds x3, x3, x7 + adcs x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, xzr + adc x7, xzr, xzr + + // x10-x13 <- BH + BL, x8 <- carry + adds x10, x10, x14 + adcs x11, x11, x15 + adcs x12, x12, x16 + adcs x13, x13, xzr + adc x8, xzr, xzr + + // x9 <- combined carry + and x9, x7, x8 + // x7-x8 <- mask + sub x7, xzr, x7 + sub x8, xzr, x8 + + // x15-x19 <- masked (BH + BL) + and x14, x10, x7 + and x15, x11, x7 + and x16, x12, x7 + and x17, x13, x7 + + // x20-x23 <- masked (AH + AL) + and x20, x3, x8 + and x21, x4, x8 + and x22, x5, x8 + and x23, x6, x8 + + // x15-x19, x7 <- masked (AH+AL) + masked (BH+BL), step 1 + adds x14, x14, x20 + adcs x15, x15, x21 + adcs x16, x16, x22 + adcs x17, x17, x23 + adc x7, x9, xzr + + // x8-x9,x19,x20-x24 <- (AH+AL) x (BH+BL), low part + stp x3, x4, [x2,#0] + $mul_kc_low + + // x15-x19, x7 <- (AH+AL) x (BH+BL), final step + adds x14, x14, x21 + adcs x15, x15, x22 + adcs x16, x16, x23 + adcs x17, x17, x24 + adc x7, x7, xzr + + // Load AL + ldp x3, x4, [x0] + ldp x5, x6, [x0,#16] + // Load BL + ldp x10, x11, [x1,#0] + ldp x12, x13, [x1,#16] + + // Temporarily store x8 in x2 + stp x8, x9, [x2,#0] + // x21-x28 <- AL x BL + $mul_albl + // Restore x8 + ldp x8, x9, [x2,#0] + + // x8-x10,x20,x15-x17,x19 <- maskd (AH+AL) x (BH+BL) - ALxBL + subs x8, x8, x21 + sbcs x9, x9, x22 + sbcs x19, x19, x23 + sbcs x20, x20, x24 + sbcs x14, x14, x25 + sbcs x15, x15, x26 + sbcs x16, x16, x27 + sbcs x17, x17, x28 + sbc x7, x7, xzr + + // Store ALxBL, low + stp x21, x22, [x2] + stp x23, x24, [x2,#16] + + // Load AH + ldp x3, x4, [x0,#32] + ldr x5, [x0,#48] + // Load BH + ldp x10, x11, [x1,#32] + ldr x12, [x1,#48] + + adds x8, x8, x25 + adcs x9, x9, x26 + adcs x19, x19, x27 + adcs x20, x20, x28 + adc x1, xzr, xzr + + add x0, x0, #32 + // Temporarily store x8,x9 in x2 + stp x8,x9, [x2,#32] + // x21-x28 <- AH x BH + $mul_ahbh + // Restore x8,x9 + ldp x8,x9, [x2,#32] + + neg x1, x1 + + // x8-x9,x19,x20,x14-x17 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + subs x8, x8, x21 + sbcs x9, x9, x22 + sbcs x19, x19, x23 + sbcs x20, x20, x24 + sbcs x14, x14, x25 + sbcs x15, x15, x26 + sbcs x16, x16, xzr + sbcs x17, x17, xzr + sbc x7, x7, xzr + + // Store (AH+AL) x (BH+BL) - ALxBL - AHxBH, low + stp x8, x9, [x2,#32] + stp x19, x20, [x2,#48] + + adds x1, x1, #1 + adcs x14, x14, x21 + adcs x15, x15, x22 + adcs x16, x16, x23 + adcs x17, x17, x24 + adcs x25, x7, x25 + adc x26, x26, xzr + + stp x14, x15, [x2,#64] + stp x16, x17, [x2,#80] + stp x25, x26, [x2,#96] + + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldp x29, x30, [sp],#96 + ret +___ + return $body; +} +$code.=&mul(); + +# Montgomery reduction +# Based on method described in Faz-Hernandez et al. 
https://eprint.iacr.org/2017/1015 +# Operation: mc [x1] = ma [x0] +# NOTE: ma=mc is not allowed +sub rdc { + my $mul01=&mul128x256( + "x2","x3", # A0-A1 + "x23","x24","x25","x26", # B0-B3 + "x4","x5","x6","x7","x8","x9", # C0-C5 + "x10","x11","x27","x28"); # TMP + my $mul23=&mul128x256( + "x2","x10", # A0-A1 + "x23","x24","x25","x26", # B0-B3 + "x4","x5","x6","x7","x8","x9", # C0-C5 + "x0","x3","x27","x28"); # TMP + my $mul45=&mul128x256( + "x11","x12", # A0-A1 + "x23","x24","x25","x26", # B0-B3 + "x4","x5","x6","x7","x8","x9", # C0-C5 + "x10","x3","x27","x28"); # TMP + my $mul67=&mul64x256( + "x13", # A0 + "x23","x24","x25","x26", # B0-B3 + "x4","x5","x6","x7","x8", # C0-C4 + "x10","x27","x28"); # TMP + my $body=<<___; + .global ${PREFIX}_fprdc + .align 4 + ${PREFIX}_fprdc: + stp x29, x30, [sp, #-96]! + add x29, sp, xzr + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + + ldp x2, x3, [x0,#0] // a[0-1] + + // Load the prime constant + adrp x26, :pg_hi21:.Lp434p1 + add x26, x26, :lo12:.Lp434p1 + ldp x23, x24, [x26, #0x0] + ldp x25, x26, [x26,#0x10] + + // a[0-1] * p434+1 + $mul01 + + ldp x10, x11, [x0, #0x18] + ldp x12, x13, [x0, #0x28] + ldp x14, x15, [x0, #0x38] + ldp x16, x17, [x0, #0x48] + ldp x19, x20, [x0, #0x58] + ldr x21, [x0, #0x68] + + adds x10, x10, x4 + adcs x11, x11, x5 + adcs x12, x12, x6 + adcs x13, x13, x7 + adcs x14, x14, x8 + adcs x15, x15, x9 + adcs x22, x16, xzr + adcs x17, x17, xzr + adcs x19, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + + ldr x2, [x0,#0x10] // a[2] + // a[2-3] * p434+1 + $mul23 + + adds x12, x12, x4 + adcs x13, x13, x5 + adcs x14, x14, x6 + adcs x15, x15, x7 + adcs x16, x22, x8 + adcs x17, x17, x9 + adcs x22, x19, xzr + adcs x20, x20, xzr + adc x21, x21, xzr + + $mul45 + adds x14, x14, x4 + adcs x15, x15, x5 + adcs x16, x16, x6 + adcs x17, x17, x7 + adcs x19, x22, x8 + adcs x20, x20, x9 + adc x22, x21, xzr + + stp x14, x15, [x1, #0x0] // C0, C1 + + $mul67 + adds x16, x16, x4 + adcs x17, x17, x5 + adcs x19, x19, x6 + adcs x20, x20, x7 + adc x21, x22, x8 + + str x16, [x1, #0x10] + stp x17, x19, [x1, #0x18] + stp x20, x21, [x1, #0x28] + + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldp x29, x30, [sp],#96 + ret +___ +} +$code.=&rdc(); + +# Field addition +# Operation: c [x2] = a [x0] + b [x1] +$code.=<<___; + .global ${PREFIX}_fpadd + .align 4 + ${PREFIX}_fpadd: + stp x29,x30, [sp,#-16]! 
+ add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + ldr x17, [x1,#48] + + // Add a + b + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + adcs x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, x17 + + // Subtract 2xp434 + adrp x17, :pg_hi21:.Lp434x2 + add x17, x17, :lo12:.Lp434x2 + ldp x11, x12, [x17, #0] + ldp x13, x14, [x17, #16] + ldp x15, x16, [x17, #32] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x12 + sbcs x6, x6, x13 + sbcs x7, x7, x14 + sbcs x8, x8, x15 + sbcs x9, x9, x16 + sbc x0, xzr, xzr // x0 can be reused now + + // Add 2xp434 anded with the mask in x0 + and x11, x11, x0 + and x12, x12, x0 + and x13, x13, x0 + and x14, x14, x0 + and x15, x15, x0 + and x16, x16, x0 + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x12 + adcs x6, x6, x13 + adcs x7, x7, x14 + adcs x8, x8, x15 + adc x9, x9, x16 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + str x9, [x2,#48] + + ldp x29, x30, [sp],#16 + ret +___ + +# Field subtraction +# Operation: c [x2] = a [x0] - b [x1] +$code.=<<___; + .global ${PREFIX}_fpsub + .align 4 + ${PREFIX}_fpsub: + stp x29, x30, [sp,#-16]! + add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + ldr x17, [x1,#48] + + // Subtract a - b + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + sbcs x9, x9, x17 + sbc x0, xzr, xzr + + // Add 2xp434 anded with the mask in x0 + adrp x17, :pg_hi21:.Lp434x2 + add x17, x17, :lo12:.Lp434x2 + + // First half + ldp x11, x12, [x17, #0] + ldp x13, x14, [x17, #16] + ldp x15, x16, [x17, #32] + + // Add 2xp434 anded with the mask in x0 + and x11, x11, x0 + and x12, x12, x0 + and x13, x13, x0 + and x14, x14, x0 + and x15, x15, x0 + and x16, x16, x0 + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x12 + adcs x6, x6, x13 + adcs x7, x7, x14 + adcs x8, x8, x15 + adc x9, x9, x16 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + str x9, [x2,#48] + + ldp x29, x30, [sp],#16 + ret +___ + +# 434-bit multiprecision addition +# Operation: c [x2] = a [x0] + b [x1] +$code.=<<___; + .global ${PREFIX}_mpadd_asm + .align 4 + ${PREFIX}_mpadd_asm: + stp x29, x30, [sp,#-16]! + add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldr x9, [x0,#48] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + ldr x17, [x1,#48] + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + adcs x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, x17 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + str x9, [x2,#48] + + ldp x29, x30, [sp],#16 + ret +___ + +# 2x434-bit multiprecision subtraction +# Operation: c [x2] = a [x0] - b [x1]. +# Returns borrow mask +$code.=<<___; + .global ${PREFIX}_mpsubx2_asm + .align 4 + ${PREFIX}_mpsubx2_asm: + stp x29, x30, [sp,#-16]! 
+ add x29, sp, #0 + + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x11, x12, [x1,#32] + ldp x13, x14, [x1,#48] + sbcs x7, x7, x11 + sbcs x8, x8, x12 + sbcs x9, x9, x13 + sbcs x10, x10, x14 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + + ldp x3, x4, [x0,#64] + ldp x5, x6, [x0,#80] + ldp x11, x12, [x1,#64] + ldp x13, x14, [x1,#80] + sbcs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + ldp x7, x8, [x0,#96] + ldp x11, x12, [x1,#96] + sbcs x7, x7, x11 + sbcs x8, x8, x12 + sbc x0, xzr, xzr + + stp x3, x4, [x2,#64] + stp x5, x6, [x2,#80] + stp x7, x8, [x2,#96] + + ldp x29, x30, [sp],#16 + ret +___ + + +# Double 2x434-bit multiprecision subtraction +# Operation: c [x2] = c [x2] - a [x0] - b [x1] +$code.=<<___; + .global ${PREFIX}_mpdblsubx2_asm + .align 4 + ${PREFIX}_mpdblsubx2_asm: + stp x29, x30, [sp, #-16]! + add x29, sp, #0 + + ldp x3, x4, [x2, #0] + ldp x5, x6, [x2,#16] + ldp x7, x8, [x2,#32] + + ldp x11, x12, [x0, #0] + ldp x13, x14, [x0,#16] + ldp x15, x16, [x0,#32] + + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + + // x9 stores carry + adc x9, xzr, xzr + + ldp x11, x12, [x1, #0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + adc x9, x9, xzr + + stp x3, x4, [x2, #0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + + ldp x3, x4, [x2,#48] + ldp x5, x6, [x2,#64] + ldp x7, x8, [x2,#80] + + ldp x11, x12, [x0,#48] + ldp x13, x14, [x0,#64] + ldp x15, x16, [x0,#80] + + // x9 = 2 - x9 + neg x9, x9 + add x9, x9, #2 + + subs x3, x3, x9 + sbcs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + adc x9, xzr, xzr + + ldp x11, x12, [x1,#48] + ldp x13, x14, [x1,#64] + ldp x15, x16, [x1,#80] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + sbcs x7, x7, x15 + sbcs x8, x8, x16 + adc x9, x9, xzr + + stp x3, x4, [x2,#48] + stp x5, x6, [x2,#64] + stp x7, x8, [x2,#80] + + ldp x3, x4, [x2,#96] + ldp x11, x12, [x0,#96] + ldp x13, x14, [x1,#96] + + // x9 = 2 - x9 + neg x9, x9 + add x9, x9, #2 + + subs x3, x3, x9 + sbcs x3, x3, x11 + sbcs x4, x4, x12 + subs x3, x3, x13 + sbc x4, x4, x14 + stp x3, x4, [x2,#96] + + ldp x29, x30, [sp],#16 + ret +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + print $_,"\n"; +} + +close STDOUT; |
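Two of the routines above are easier to follow with the word-level arithmetic written out. The reduction sike_fprdc exploits the shape of the prime (p ≡ −1 mod 2^192), so each Montgomery step's quotient is simply the low limbs of the accumulator, and the only multiplications needed are by the four stored limbs of (p + 1) >> 192 — which is why the body is built from mul128x256/mul64x256 calls. The additions sike_fpadd/fpsub keep results in [0, 2p) by subtracting (or skipping) 2·p434 and then adding it back under an all-ones mask produced from the final borrow (sbc x0, xzr, xzr), so the correction is branch-free. The sketch below is a hypothetical Python model of just that arithmetic, not of the register scheduling; the limb counts and the prime are assumed to match the SIKEp434 parameters, and the function names are mine.

    # Hypothetical models of the arithmetic in sike_fprdc and sike_fpadd.
    P434 = 2**216 * 3**137 - 1

    def fprdc_model(ma):
        """Montgomery reduction: returns a value congruent to ma * 2^-448 mod p.
        Because p == -1 mod 2^192, the per-step quotient is the low limbs
        themselves, matching the a[0-1], a[2-3], a[4-5], a[6] steps above."""
        a = ma
        for chunk_bits in (128, 128, 128, 64):    # 2 + 2 + 2 + 1 limbs = 448 bits
            m = a & ((1 << chunk_bits) - 1)       # quotient = low limbs
            a = (a + m * P434) >> chunk_bits      # low limbs cancel; drop them
        return a

    def fpadd_model(a, b):
        """Addition kept in [0, 2p): add, subtract 2p, then add 2p back under a
        mask derived from the borrow, as the masked adds in sike_fpadd do."""
        s = a + b - 2 * P434
        mask = 2 * P434 if s < 0 else 0           # asm: sbc x0, xzr, xzr
        return s + mask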