diff options
Diffstat (limited to 'src/internal/chacha8rand/chacha8_arm64.s')
-rw-r--r-- | src/internal/chacha8rand/chacha8_arm64.s | 104 |
1 files changed, 104 insertions, 0 deletions
diff --git a/src/internal/chacha8rand/chacha8_arm64.s b/src/internal/chacha8rand/chacha8_arm64.s
new file mode 100644
index 0000000000..18e34dd148
--- /dev/null
+++ b/src/internal/chacha8rand/chacha8_arm64.s
@@ -0,0 +1,104 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// QR is the ChaCha quarter-round on A, B, C, and D: four add/xor/rotate steps whose
+// left rotations are 16 (VREV32 on halfwords), 12, 8, and 7. V30 is clobbered as the
+// VSHL/VSRI temporary, and V31 is assumed to hold the index table for rotate left 8.
+#define QR(A, B, C, D) \
+	VADD A.S4, B.S4, A.S4; VEOR D.B16, A.B16, D.B16; VREV32 D.H8, D.H8; \
+	VADD C.S4, D.S4, C.S4; VEOR B.B16, C.B16, V30.B16; VSHL $12, V30.S4, B.S4; VSRI $20, V30.S4, B.S4 \
+	VADD A.S4, B.S4, A.S4; VEOR D.B16, A.B16, D.B16; VTBL V31.B16, [D.B16], D.B16; \
+	VADD C.S4, D.S4, C.S4; VEOR B.B16, C.B16, V30.B16; VSHL $7, V30.S4, B.S4; VSRI $25, V30.S4, B.S4
+
+// block runs 4 ChaCha8 block transformations in the four stripes of the V registers.
+// All stripes share the same seed; stripe i uses counter+i. State word r of all four blocks is stored at blocks+16*r.
+// func block(seed *[8]uint32, blocks *[4][16]uint32, counter uint32)
+TEXT ·block<ABIInternal>(SB), NOSPLIT, $16
+	// seed in R0
+	// blocks in R1
+	// counter in R2
+
+	// Load initial constants into top row, each word replicated across all four lanes.
+	MOVD $·chachaConst(SB), R10
+	VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
+
+	// Load increment and rotate 8 constants into V30, V31.
+	MOVD $·chachaIncRot(SB), R11
+	VLD1 (R11), [V30.S4, V31.S4]
+	// Load the 8 seed words into V4-V11, each replicated across four lanes; R0 is post-incremented past the seed.
+	VLD4R.P 16(R0), [V4.S4, V5.S4, V6.S4, V7.S4]
+	VLD4R.P 16(R0), [V8.S4, V9.S4, V10.S4, V11.S4]
+
+	// Spill the 32-bit counter to the 16-byte frame so VLD1R can replicate it into every lane of V12.
+	MOVW R2, 0(RSP)
+	VLD1R 0(RSP), [V12.S4]
+
+	// Add 0, 1, 2, 3 to counter stripes. V30 still holds the increments here; QR overwrites it later.
+	VADD V30.S4, V12.S4, V12.S4
+
+	// Zeros for remaining three matrix entries.
+	VEOR V13.B16, V13.B16, V13.B16
+	VEOR V14.B16, V14.B16, V14.B16
+	VEOR V15.B16, V15.B16, V15.B16
+
+	// Save seed state for adding back later.
+	VMOV V4.B16, V20.B16
+	VMOV V5.B16, V21.B16
+	VMOV V6.B16, V22.B16
+	VMOV V7.B16, V23.B16
+	VMOV V8.B16, V24.B16
+	VMOV V9.B16, V25.B16
+	VMOV V10.B16, V26.B16
+	VMOV V11.B16, V27.B16
+
+	// 4 iterations. Each iteration is 8 quarter-rounds: 4 on the columns, then 4 on the diagonals.
+	MOVD $4, R0 // R0 is reused as the loop counter; the seed pointer is no longer needed
+loop:
+	QR(V0, V4, V8, V12)
+	QR(V1, V5, V9, V13)
+	QR(V2, V6, V10, V14)
+	QR(V3, V7, V11, V15)
+
+	QR(V0, V5, V10, V15)
+	QR(V1, V6, V11, V12)
+	QR(V2, V7, V8, V13)
+	QR(V3, V4, V9, V14)
+
+	SUB $1, R0
+	CBNZ R0, loop
+
+	// Add seed back. Only the seed rows V4-V11 are adjusted; V0-V3 and V12-V15 are stored as the rounds left them.
+	VADD V4.S4, V20.S4, V4.S4
+	VADD V5.S4, V21.S4, V5.S4
+	VADD V6.S4, V22.S4, V6.S4
+	VADD V7.S4, V23.S4, V7.S4
+	VADD V8.S4, V24.S4, V8.S4
+	VADD V9.S4, V25.S4, V9.S4
+	VADD V10.S4, V26.S4, V10.S4
+	VADD V11.S4, V27.S4, V11.S4
+
+	// Store interlaced blocks back to output buffer: 16 registers, 64 bytes per store, R1 post-incremented each time.
+	VST1.P [ V0.B16, V1.B16, V2.B16, V3.B16], 64(R1)
+	VST1.P [ V4.B16, V5.B16, V6.B16, V7.B16], 64(R1)
+	VST1.P [ V8.B16, V9.B16, V10.B16, V11.B16], 64(R1)
+	VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R1)
+	RET
+
+GLOBL ·chachaConst(SB), NOPTR|RODATA, $32 // NOTE(review): only 16 bytes are initialized below and VLD4R reads 16; confirm the declared 32 is intended
+DATA ·chachaConst+0x00(SB)/4, $0x61707865 // "expa" (bytes in little-endian order)
+DATA ·chachaConst+0x04(SB)/4, $0x3320646e // "nd 3"
+DATA ·chachaConst+0x08(SB)/4, $0x79622d32 // "2-by"
+DATA ·chachaConst+0x0c(SB)/4, $0x6b206574 // "te k"
+
+GLOBL ·chachaIncRot(SB), NOPTR|RODATA, $32 // first 16 bytes are loaded into V30, last 16 bytes into V31
+DATA ·chachaIncRot+0x00(SB)/4, $0x00000000 // per-stripe counter increments 0, 1, 2, 3
+DATA ·chachaIncRot+0x04(SB)/4, $0x00000001
+DATA ·chachaIncRot+0x08(SB)/4, $0x00000002
+DATA ·chachaIncRot+0x0c(SB)/4, $0x00000003
+DATA ·chachaIncRot+0x10(SB)/4, $0x02010003 // VTBL byte indices 3,0,1,2 (+4 per word): the rotate left 8 table QR expects in V31
+DATA ·chachaIncRot+0x14(SB)/4, $0x06050407
+DATA ·chachaIncRot+0x18(SB)/4, $0x0A09080B
+DATA ·chachaIncRot+0x1c(SB)/4, $0x0E0D0C0F |