#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It
# implements the multiplication algorithm described in:
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
#
# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is
# that AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit
# NEON, the low and high halves of the 128-bit register q0 are accessible as
# 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of
# vN. Where the 32-bit version would use the upper half, this file must keep
# halves in separate registers.
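#
# For example, where 32-bit code reads the upper half of q0 simply by naming
# d1, AArch64 code must first move those bits into the low half of another
# register, as this file does with:
#
#     ins	v1.d[0], v0.d[1]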
#
# The other distinction is in syntax. 32-bit NEON embeds lane information in the
# instruction name, while AArch64 uses suffixes on the registers. For instance,
# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written:
#
#     vshl.i64 q0, q0, #1
#
# In 64-bit, it would be written:
#
#     shl v0.2d, v0.2d, #1
#
# See the Programmer's Guide for ARMv8-A, section 7, for details.
# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf
#
# Finally, note that the 8-bit and 64-bit polynomial multipliers in AArch64
# differ only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight pairs of
# 8-bit polynomials and is always available. pmull vR.1q, vA.1d, vB.1d
# multiplies one pair of 64-bit polynomials but requires the PMULL extension.
# This file emulates the latter with the former.

use strict;

my $flavour = shift;
my $output;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/;
    my $dir = $1;
    my $xlate;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3));	# argument block
my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4));
my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7));
# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers
# to spare.
my ($t0, $t1, $t2, $t3) = map("v$_", (16..19));
my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23));
my ($k48_k32, $k16_k0) = map("v$_", (24..25));

my $code = "";

# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b
# must be distinct from $t* and $k*. $t* are clobbered by the emitted code.
sub clmul64x64 {
my ($r, $a, $b) = @_;
$code .= <<___;
	ext	$t0.8b, $a.8b, $a.8b, #1	// A1
	pmull	$t0.8h, $t0.8b, $b.8b		// F = A1*B
	ext	$r.8b, $b.8b, $b.8b, #1		// B1
	pmull	$r.8h, $a.8b, $r.8b		// E = A*B1
	ext	$t1.8b, $a.8b, $a.8b, #2	// A2
	pmull	$t1.8h, $t1.8b, $b.8b		// H = A2*B
	ext	$t3.8b, $b.8b, $b.8b, #2	// B2
	pmull	$t3.8h, $a.8b, $t3.8b		// G = A*B2
	ext	$t2.8b, $a.8b, $a.8b, #3	// A3
	eor	$t0.16b, $t0.16b, $r.16b	// L = E + F
	pmull	$t2.8h, $t2.8b, $b.8b		// J = A3*B
	ext	$r.8b, $b.8b, $b.8b, #3		// B3
	eor	$t1.16b, $t1.16b, $t3.16b	// M = G + H
	pmull	$r.8h, $a.8b, $r.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	\$t0#lo, \$t0#lo, \$t0#hi	@ t0 = P0 + P1 (L)
	//     vand	\$t0#hi, \$t0#hi, \$k48
	//     veor	\$t0#lo, \$t0#lo, \$t0#hi
	//
	//     veor	\$t1#lo, \$t1#lo, \$t1#hi	@ t1 = P2 + P3 (M)
	//     vand	\$t1#hi, \$t1#hi, \$k32
	//     veor	\$t1#lo, \$t1#lo, \$t1#hi
	//
	//     veor	\$t2#lo, \$t2#lo, \$t2#hi	@ t2 = P4 + P5 (N)
	//     vand	\$t2#hi, \$t2#hi, \$k16
	//     veor	\$t2#lo, \$t2#lo, \$t2#hi
	//
	//     veor	\$t3#lo, \$t3#lo, \$t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	\$t3#hi, #0
	//
	// \$kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	$t3.8b, $b.8b, $b.8b, #4	// B4
	eor	$t2.16b, $t2.16b, $r.16b	// N = I + J
	pmull	$t3.8h, $a.8b, $t3.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
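	// zip1/zip2 interleave 64-bit lanes: t0l_t1l = {t0.lo, t1.lo},
	// t0h_t1h = {t0.hi, t1.hi}, and likewise for t2/t3. Each 128-bit
	// EOR or AND below then performs two of the four mask-and-fold
	// steps from the 32-bit version at once.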
	zip1	$t0l_t1l.2d, $t0.2d, $t1.2d
	zip1	$t2l_t3l.2d, $t2.2d, $t3.2d
	zip2	$t0h_t1h.2d, $t0.2d, $t1.2d
	zip2	$t2h_t3h.2d, $t2.2d, $t3.2d
	eor	$t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
	eor	$t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
	and	$t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b
	and	$t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b
	eor	$t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
	eor	$t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
	zip1	$t0.2d, $t0l_t1l.2d, $t0h_t1h.2d
	zip1	$t2.2d, $t2l_t3l.2d, $t2h_t3h.2d
	zip2	$t1.2d, $t0l_t1l.2d, $t0h_t1h.2d
	zip2	$t3.2d, $t2l_t3l.2d, $t2h_t3h.2d

	ext	$t0.16b, $t0.16b, $t0.16b, #15	// t0 = t0 << 8
	ext	$t1.16b, $t1.16b, $t1.16b, #14	// t1 = t1 << 16
	pmull	$r.8h, $a.8b, $b.8b		// D = A*B
	ext	$t3.16b, $t3.16b, $t3.16b, #12	// t3 = t3 << 32
	ext	$t2.16b, $t2.16b, $t2.16b, #13	// t2 = t2 << 24
	eor	$t0.16b, $t0.16b, $t1.16b
	eor	$t2.16b, $t2.16b, $t3.16b
	eor	$r.16b, $r.16b, $t0.16b
	eor	$r.16b, $r.16b, $t2.16b
___
}
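
# clmul64x64_ref is a plain-Perl reference for the pmull emulation above: a
# hypothetical helper, never called during code generation, for cross-checking
# the emitted sequence against test vectors. It assumes a perl built with
# 64-bit integers.
sub clmul64x64_ref {
    my ($a, $b) = @_;
    my ($lo, $hi) = (0, 0);
    for my $i (0 .. 63) {
        next unless ($b >> $i) & 1;             # XOR in a<<i for each set bit of b
        $lo ^= ($a << $i) & 0xffffffffffffffff; # low 64 bits of a<<i
        $hi ^= $a >> (64 - $i) if $i;           # high 64 bits of a<<i
    }
    return ($lo, $hi);                          # 128-bit carry-less product
}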

$code .= <<___;
.text

.global	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	// This function is adapted from gcm_init_v8. xC2 is t3.
	ld1	{$t1.2d}, [x1]			// load H
	movi	$t3.16b, #0xe1
	shl	$t3.2d, $t3.2d, #57		// 0xc2.0
	ext	$INlo.16b, $t1.16b, $t1.16b, #8
	ushr	$t2.2d, $t3.2d, #63
	dup	$t1.4s, $t1.s[1]
	ext	$t0.16b, $t2.16b, $t3.16b, #8	// t0=0xc2....01
	ushr	$t2.2d, $INlo.2d, #63
	sshr	$t1.4s, $t1.4s, #31		// broadcast carry bit
	and	$t2.16b, $t2.16b, $t0.16b
	shl	$INlo.2d, $INlo.2d, #1
	ext	$t2.16b, $t2.16b, $t2.16b, #8
	and	$t0.16b, $t0.16b, $t1.16b
	orr	$INlo.16b, $INlo.16b, $t2.16b	// H<<<=1
	eor	$Hlo.16b, $INlo.16b, $t0.16b	// twisted H
	st1	{$Hlo.2d}, [x0]			// store Htable[0]
	ret
.size	gcm_init_neon,.-gcm_init_neon

.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	ld1	{$INlo.16b}, [$Xi]		// load Xi
	ld1	{$Hlo.1d}, [$Htbl], #8		// load twisted H
	ld1	{$Hhi.1d}, [$Htbl]
	adrp	x9, :pg_hi21:.Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{$k48_k32.2d, $k16_k0.2d}, [x9]
	rev64	$INlo.16b, $INlo.16b		// byteswap Xi
	ext	$INlo.16b, $INlo.16b, $INlo.16b, #8
	eor	$Hhl.8b, $Hlo.8b, $Hhi.8b	// Karatsuba pre-processing

	mov	$len, #16
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	ld1	{$Xl.16b}, [$Xi]		// load Xi
	ld1	{$Hlo.1d}, [$Htbl], #8		// load twisted H
	ld1	{$Hhi.1d}, [$Htbl]
	adrp	x9, :pg_hi21:.Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{$k48_k32.2d, $k16_k0.2d}, [x9]
	rev64	$Xl.16b, $Xl.16b		// byteswap Xi
	ext	$Xl.16b, $Xl.16b, $Xl.16b, #8
	eor	$Hhl.8b, $Hlo.8b, $Hhi.8b	// Karatsuba pre-processing

.Loop_neon:
	ld1	{$INlo.16b}, [$inp], #16	// load inp
	rev64	$INlo.16b, $INlo.16b		// byteswap inp
	ext	$INlo.16b, $INlo.16b, $INlo.16b, #8
	eor	$INlo.16b, $INlo.16b, $Xl.16b	// inp ^= Xi

.Lgmult_neon:
	// Split the input into $INlo and $INhi. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	$INhi.d[0], $INlo.d[1]
___
&clmul64x64	($Xl, $Hlo, $INlo);		# H.lo·Xi.lo
$code .= <<___;
	eor	$INlo.8b, $INlo.8b, $INhi.8b	// Karatsuba pre-processing
___
&clmul64x64	($Xm, $Hhl, $INlo);		# (H.lo+H.hi)·(Xi.lo+Xi.hi)
&clmul64x64	($Xh, $Hhi, $INhi);		# H.hi·Xi.hi
$code .= <<___;
	ext	$t0.16b, $Xl.16b, $Xh.16b, #8
	eor	$Xm.16b, $Xm.16b, $Xl.16b	// Karatsuba post-processing
	eor	$Xm.16b, $Xm.16b, $Xh.16b
	eor	$Xm.16b, $Xm.16b, $t0.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	$Xl.d[1], $Xm.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	$Xh.d[0], $Xm.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
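	// GHASH reduces modulo x^128 + x^7 + x^2 + x + 1. In the
	// bit-reflected representation used here, that becomes left shifts
	// by 64-1, 64-2, and 64-7 in the first phase and right shifts by
	// 1, 2, and 7 in the second.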
	shl	$t1.2d, $Xl.2d, #57		// 1st phase
	shl	$t2.2d, $Xl.2d, #62
	eor	$t2.16b, $t2.16b, $t1.16b	//
	shl	$t1.2d, $Xl.2d, #63
	eor	$t2.16b, $t2.16b, $t1.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	$t2.16b, $t2.16b, $Xm.16b
	ins	$Xl.d[1], $t2.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	$Xh.d[0], $t2.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	$t2.2d, $Xl.2d, #1		// 2nd phase
	eor	$Xh.16b, $Xh.16b,$Xl.16b
	eor	$Xl.16b, $Xl.16b,$t2.16b	//
	ushr	$t2.2d, $t2.2d, #6
	ushr	$Xl.2d, $Xl.2d, #1		//
	eor	$Xl.16b, $Xl.16b, $Xh.16b	//
	eor	$Xl.16b, $Xl.16b, $t2.16b	//

	subs	$len, $len, #16
	bne	.Loop_neon

	rev64	$Xl.16b, $Xl.16b		// byteswap Xi and write
	ext	$Xl.16b, $Xl.16b, $Xl.16b, #8
	st1	{$Xl.16b}, [$Xi]

	ret
.size	gcm_ghash_neon,.-gcm_ghash_neon

.section	.rodata
.align	4
.Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
.asciz  "GHASH for ARMv8, derived from ARMv4 version by <appro\@openssl.org>"
.align  2
___

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush