summary refs log tree commit diff
path: root/src/crypto/poly1305/poly1305_vec.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/crypto/poly1305/poly1305_vec.c')
-rw-r--r-- src/crypto/poly1305/poly1305_vec.c 82
1 files changed, 41 insertions, 41 deletions
diff --git a/src/crypto/poly1305/poly1305_vec.c b/src/crypto/poly1305/poly1305_vec.c
index 3045a2f1..80eaa36d 100644
--- a/src/crypto/poly1305/poly1305_vec.c
+++ b/src/crypto/poly1305/poly1305_vec.c
@@ -12,11 +12,11 @@
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-/* This implementation of poly1305 is by Andrew Moon
- * (https://github.com/floodyberry/poly1305-donna) and released as public
- * domain. It implements SIMD vectorization based on the algorithm described in
- * http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
- * block size */
+// This implementation of poly1305 is by Andrew Moon
+// (https://github.com/floodyberry/poly1305-donna) and released as public
+// domain. It implements SIMD vectorization based on the algorithm described in
+// http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
+// block size
#include <openssl/poly1305.h>
@@ -69,14 +69,14 @@ typedef struct poly1305_state_internal_t {
poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
bytes of free storage */
union {
- xmmi H[5]; /* 80 bytes */
+ xmmi H[5]; // 80 bytes
uint64_t HH[10];
};
- /* uint64_t r0,r1,r2; [24 bytes] */
- /* uint64_t pad0,pad1; [16 bytes] */
- uint64_t started; /* 8 bytes */
- uint64_t leftover; /* 8 bytes */
- uint8_t buffer[64]; /* 64 bytes */
+ // uint64_t r0,r1,r2; [24 bytes]
+ // uint64_t pad0,pad1; [16 bytes]
+ uint64_t started; // 8 bytes
+ uint64_t leftover; // 8 bytes
+ uint8_t buffer[64]; // 64 bytes
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
alignment = 511 bytes raw */
@@ -85,7 +85,7 @@ static inline poly1305_state_internal *poly1305_aligned_state(
return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
}
-/* copy 0-63 bytes */
+// copy 0-63 bytes
static inline void
poly1305_block_copy(uint8_t *dst, const uint8_t *src, size_t bytes) {
size_t offset = src - dst;
@@ -117,7 +117,7 @@ poly1305_block_copy(uint8_t *dst, const uint8_t *src, size_t bytes) {
}
}
-/* zero 0-15 bytes */
+// zero 0-15 bytes
static inline void poly1305_block_zero(uint8_t *dst, size_t bytes) {
if (bytes & 8) {
*(uint64_t *)dst = 0;
@@ -146,7 +146,7 @@ void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
uint64_t r0, r1, r2;
uint64_t t0, t1;
- /* clamp key */
+ // clamp key
t0 = U8TO64_LE(key + 0);
t1 = U8TO64_LE(key + 8);
r0 = t0 & 0xffc0fffffff;
@@ -156,7 +156,7 @@ void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
t1 >>= 24;
r2 = t1 & 0x00ffffffc0f;
- /* store r in un-used space of st->P[1] */
+ // store r in un-used space of st->P[1]
p = &st->P[1];
p->R20.d[1] = (uint32_t)(r0);
p->R20.d[3] = (uint32_t)(r0 >> 32);
@@ -165,13 +165,13 @@ void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) {
p->R22.d[1] = (uint32_t)(r2);
p->R22.d[3] = (uint32_t)(r2 >> 32);
- /* store pad */
+ // store pad
p->R23.d[1] = U8TO32_LE(key + 16);
p->R23.d[3] = U8TO32_LE(key + 20);
p->R24.d[1] = U8TO32_LE(key + 24);
p->R24.d[3] = U8TO32_LE(key + 28);
- /* H = 0 */
+ // H = 0
st->H[0] = _mm_setzero_si128();
st->H[1] = _mm_setzero_si128();
st->H[2] = _mm_setzero_si128();
@@ -196,7 +196,7 @@ static void poly1305_first_block(poly1305_state_internal *st,
uint64_t c;
uint64_t i;
- /* pull out stored info */
+ // pull out stored info
p = &st->P[1];
r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
@@ -205,7 +205,7 @@ static void poly1305_first_block(poly1305_state_internal *st,
pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
- /* compute powers r^2,r^4 */
+ // compute powers r^2,r^4
r20 = r0;
r21 = r1;
r22 = r2;
@@ -249,7 +249,7 @@ static void poly1305_first_block(poly1305_state_internal *st,
p--;
}
- /* put saved info back */
+ // put saved info back
p = &st->P[1];
p->R20.d[1] = (uint32_t)(r0);
p->R20.d[3] = (uint32_t)(r0 >> 32);
@@ -262,7 +262,7 @@ static void poly1305_first_block(poly1305_state_internal *st,
p->R24.d[1] = (uint32_t)(pad1);
p->R24.d[3] = (uint32_t)(pad1 >> 32);
- /* H = [Mx,My] */
+ // H = [Mx,My]
T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
_mm_loadl_epi64((const xmmi *)(m + 16)));
T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
@@ -294,7 +294,7 @@ static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
H4 = st->H[4];
while (bytes >= 64) {
- /* H *= [r^4,r^4] */
+ // H *= [r^4,r^4]
p = &st->P[0];
T0 = _mm_mul_epu32(H0, p->R20.v);
T1 = _mm_mul_epu32(H0, p->R21.v);
@@ -342,7 +342,7 @@ static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
T5 = _mm_mul_epu32(H4, p->R20.v);
T4 = _mm_add_epi64(T4, T5);
- /* H += [Mx,My]*[r^2,r^2] */
+ // H += [Mx,My]*[r^2,r^2]
T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
_mm_loadl_epi64((const xmmi *)(m + 16)));
T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
@@ -406,7 +406,7 @@ static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
T5 = _mm_mul_epu32(M4, p->R20.v);
T4 = _mm_add_epi64(T4, T5);
- /* H += [Mx,My] */
+ // H += [Mx,My]
T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
_mm_loadl_epi64((const xmmi *)(m + 48)));
T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
@@ -424,7 +424,7 @@ static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
T3 = _mm_add_epi64(T3, M3);
T4 = _mm_add_epi64(T4, M4);
- /* reduce */
+ // reduce
C1 = _mm_srli_epi64(T0, 26);
C2 = _mm_srli_epi64(T3, 26);
T0 = _mm_and_si128(T0, MMASK);
@@ -447,7 +447,7 @@ static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
T3 = _mm_and_si128(T3, MMASK);
T4 = _mm_add_epi64(T4, C1);
- /* H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) */
+ // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My])
H0 = T0;
H1 = T1;
H2 = T2;
@@ -488,11 +488,11 @@ static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
H3 = st->H[3];
H4 = st->H[4];
- /* p = [r^2,r^2] */
+ // p = [r^2,r^2]
p = &st->P[1];
if (bytes >= 32) {
- /* H *= [r^2,r^2] */
+ // H *= [r^2,r^2]
T0 = _mm_mul_epu32(H0, p->R20.v);
T1 = _mm_mul_epu32(H0, p->R21.v);
T2 = _mm_mul_epu32(H0, p->R22.v);
@@ -539,7 +539,7 @@ static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
T5 = _mm_mul_epu32(H4, p->R20.v);
T4 = _mm_add_epi64(T4, T5);
- /* H += [Mx,My] */
+ // H += [Mx,My]
T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
_mm_loadl_epi64((const xmmi *)(m + 16)));
T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
@@ -557,7 +557,7 @@ static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
T3 = _mm_add_epi64(T3, M3);
T4 = _mm_add_epi64(T4, M4);
- /* reduce */
+ // reduce
C1 = _mm_srli_epi64(T0, 26);
C2 = _mm_srli_epi64(T3, 26);
T0 = _mm_and_si128(T0, MMASK);
@@ -580,7 +580,7 @@ static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
T3 = _mm_and_si128(T3, MMASK);
T4 = _mm_add_epi64(T4, C1);
- /* H = (H*[r^2,r^2] + [Mx,My]) */
+ // H = (H*[r^2,r^2] + [Mx,My])
H0 = T0;
H1 = T1;
H2 = T2;
@@ -590,7 +590,7 @@ static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
consumed = 32;
}
- /* finalize, H *= [r^2,r] */
+ // finalize, H *= [r^2,r]
r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
@@ -605,7 +605,7 @@ static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
p->S23.d[2] = p->R23.d[2] * 5;
p->S24.d[2] = p->R24.d[2] * 5;
- /* H *= [r^2,r] */
+ // H *= [r^2,r]
T0 = _mm_mul_epu32(H0, p->R20.v);
T1 = _mm_mul_epu32(H0, p->R21.v);
T2 = _mm_mul_epu32(H0, p->R22.v);
@@ -674,7 +674,7 @@ static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
T3 = _mm_and_si128(T3, MMASK);
T4 = _mm_add_epi64(T4, C1);
- /* H = H[0]+H[1] */
+ // H = H[0]+H[1]
H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
@@ -713,7 +713,7 @@ void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
poly1305_state_internal *st = poly1305_aligned_state(state);
size_t want;
- /* need at least 32 initial bytes to start the accelerated branch */
+ // need at least 32 initial bytes to start the accelerated branch
if (!st->started) {
if ((st->leftover == 0) && (bytes > 32)) {
poly1305_first_block(st, m);
@@ -734,7 +734,7 @@ void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
st->started = 1;
}
- /* handle leftover */
+ // handle leftover
if (st->leftover) {
want = poly1305_min(64 - st->leftover, bytes);
poly1305_block_copy(st->buffer + st->leftover, m, want);
@@ -748,7 +748,7 @@ void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m,
st->leftover = 0;
}
- /* process 64 byte blocks */
+ // process 64 byte blocks
if (bytes >= 64) {
want = (bytes & ~63);
poly1305_blocks(st, m, want);
@@ -779,7 +779,7 @@ void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) {
m += consumed;
}
- /* st->HH will either be 0 or have the combined result */
+ // st->HH will either be 0 or have the combined result
h0 = st->HH[0];
h1 = st->HH[1];
h2 = st->HH[2];
@@ -826,7 +826,7 @@ poly1305_donna_mul:
goto poly1305_donna_atleast16bytes;
}
-/* final bytes */
+// final bytes
poly1305_donna_atmost15bytes:
if (!leftover) {
goto poly1305_donna_finish;
@@ -870,7 +870,7 @@ poly1305_donna_finish:
h1 = (h1 & nc) | (g1 & c);
h2 = (h2 & nc) | (g2 & c);
- /* pad */
+ // pad
t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
h0 += (t0 & 0xfffffffffff);
@@ -887,4 +887,4 @@ poly1305_donna_finish:
U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24)));
}
-#endif /* !OPENSSL_WINDOWS && OPENSSL_X86_64 */
+#endif // !OPENSSL_WINDOWS && OPENSSL_X86_64