diff options
Diffstat (limited to 'src/crypto/poly1305/poly1305_vec.c')
-rw-r--r-- | src/crypto/poly1305/poly1305_vec.c | 82 |
1 file changed, 41 insertions, 41 deletions
diff --git a/src/crypto/poly1305/poly1305_vec.c b/src/crypto/poly1305/poly1305_vec.c index 3045a2f1..80eaa36d 100644 --- a/src/crypto/poly1305/poly1305_vec.c +++ b/src/crypto/poly1305/poly1305_vec.c @@ -12,11 +12,11 @@ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -/* This implementation of poly1305 is by Andrew Moon - * (https://github.com/floodyberry/poly1305-donna) and released as public - * domain. It implements SIMD vectorization based on the algorithm described in - * http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte - * block size */ +// This implementation of poly1305 is by Andrew Moon +// (https://github.com/floodyberry/poly1305-donna) and released as public +// domain. It implements SIMD vectorization based on the algorithm described in +// http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte +// block size #include <openssl/poly1305.h> @@ -69,14 +69,14 @@ typedef struct poly1305_state_internal_t { poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144 bytes of free storage */ union { - xmmi H[5]; /* 80 bytes */ + xmmi H[5]; // 80 bytes uint64_t HH[10]; }; - /* uint64_t r0,r1,r2; [24 bytes] */ - /* uint64_t pad0,pad1; [16 bytes] */ - uint64_t started; /* 8 bytes */ - uint64_t leftover; /* 8 bytes */ - uint8_t buffer[64]; /* 64 bytes */ + // uint64_t r0,r1,r2; [24 bytes] + // uint64_t pad0,pad1; [16 bytes] + uint64_t started; // 8 bytes + uint64_t leftover; // 8 bytes + uint8_t buffer[64]; // 64 bytes } poly1305_state_internal; /* 448 bytes total + 63 bytes for alignment = 511 bytes raw */ @@ -85,7 +85,7 @@ static inline poly1305_state_internal *poly1305_aligned_state( return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63); } -/* copy 0-63 bytes */ +// copy 0-63 bytes static inline void poly1305_block_copy(uint8_t *dst, const uint8_t *src, size_t bytes) { size_t offset = src - dst; @@ -117,7 +117,7 @@ 
poly1305_block_copy(uint8_t *dst, const uint8_t *src, size_t bytes) { } } -/* zero 0-15 bytes */ +// zero 0-15 bytes static inline void poly1305_block_zero(uint8_t *dst, size_t bytes) { if (bytes & 8) { *(uint64_t *)dst = 0; @@ -146,7 +146,7 @@ void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) { uint64_t r0, r1, r2; uint64_t t0, t1; - /* clamp key */ + // clamp key t0 = U8TO64_LE(key + 0); t1 = U8TO64_LE(key + 8); r0 = t0 & 0xffc0fffffff; @@ -156,7 +156,7 @@ void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) { t1 >>= 24; r2 = t1 & 0x00ffffffc0f; - /* store r in un-used space of st->P[1] */ + // store r in un-used space of st->P[1] p = &st->P[1]; p->R20.d[1] = (uint32_t)(r0); p->R20.d[3] = (uint32_t)(r0 >> 32); @@ -165,13 +165,13 @@ void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) { p->R22.d[1] = (uint32_t)(r2); p->R22.d[3] = (uint32_t)(r2 >> 32); - /* store pad */ + // store pad p->R23.d[1] = U8TO32_LE(key + 16); p->R23.d[3] = U8TO32_LE(key + 20); p->R24.d[1] = U8TO32_LE(key + 24); p->R24.d[3] = U8TO32_LE(key + 28); - /* H = 0 */ + // H = 0 st->H[0] = _mm_setzero_si128(); st->H[1] = _mm_setzero_si128(); st->H[2] = _mm_setzero_si128(); @@ -196,7 +196,7 @@ static void poly1305_first_block(poly1305_state_internal *st, uint64_t c; uint64_t i; - /* pull out stored info */ + // pull out stored info p = &st->P[1]; r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; @@ -205,7 +205,7 @@ static void poly1305_first_block(poly1305_state_internal *st, pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1]; pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1]; - /* compute powers r^2,r^4 */ + // compute powers r^2,r^4 r20 = r0; r21 = r1; r22 = r2; @@ -249,7 +249,7 @@ static void poly1305_first_block(poly1305_state_internal *st, p--; } - /* put saved info back */ + // put saved info back p = &st->P[1]; p->R20.d[1] = (uint32_t)(r0); p->R20.d[3] = (uint32_t)(r0 >> 32); @@ -262,7 +262,7 @@ static 
void poly1305_first_block(poly1305_state_internal *st, p->R24.d[1] = (uint32_t)(pad1); p->R24.d[3] = (uint32_t)(pad1 >> 32); - /* H = [Mx,My] */ + // H = [Mx,My] T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), _mm_loadl_epi64((const xmmi *)(m + 16))); T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), @@ -294,7 +294,7 @@ static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m, H4 = st->H[4]; while (bytes >= 64) { - /* H *= [r^4,r^4] */ + // H *= [r^4,r^4] p = &st->P[0]; T0 = _mm_mul_epu32(H0, p->R20.v); T1 = _mm_mul_epu32(H0, p->R21.v); @@ -342,7 +342,7 @@ static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m, T5 = _mm_mul_epu32(H4, p->R20.v); T4 = _mm_add_epi64(T4, T5); - /* H += [Mx,My]*[r^2,r^2] */ + // H += [Mx,My]*[r^2,r^2] T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), _mm_loadl_epi64((const xmmi *)(m + 16))); T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), @@ -406,7 +406,7 @@ static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m, T5 = _mm_mul_epu32(M4, p->R20.v); T4 = _mm_add_epi64(T4, T5); - /* H += [Mx,My] */ + // H += [Mx,My] T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)), _mm_loadl_epi64((const xmmi *)(m + 48))); T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)), @@ -424,7 +424,7 @@ static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m, T3 = _mm_add_epi64(T3, M3); T4 = _mm_add_epi64(T4, M4); - /* reduce */ + // reduce C1 = _mm_srli_epi64(T0, 26); C2 = _mm_srli_epi64(T3, 26); T0 = _mm_and_si128(T0, MMASK); @@ -447,7 +447,7 @@ static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m, T3 = _mm_and_si128(T3, MMASK); T4 = _mm_add_epi64(T4, C1); - /* H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) */ + // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) H0 = T0; H1 = T1; H2 = T2; @@ -488,11 +488,11 @@ static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t 
*m, H3 = st->H[3]; H4 = st->H[4]; - /* p = [r^2,r^2] */ + // p = [r^2,r^2] p = &st->P[1]; if (bytes >= 32) { - /* H *= [r^2,r^2] */ + // H *= [r^2,r^2] T0 = _mm_mul_epu32(H0, p->R20.v); T1 = _mm_mul_epu32(H0, p->R21.v); T2 = _mm_mul_epu32(H0, p->R22.v); @@ -539,7 +539,7 @@ static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m, T5 = _mm_mul_epu32(H4, p->R20.v); T4 = _mm_add_epi64(T4, T5); - /* H += [Mx,My] */ + // H += [Mx,My] T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), _mm_loadl_epi64((const xmmi *)(m + 16))); T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), @@ -557,7 +557,7 @@ static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m, T3 = _mm_add_epi64(T3, M3); T4 = _mm_add_epi64(T4, M4); - /* reduce */ + // reduce C1 = _mm_srli_epi64(T0, 26); C2 = _mm_srli_epi64(T3, 26); T0 = _mm_and_si128(T0, MMASK); @@ -580,7 +580,7 @@ static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m, T3 = _mm_and_si128(T3, MMASK); T4 = _mm_add_epi64(T4, C1); - /* H = (H*[r^2,r^2] + [Mx,My]) */ + // H = (H*[r^2,r^2] + [Mx,My]) H0 = T0; H1 = T1; H2 = T2; @@ -590,7 +590,7 @@ static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m, consumed = 32; } - /* finalize, H *= [r^2,r] */ + // finalize, H *= [r^2,r] r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; @@ -605,7 +605,7 @@ static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m, p->S23.d[2] = p->R23.d[2] * 5; p->S24.d[2] = p->R24.d[2] * 5; - /* H *= [r^2,r] */ + // H *= [r^2,r] T0 = _mm_mul_epu32(H0, p->R20.v); T1 = _mm_mul_epu32(H0, p->R21.v); T2 = _mm_mul_epu32(H0, p->R22.v); @@ -674,7 +674,7 @@ static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m, T3 = _mm_and_si128(T3, MMASK); T4 = _mm_add_epi64(T4, C1); - /* H = H[0]+H[1] */ + // H = H[0]+H[1] 
H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8)); H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8)); H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8)); @@ -713,7 +713,7 @@ void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m, poly1305_state_internal *st = poly1305_aligned_state(state); size_t want; - /* need at least 32 initial bytes to start the accelerated branch */ + // need at least 32 initial bytes to start the accelerated branch if (!st->started) { if ((st->leftover == 0) && (bytes > 32)) { poly1305_first_block(st, m); @@ -734,7 +734,7 @@ void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m, st->started = 1; } - /* handle leftover */ + // handle leftover if (st->leftover) { want = poly1305_min(64 - st->leftover, bytes); poly1305_block_copy(st->buffer + st->leftover, m, want); @@ -748,7 +748,7 @@ void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m, st->leftover = 0; } - /* process 64 byte blocks */ + // process 64 byte blocks if (bytes >= 64) { want = (bytes & ~63); poly1305_blocks(st, m, want); @@ -779,7 +779,7 @@ void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) { m += consumed; } - /* st->HH will either be 0 or have the combined result */ + // st->HH will either be 0 or have the combined result h0 = st->HH[0]; h1 = st->HH[1]; h2 = st->HH[2]; @@ -826,7 +826,7 @@ poly1305_donna_mul: goto poly1305_donna_atleast16bytes; } -/* final bytes */ +// final bytes poly1305_donna_atmost15bytes: if (!leftover) { goto poly1305_donna_finish; @@ -870,7 +870,7 @@ poly1305_donna_finish: h1 = (h1 & nc) | (g1 & c); h2 = (h2 & nc) | (g2 & c); - /* pad */ + // pad t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1]; t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1]; h0 += (t0 & 0xfffffffffff); @@ -887,4 +887,4 @@ poly1305_donna_finish: U64TO8_LE(mac + 8, ((h1 >> 20) | (h2 << 24))); } -#endif /* !OPENSSL_WINDOWS && OPENSSL_X86_64 */ +#endif // !OPENSSL_WINDOWS && OPENSSL_X86_64 |