From 6c982b937348d054079dcf15d1f3d2ed088d730e Mon Sep 17 00:00:00 2001 From: Logan007 Date: Wed, 9 Sep 2020 14:09:19 +0545 Subject: [PATCH] small cc20 sse speed-up --- include/cc20.h | 1 - src/cc20.c | 165 +++++++++++++++++++++---------------------------- 2 files changed, 72 insertions(+), 94 deletions(-) diff --git a/include/cc20.h b/include/cc20.h index 44ef07a..e886d8e 100644 --- a/include/cc20.h +++ b/include/cc20.h @@ -43,7 +43,6 @@ typedef struct cc20_context_t { typedef struct cc20_context { uint32_t keystream32[16]; - uint32_t state[16]; uint8_t key[CC20_KEY_BYTES]; } cc20_context_t; diff --git a/src/cc20.c b/src/cc20.c index 86177bd..c9a4125 100644 --- a/src/cc20.c +++ b/src/cc20.c @@ -78,20 +78,10 @@ int cc20_crypt (unsigned char *out, const unsigned char *in, size_t in_len, #elif defined (__SSE2__) // SSE ---------------------------------------------------------- -// taken (and modified and enhanced) from +// taken (and heavily modified and enhanced) from // https://github.com/Ginurx/chacha20-c (public domain) -static void cc20_init_block(cc20_context_t *ctx, const uint8_t nonce[]) { - - const uint8_t *magic_constant = (uint8_t*)"expand 32-byte k"; - - memcpy(&(ctx->state[ 0]), magic_constant, 16); - memcpy(&(ctx->state[ 4]), ctx->key, CC20_KEY_BYTES); - memcpy(&(ctx->state[12]), nonce, CC20_IV_SIZE); -} - - #define SL _mm_slli_epi32 #define SR _mm_srli_epi32 #define XOR _mm_xor_si128 @@ -99,15 +89,17 @@ static void cc20_init_block(cc20_context_t *ctx, const uint8_t nonce[]) { #define ADD _mm_add_epi32 #define ROL(X,r) (XOR(SL(X,r),SR(X,(32-r)))) +#define ONE _mm_setr_epi32(1, 0, 0, 0) + #if defined (__SSSE3__) // --- SSSE3 #define L8 _mm_set_epi32(0x0e0d0c0fL, 0x0a09080bL, 0x06050407L, 0x02010003L) #define L16 _mm_set_epi32(0x0d0c0f0eL, 0x09080b0aL, 0x05040706L, 0x01000302L) #define ROL8(X) ( _mm_shuffle_epi8(X, L8)) /* SSSE 3 */ #define ROL16(X) ( _mm_shuffle_epi8(X, L16)) /* SSSE 3 */ -#else // --- regular SSE2 
---------- #define ROL8(X) ROL(X,8) #define ROL16(X) ROL(X,16) -#endif // ------------------------- +#endif // -------------------------- #define CC20_PERMUTE_ROWS(A,B,C,D) \ B = _mm_shuffle_epi32(B, _MM_SHUFFLE(0, 3, 2, 1)); \ @@ -135,99 +127,86 @@ static void cc20_init_block(cc20_context_t *ctx, const uint8_t nonce[]) { CC20_ODD_ROUND (A, B, C, D); \ CC20_EVEN_ROUND(A, B, C, D) -static void cc20_block_next(cc20_context_t *ctx) { - - uint32_t *counter = ctx->state + 12; - - __m128i a, b, c, d, k0, k1, k2, k3; - - a = _mm_loadu_si128 ((__m128i*)&(ctx->state[ 0])); - b = _mm_loadu_si128 ((__m128i*)&(ctx->state[ 4])); - c = _mm_loadu_si128 ((__m128i*)&(ctx->state[ 8])); - d = _mm_loadu_si128 ((__m128i*)&(ctx->state[12])); - - k0 = a; - k1 = b; - k2 = c; - k3 = d; - - // 10 double rounds - CC20_DOUBLE_ROUND(k0, k1, k2, k3); - CC20_DOUBLE_ROUND(k0, k1, k2, k3); - CC20_DOUBLE_ROUND(k0, k1, k2, k3); - CC20_DOUBLE_ROUND(k0, k1, k2, k3); - CC20_DOUBLE_ROUND(k0, k1, k2, k3); - CC20_DOUBLE_ROUND(k0, k1, k2, k3); - CC20_DOUBLE_ROUND(k0, k1, k2, k3); - CC20_DOUBLE_ROUND(k0, k1, k2, k3); - CC20_DOUBLE_ROUND(k0, k1, k2, k3); - CC20_DOUBLE_ROUND(k0, k1, k2, k3); - - k0 = ADD(k0, a); - k1 = ADD(k1, b); - k2 = ADD(k2, c); - k3 = ADD(k3, d); - - _mm_storeu_si128 ((__m128i*)&(ctx->keystream32[ 0]), k0); - _mm_storeu_si128 ((__m128i*)&(ctx->keystream32[ 4]), k1); - _mm_storeu_si128 ((__m128i*)&(ctx->keystream32[ 8]), k2); - _mm_storeu_si128 ((__m128i*)&(ctx->keystream32[12]), k3); - - // increment counter, make sure it is and stays little endian in memory - *counter = htole32(le32toh(*counter)+1); - -} - - -static void cc20_init_context(cc20_context_t *ctx, const uint8_t *nonce) { - - cc20_init_block(ctx, nonce); -} - - int cc20_crypt (unsigned char *out, const unsigned char *in, size_t in_len, const unsigned char *iv, cc20_context_t *ctx) { - uint8_t *keystream8 = (uint8_t*)ctx->keystream32; - uint32_t * in_p = (uint32_t*)in; - uint32_t * out_p = (uint32_t*)out; - size_t tmp_len = 
in_len; + __m128i a, b, c, d, k0, k1, k2, k3; - cc20_init_context(ctx, iv); + uint8_t *keystream8 = (uint8_t*)ctx->keystream32; - while(in_len >= 64) { + const uint8_t *magic_constant = (uint8_t*)"expand 32-byte k"; - cc20_block_next(ctx); + a = _mm_loadu_si128 ((__m128i*)magic_constant); + b = _mm_loadu_si128 ((__m128i*)(ctx->key)); + c = _mm_loadu_si128 ( (__m128i*)((ctx->key)+16)); + d = _mm_loadu_si128 ((__m128i*)iv); + + while (in_len >= 64) { + + k0 = a; k1 = b; k2 = c; k3 = d; + + // 10 double rounds + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + + k0 = ADD(k0, a); k1 = ADD(k1, b); k2 = ADD(k2, c); k3 = ADD(k3, d); + + _mm_storeu_si128 ((__m128i*)out, + _mm_xor_si128 (_mm_loadu_si128((__m128i*)in), k0)); + in += 16; out += 16; + _mm_storeu_si128 ((__m128i*)out, + _mm_xor_si128 (_mm_loadu_si128((__m128i*)in), k1)); + in += 16; out += 16; + _mm_storeu_si128 ((__m128i*)out, + _mm_xor_si128 (_mm_loadu_si128((__m128i*)in), k2)); + in += 16; out += 16; + _mm_storeu_si128 ((__m128i*)out, + _mm_xor_si128 (_mm_loadu_si128((__m128i*)in), k3)); + in += 16; out += 16; + + // increment counter: ONE adds 1 to the lowest 32-bit lane of d, i.e. the first four iv bytes read as little endian + d = _mm_add_epi32(d, ONE); - *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 0]; in_p++; out_p++; - *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 1]; in_p++; out_p++; - *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 2]; in_p++; out_p++; - *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 3]; in_p++; out_p++; - *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 4]; in_p++; out_p++; - *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 5]; in_p++; out_p++; - 
*(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 6]; in_p++; out_p++; - *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 7]; in_p++; out_p++; - *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 8]; in_p++; out_p++; - *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 9]; in_p++; out_p++; - *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[10]; in_p++; out_p++; - *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[11]; in_p++; out_p++; - *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[12]; in_p++; out_p++; - *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[13]; in_p++; out_p++; - *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[14]; in_p++; out_p++; - *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[15]; in_p++; out_p++; in_len -= 64; } - if(in_len > 0) { - - cc20_block_next(ctx); - - tmp_len -= in_len; + if (in_len) { + k0 = a; k1 = b; k2 = c; k3 = d; + + // 10 double rounds + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + CC20_DOUBLE_ROUND(k0, k1, k2, k3); + + k0 = ADD(k0, a); k1 = ADD(k1, b); k2 = ADD(k2, c); k3 = ADD(k3, d); + + _mm_storeu_si128 ((__m128i*)&(ctx->keystream32[ 0]), k0); + _mm_storeu_si128 ((__m128i*)&(ctx->keystream32[ 4]), k1); + _mm_storeu_si128 ((__m128i*)&(ctx->keystream32[ 8]), k2); + _mm_storeu_si128 ((__m128i*)&(ctx->keystream32[12]), k3); + + // in and out were advanced past every full block by the loop above, so + // in_len (now below 64) indexes the remaining tail bytes and keystream8 alike while(in_len > 0) { - out[tmp_len] = in[tmp_len] ^ keystream8[tmp_len%64]; - tmp_len++; in_len--; + out[in_len] = in[in_len] ^ keystream8[in_len]; } + } }