@@ -78,20 +78,10 @@ int cc20_crypt (unsigned char *out, const unsigned char *in, size_t in_len,
 
 #elif defined (__SSE2__) // SSE ----------------------------------------------------------
 
-// taken (and modified and enhanced) from
+// taken (and heavily modified and enhanced) from
 // https://github.com/Ginurx/chacha20-c (public domain)
 
 
-static void cc20_init_block(cc20_context_t *ctx, const uint8_t nonce[]) {
-
-    const uint8_t *magic_constant = (uint8_t*)"expand 32-byte k";
-
-    memcpy(&(ctx->state[ 0]), magic_constant, 16);
-    memcpy(&(ctx->state[ 4]), ctx->key, CC20_KEY_BYTES);
-    memcpy(&(ctx->state[12]), nonce, CC20_IV_SIZE);
-}
-
-
 #define SL  _mm_slli_epi32
 #define SR  _mm_srli_epi32
 #define XOR _mm_xor_si128
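Note on the removed cc20_init_block(): old and new code build the same standard
ChaCha20 state, a 4x4 matrix of 32-bit words; the new cc20_crypt() merely loads
its four rows straight into SSE registers instead of staging them in
ctx->state. A plain-C sketch of that layout, assuming this file's
cc20_context_t fields (state, key) and its 16-byte IV; the function name is
hypothetical, for illustration only:

// hypothetical reference sketch -- mirrors what cc20_init_block() did
static void cc20_state_layout_sketch (cc20_context_t *ctx, const uint8_t iv[16]) {

    memcpy(&(ctx->state[ 0]), "expand 32-byte k", 16); // row a: the magic constant
    memcpy(&(ctx->state[ 4]), ctx->key, 32);           // rows b, c: the 256-bit key
    memcpy(&(ctx->state[12]), iv, 16);                 // row d: the IV, whose first
                                                       // little endian word serves
                                                       // as the block counter
}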
@@ -99,15 +89,18 @@ static void cc20_init_block(cc20_context_t *ctx, const uint8_t nonce[]) {
 #define ADD _mm_add_epi32
 #define ROL(X,r) (XOR(SL(X,r),SR(X,(32-r))))
 
-#if defined (__SSE3__) // --- SSE3
+#define ONE _mm_setr_epi32(1, 0, 0, 0)
+#define TWO _mm_setr_epi32(2, 0, 0, 0)
+
+#if defined (__SSSE3__) // --- SSSE3
 #define L8  _mm_set_epi32(0x0e0d0c0fL, 0x0a09080bL, 0x06050407L, 0x02010003L)
 #define L16 _mm_set_epi32(0x0d0c0f0eL, 0x09080b0aL, 0x05040706L, 0x01000302L)
-#define ROL8(X)  ( _mm_shuffle_epi8(X, L8))  /* SSE 3 */
-#define ROL16(X) ( _mm_shuffle_epi8(X, L16)) /* SSE 3 */
-#else // --- regular SSE2 --------
+#define ROL8(X)  ( _mm_shuffle_epi8(X, L8))  /* SSSE 3 */
+#define ROL16(X) ( _mm_shuffle_epi8(X, L16)) /* SSSE 3 */
+#else // --- regular SSE2 ----------
 #define ROL8(X)  ROL(X,8)
 #define ROL16(X) ROL(X,16)
-#endif // ------------------------
+#endif // --------------------------
 
 #define CC20_PERMUTE_ROWS(A,B,C,D) \
     B = _mm_shuffle_epi32(B, _MM_SHUFFLE(0, 3, 2, 1)); \
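Note on the rotation macros: SSE2 offers no vector rotate, so ROL(X,r) emulates
one with two shifts and an XOR. The guard change from __SSE3__ to __SSSE3__ is
a correctness fix as well: _mm_shuffle_epi8 (pshufb), used by the fast ROL8 and
ROL16, is an SSSE3 instruction, not an SSE3 one. Where SSSE3 is available, the
byte-aligned rotates by 8 and 16 collapse into a single byte shuffle driven by
the L8/L16 masks. A scalar sketch of the same rotate identity, for reference:

#include <stdint.h>

// rotate left by shifting both ways and merging; caller keeps 0 < r < 32
static inline uint32_t rol32 (uint32_t x, unsigned int r) {

    return((x << r) | (x >> (32 - r)));
}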
@@ -135,116 +128,110 @@ static void cc20_init_block(cc20_context_t *ctx, const uint8_t nonce[]) {
         CC20_ODD_ROUND (A, B, C, D); \
         CC20_EVEN_ROUND(A, B, C, D)
 
 
-static void cc20_block_next(cc20_context_t *ctx) {
+#define STOREXOR(O,I,X) \
+    _mm_storeu_si128 ((__m128i*)O, \
+                      _mm_xor_si128 (_mm_loadu_si128((__m128i*)I), X)); \
+    I += 16; O += 16
 
-    uint32_t *counter = ctx->state + 12;
-    uint32_t cnt;
+int cc20_crypt (unsigned char *out, const unsigned char *in, size_t in_len,
+                const unsigned char *iv, cc20_context_t *ctx) {
 
-    __m128i a, b, c, d, k0, k1, k2, k3;
+    __m128i a, b, c, d, k0, k1, k2, k3, k4, k5, k6, k7;
 
-    a = _mm_loadu_si128 ((__m128i*)&(ctx->state[ 0]));
-    b = _mm_loadu_si128 ((__m128i*)&(ctx->state[ 4]));
-    c = _mm_loadu_si128 ((__m128i*)&(ctx->state[ 8]));
-    d = _mm_loadu_si128 ((__m128i*)&(ctx->state[12]));
+    uint8_t *keystream8 = (uint8_t*)ctx->keystream32;
 
-    k0 = a;
-    k1 = b;
-    k2 = c;
-    k3 = d;
+    const uint8_t *magic_constant = (uint8_t*)"expand 32-byte k";
 
-    // 10 double rounds
-    CC20_DOUBLE_ROUND(k0, k1, k2, k3);
-    CC20_DOUBLE_ROUND(k0, k1, k2, k3);
-    CC20_DOUBLE_ROUND(k0, k1, k2, k3);
-    CC20_DOUBLE_ROUND(k0, k1, k2, k3);
-    CC20_DOUBLE_ROUND(k0, k1, k2, k3);
-    CC20_DOUBLE_ROUND(k0, k1, k2, k3);
-    CC20_DOUBLE_ROUND(k0, k1, k2, k3);
-    CC20_DOUBLE_ROUND(k0, k1, k2, k3);
-    CC20_DOUBLE_ROUND(k0, k1, k2, k3);
-    CC20_DOUBLE_ROUND(k0, k1, k2, k3);
-
-    k0 = ADD(k0, a);
-    k1 = ADD(k1, b);
-    k2 = ADD(k2, c);
-    k3 = ADD(k3, d);
-
-    _mm_storeu_si128 ((__m128i*)&(ctx->keystream32[ 0]), k0);
-    _mm_storeu_si128 ((__m128i*)&(ctx->keystream32[ 4]), k1);
-    _mm_storeu_si128 ((__m128i*)&(ctx->keystream32[ 8]), k2);
-    _mm_storeu_si128 ((__m128i*)&(ctx->keystream32[12]), k3);
+    a = _mm_loadu_si128 ((__m128i*)magic_constant);
+    b = _mm_loadu_si128 ((__m128i*)(ctx->key));
+    c = _mm_loadu_si128 ((__m128i*)((ctx->key)+16));
+    d = _mm_loadu_si128 ((__m128i*)iv);
 
-    // increment counter, make sure it is and stays little endian in memory
-    cnt = le32toh(counter[0]);
-    counter[0] = htole32(++cnt);
-    if(0 == counter[0]) {
-        // wrap around occured, increment higher 32 bits of counter
-        // unlikely with 1,500 byte sized packets
-        cnt = le32toh(counter[1]);
-        counter[1] = htole32(++cnt);
-        if(0 == counter[1]) {
-            // very unlikely
-            cnt = le32toh(counter[2]);
-            counter[2] = htole32(++cnt);
-            if(0 == counter[2]) {
-                // extremely unlikely
-                cnt = le32toh(counter[3]);
-                counter[3] = htole32(++cnt);
-            }
-        }
-    }
-}
+    while (in_len >= 128) {
+
+        k0 = a; k1 = b; k2 = c; k3 = d;
+        k4 = a; k5 = b; k6 = c; k7 = ADD(d, ONE);
 
-static void cc20_init_context(cc20_context_t *ctx, const uint8_t *nonce) {
+        // 10 double rounds -- in parallel to make better use of all 8 SSE registers
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3); CC20_DOUBLE_ROUND(k4, k5, k6, k7);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3); CC20_DOUBLE_ROUND(k4, k5, k6, k7);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3); CC20_DOUBLE_ROUND(k4, k5, k6, k7);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3); CC20_DOUBLE_ROUND(k4, k5, k6, k7);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3); CC20_DOUBLE_ROUND(k4, k5, k6, k7);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3); CC20_DOUBLE_ROUND(k4, k5, k6, k7);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3); CC20_DOUBLE_ROUND(k4, k5, k6, k7);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3); CC20_DOUBLE_ROUND(k4, k5, k6, k7);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3); CC20_DOUBLE_ROUND(k4, k5, k6, k7);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3); CC20_DOUBLE_ROUND(k4, k5, k6, k7);
 
-    cc20_init_block(ctx, nonce);
-}
+        k0 = ADD(k0, a); k1 = ADD(k1, b); k2 = ADD(k2, c); k3 = ADD(k3, d);
+        k4 = ADD(k4, a); k5 = ADD(k5, b); k6 = ADD(k6, c); k7 = ADD(k7, d); k7 = ADD(k7, ONE);
 
+        STOREXOR(out, in, k0); STOREXOR(out, in, k1); STOREXOR(out, in, k2); STOREXOR(out, in, k3);
+        STOREXOR(out, in, k4); STOREXOR(out, in, k5); STOREXOR(out, in, k6); STOREXOR(out, in, k7);
 
-int cc20_crypt (unsigned char *out, const unsigned char *in, size_t in_len,
-                const unsigned char *iv, cc20_context_t *ctx) {
+        // increment counter, make sure it is and stays little endian in memory
+        d = ADD(d, TWO);
 
-    uint8_t *keystream8 = (uint8_t*)ctx->keystream32;
-    uint32_t * in_p = (uint32_t*)in;
-    uint32_t * out_p = (uint32_t*)out;
-    size_t tmp_len = in_len;
+        in_len -= 128;
+    }
 
-    cc20_init_context(ctx, iv);
+    if (in_len >= 64) {
 
-    while(in_len >= 64) {
+        k0 = a; k1 = b; k2 = c; k3 = d;
 
-        cc20_block_next(ctx);
+        // 10 double rounds
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+
+        k0 = ADD(k0, a); k1 = ADD(k1, b); k2 = ADD(k2, c); k3 = ADD(k3, d);
+
+        STOREXOR(out, in, k0); STOREXOR(out, in, k1); STOREXOR(out, in, k2); STOREXOR(out, in, k3);
+
+        // increment counter, make sure it is and stays little endian in memory
+        d = ADD(d, ONE);
 
-        *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 0]; in_p++; out_p++;
-        *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 1]; in_p++; out_p++;
-        *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 2]; in_p++; out_p++;
-        *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 3]; in_p++; out_p++;
-        *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 4]; in_p++; out_p++;
-        *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 5]; in_p++; out_p++;
-        *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 6]; in_p++; out_p++;
-        *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 7]; in_p++; out_p++;
-        *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 8]; in_p++; out_p++;
-        *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[ 9]; in_p++; out_p++;
-        *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[10]; in_p++; out_p++;
-        *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[11]; in_p++; out_p++;
-        *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[12]; in_p++; out_p++;
-        *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[13]; in_p++; out_p++;
-        *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[14]; in_p++; out_p++;
-        *(uint32_t*)out_p = *(uint32_t*)in_p ^ ctx->keystream32[15]; in_p++; out_p++;
 
         in_len -= 64;
     }
 
-    if(in_len > 0) {
+    if (in_len) {
 
-        cc20_block_next(ctx);
+        k0 = a; k1 = b; k2 = c; k3 = d;
 
-        tmp_len -= in_len;
+        // 10 double rounds
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+        CC20_DOUBLE_ROUND(k0, k1, k2, k3);
+
+        k0 = ADD(k0, a); k1 = ADD(k1, b); k2 = ADD(k2, c); k3 = ADD(k3, d);
+
+        _mm_storeu_si128 ((__m128i*)&(ctx->keystream32[ 0]), k0);
+        _mm_storeu_si128 ((__m128i*)&(ctx->keystream32[ 4]), k1);
+        _mm_storeu_si128 ((__m128i*)&(ctx->keystream32[ 8]), k2);
+        _mm_storeu_si128 ((__m128i*)&(ctx->keystream32[12]), k3);
 
+        // keep in mind that out and in got increased inside the last loop
+        // and point to current position now
         while(in_len > 0) {
-            out[tmp_len] = in[tmp_len] ^ keystream8[tmp_len%64];
-            tmp_len++;
             in_len--;
+            out[in_len] = in[in_len] ^ keystream8[in_len];
         }
     }
 }
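Note on the rewritten cc20_crypt(): instead of writing one 64-byte keystream
block at a time to ctx->keystream32 and XORing it word by word, the new code
keeps the state in registers a..d, computes two independent blocks per loop
iteration (k0..k3 and k4..k7, the latter started one counter ahead via
ADD(d, ONE)), and XORs the keystream straight against the input with
STOREXOR's load-xor-store. Interleaving the two round sequences gives the CPU
two independent dependency chains to overlap, and the counter now advances
entirely in-register: ADD(d, TWO) per 128 bytes, ADD(d, ONE) per 64. For
reference, a plain-C sketch of the work one block performs; the SSE macros do
the same quarter-rounds on whole rows at once, with CC20_PERMUTE_ROWS shuffles
standing in for the diagonal indexing below:

#include <stdint.h>
#include <string.h>

#define QR(a,b,c,d)                             \
    a += b; d ^= a; d = (d << 16) | (d >> 16);  \
    c += d; b ^= c; b = (b << 12) | (b >> 20);  \
    a += b; d ^= a; d = (d <<  8) | (d >> 24);  \
    c += d; b ^= c; b = (b <<  7) | (b >> 25);

// one 64-byte ChaCha20 keystream block, scalar view (reference only)
static void chacha20_block_sketch (uint32_t out[16], const uint32_t in[16]) {

    uint32_t x[16];
    int i;

    memcpy(x, in, sizeof(x));
    for(i = 0; i < 10; i++) {          // 10 double rounds = 20 rounds
        QR(x[0], x[4], x[ 8], x[12]);  // odd round works on columns
        QR(x[1], x[5], x[ 9], x[13]);
        QR(x[2], x[6], x[10], x[14]);
        QR(x[3], x[7], x[11], x[15]);
        QR(x[0], x[5], x[10], x[15]);  // even round works on diagonals
        QR(x[1], x[6], x[11], x[12]);
        QR(x[2], x[7], x[ 8], x[13]);
        QR(x[3], x[4], x[ 9], x[14]);
    }
    for(i = 0; i < 16; i++)
        out[i] = x[i] + in[i];         // feed-forward, cf. k0 = ADD(k0, a)
}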
@@ -286,7 +273,6 @@ static void cc20_init_block(cc20_context_t *ctx, const uint8_t nonce[]) {
 static void cc20_block_next(cc20_context_t *ctx) {
 
     uint32_t *counter = ctx->state + 12;
-    uint32_t c;
 
     ctx->keystream32[ 0] = ctx->state[ 0];
     ctx->keystream32[ 1] = ctx->state[ 1];
@@ -335,24 +321,7 @@ static void cc20_block_next(cc20_context_t *ctx) {
     ctx->keystream32[15] += ctx->state[15];
 
     // increment counter, make sure it is and stays little endian in memory
-    c = le32toh(counter[0]);
-    counter[0] = htole32(++c);
-    if(0 == counter[0]) {
-        // wrap around occured, increment higher 32 bits of counter
-        // unlikely with 1,500 byte sized packets
-        c = le32toh(counter[1]);
-        counter[1] = htole32(++c);
-        if(0 == counter[1]) {
-            // very unlikely
-            c = le32toh(counter[2]);
-            counter[2] = htole32(++c);
-            if(0 == counter[2]) {
-                // extremely unlikely
-                c = le32toh(counter[3]);
-                counter[3] = htole32(++c);
-            }
-        }
-    }
+    *counter = htole32(le32toh(*counter)+1);
 }
 
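Note on the simplified counter increment in the plain-C branch: the one-liner
keeps the counter little endian in memory on any host, which is what ChaCha20
requires. The removed carry cascade into counter[1..3] was effectively dead
code here: those words belong to the per-packet IV (state[12..15] hold the
16-byte IV), and a packet of at most about 1,500 bytes needs at most
ceil(1500 / 64) = 24 keystream blocks, so the low 32-bit word cannot wrap
within one packet. The same increment, unrolled for clarity into a
hypothetical helper, using the le32toh()/htole32() conversions this file
already relies on:

#include <stdint.h>
#include <endian.h>

// equivalent of *counter = htole32(le32toh(*counter)+1);
static void counter_increment_le (uint32_t *counter) {

    uint32_t host = le32toh(*counter); // little endian bytes -> host order
    host++;                            // do the math in host order
    *counter = htole32(host);          // host order -> little endian bytes
}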