From a409d611af33cbb6843f2e220b858f27b8ccb76f Mon Sep 17 00:00:00 2001 From: Logan007 Date: Wed, 3 Jun 2020 21:48:53 +0545 Subject: [PATCH] added SSE support --- speck.c | 279 ++++++++++++++++++++++++++++++++++++++-------- transform_speck.c | 10 +- 2 files changed, 235 insertions(+), 54 deletions(-) diff --git a/speck.c b/speck.c index abdb59e..fbda3a9 100644 --- a/speck.c +++ b/speck.c @@ -1,51 +1,232 @@ // cipher SPECK -- 128 bit block size -- 256 bit key size // taken from (and modified: removed pure crypto-stream generation and seperated key expansion) -// https://github.com/nsacyber/simon-speck-supercop/blob/master/crypto_stream/speck128256ctr/ref/stream.c +// https://github.com/nsacyber/simon-speck-supercop/blob/master/crypto_stream/speck128256ctr/ #include #include #include -// #define u64 unsigned long long -#define u64 uint64_t +#ifdef __SSE4_2__ // SSE support ---------------------------------------------------- -#define ROR64(x,r) (((x)>>(r))|((x)<<(64-(r)))) -#define ROL64(x,r) (((x)<<(r))|((x)>>(64-(r)))) -#define R(x,y,k) (x=ROR64(x,8), x+=y, x^=k, y=ROL64(y,3), y^=x) -#define RI(x,y,k) (y^=x, y=ROR64(y,3), x^=k, x-=y, x=ROL64(x,8)) +#include +#define u32 unsigned +#define u64 unsigned long long +#define u128 __m128i -static int speck_encrypt(u64 *u, u64 *v, u64 key[]) { +#define LCS(x,r) (((x)<>(64-r))) +#define RCS(x,r) (((x)>>r)|((x)<<(64-r))) - u64 i, x = *u, y = *v; +#define XOR _mm_xor_si128 +#define AND _mm_and_si128 +#define ADD _mm_add_epi64 +#define SL _mm_slli_epi64 +#define SR _mm_srli_epi64 - for (i = 0; i < 34; i++) - R (x, y, key[i]); +#define _q SET(0x1,0x0) +#define _two SET(0x2,0x2) - *u = x; *v = y; +#define SET _mm_set_epi64x +#define SET1(X,c) (X=SET(c,c)) +#define SET2(X,c) (X=SET(c,c), X=ADD(X,_q)) + +#define LOW _mm_unpacklo_epi64 +#define HIGH _mm_unpackhi_epi64 +#define LD(ip) _mm_loadu_si128((__m128i *)(ip)) +#define ST(ip,X) _mm_storeu_si128((__m128i *)(ip),X) +#define STORE(out,X,Y) (ST(out,LOW(Y,X)), ST(out+16,HIGH(Y,X))) +#define STORE_ALT(out,X,Y) (ST(out,LOW(X,Y)), ST(out+16,HIGH(X,Y))) +#define XOR_STORE(in,out,X,Y) (ST(out,XOR(LD(in),LOW(Y,X))), ST(out+16,XOR(LD(in+16),HIGH(Y,X)))) +#define XOR_STORE_ALT(in,out,X,Y) (ST(out,XOR(LD(in),LOW(X,Y))), ST(out+16,XOR(LD(in+16),HIGH(X,Y)))) + +#define SHFL _mm_shuffle_epi8 +#define R8 _mm_set_epi64x(0x080f0e0d0c0b0a09LL,0x0007060504030201LL) +#define L8 _mm_set_epi64x(0x0e0d0c0b0a09080fLL,0x0605040302010007LL) +#define ROL8(X) (SHFL(X,L8)) +#define ROR8(X) (SHFL(X,R8)) +#define ROL(X,r) (XOR(SL(X,r),SR(X,(64-r)))) +#define ROR(X,r) (XOR(SR(X,r),SL(X,(64-r)))) + +#define numrounds 34 +#define numkeywords 4 + +#define R(X,Y,k) (X=XOR(ADD(ROR8(X),Y),k), Y=XOR(ROL(Y,3),X)) + +#define Rx2(X,Y,k) (R(X[0],Y[0],k)) +#define Rx4(X,Y,k) (R(X[0],Y[0],k), R(X[1],Y[1],k)) +#define Rx6(X,Y,k) (R(X[0],Y[0],k), R(X[1],Y[1],k), R(X[2],Y[2],k)) + +#define Rx8(X,Y,k) (X[0]=ROR8(X[0]), X[0]=ADD(X[0],Y[0]), X[1]=ROR8(X[1]), X[1]=ADD(X[1],Y[1]), \ + X[2]=ROR8(X[2]), X[2]=ADD(X[2],Y[2]), X[3]=ROR8(X[3]), X[3]=ADD(X[3],Y[3]), \ + X[0]=XOR(X[0],k), X[1]=XOR(X[1],k), X[2]=XOR(X[2],k), X[3]=XOR(X[3],k), \ + Z[0]=Y[0], Z[1]=Y[1], Z[2]=Y[2], Z[3]=Y[3], \ + Z[0]=SL(Z[0],3), Y[0]=SR(Y[0],61), Z[1]=SL(Z[1],3), Y[1]=SR(Y[1],61), \ + Z[2]=SL(Z[2],3), Y[2]=SR(Y[2],61), Z[3]=SL(Z[3],3), Y[3]=SR(Y[3],61), \ + Y[0]=XOR(Y[0],Z[0]), Y[1]=XOR(Y[1],Z[1]), Y[2]=XOR(Y[2],Z[2]), Y[3]=XOR(Y[3],Z[3]), \ + Y[0]=XOR(X[0],Y[0]), Y[1]=XOR(X[1],Y[1]), Y[2]=XOR(X[2],Y[2]), Y[3]=XOR(X[3],Y[3])) + +#define Rx1(x,y,k) (x[0]=RCS(x[0],8), x[0]+=y[0], x[0]^=k, y[0]=LCS(y[0],3), y[0]^=x[0]) + +#define Rx1b(x,y,k) (x=RCS(x,8), x+=y, x^=k, y=LCS(y,3), y^=x) + +#define Encrypt(X,Y,k,n) (Rx##n(X,Y,k[0]), Rx##n(X,Y,k[1]), Rx##n(X,Y,k[2]), Rx##n(X,Y,k[3]), Rx##n(X,Y,k[4]), Rx##n(X,Y,k[5]), Rx##n(X,Y,k[6]), Rx##n(X,Y,k[7]), \ + Rx##n(X,Y,k[8]), Rx##n(X,Y,k[9]), Rx##n(X,Y,k[10]), Rx##n(X,Y,k[11]), Rx##n(X,Y,k[12]), Rx##n(X,Y,k[13]), Rx##n(X,Y,k[14]), Rx##n(X,Y,k[15]), \ + Rx##n(X,Y,k[16]), Rx##n(X,Y,k[17]), Rx##n(X,Y,k[18]), Rx##n(X,Y,k[19]), Rx##n(X,Y,k[20]), Rx##n(X,Y,k[21]), Rx##n(X,Y,k[22]), Rx##n(X,Y,k[23]), \ + Rx##n(X,Y,k[24]), Rx##n(X,Y,k[25]), Rx##n(X,Y,k[26]), Rx##n(X,Y,k[27]), Rx##n(X,Y,k[28]), Rx##n(X,Y,k[29]), Rx##n(X,Y,k[30]), Rx##n(X,Y,k[31]), \ + Rx##n(X,Y,k[32]), Rx##n(X,Y,k[33])) + +#define RK(X,Y,k,key,i) (SET1(k[i],Y), key[i]=Y, X=RCS(X,8), X+=Y, X^=i, Y=LCS(Y,3), Y^=X) + +#define EK(A,B,C,D,k,key) (RK(B,A,k,key,0), RK(C,A,k,key,1), RK(D,A,k,key,2), RK(B,A,k,key,3), RK(C,A,k,key,4), RK(D,A,k,key,5), RK(B,A,k,key,6), \ + RK(C,A,k,key,7), RK(D,A,k,key,8), RK(B,A,k,key,9), RK(C,A,k,key,10), RK(D,A,k,key,11), RK(B,A,k,key,12), RK(C,A,k,key,13), \ + RK(D,A,k,key,14), RK(B,A,k,key,15), RK(C,A,k,key,16), RK(D,A,k,key,17), RK(B,A,k,key,18), RK(C,A,k,key,19), RK(D,A,k,key,20), \ + RK(B,A,k,key,21), RK(C,A,k,key,22), RK(D,A,k,key,23), RK(B,A,k,key,24), RK(C,A,k,key,25), RK(D,A,k,key,26), RK(B,A,k,key,27), \ + RK(C,A,k,key,28), RK(D,A,k,key,29), RK(B,A,k,key,30), RK(C,A,k,key,31), RK(D,A,k,key,32), RK(B,A,k,key,33)) + +typedef struct { + u128 rk[34]; + u64 key[34]; +} speck_context_t; + + +static int speck_encrypt_xor (unsigned char *out, const unsigned char *in, u64 nonce[], speck_context_t ctx, int numbytes) { + + u64 x[2], y[2]; + u128 X[4], Y[4], Z[4]; + + if (numbytes == 16) { + x[0] = nonce[1]; y[0] = nonce[0]; nonce[0]++; + Encrypt (x, y, ctx.key, 1); + ((u64 *)out)[1] = x[0]; ((u64 *)out)[0] = y[0]; + return 0; + } + + SET1 (X[0], nonce[1]); SET2 (Y[0], nonce[0]); + + if (numbytes == 32) + Encrypt (X, Y, ctx.rk, 2); + else { + X[1] = X[0]; Y[1] = ADD (Y[0], _two); + if (numbytes == 64) + Encrypt (X, Y, ctx.rk, 4); + else { + X[2] = X[0]; Y[2] = ADD (Y[1], _two); + if (numbytes == 96) + Encrypt (X, Y, ctx.rk, 6); + else { + X[3] = X[0]; Y[3] = ADD (Y[2], _two); + Encrypt (X, Y, ctx.rk, 8); + } + } + } + + nonce[0] += (numbytes>>4); + + XOR_STORE (in, out, X[0], Y[0]); + if (numbytes >= 64) + XOR_STORE (in + 32, out + 32, X[1], Y[1]); + if (numbytes >= 96) + XOR_STORE (in + 64, out + 64, X[2], Y[2]); + if (numbytes >= 128) + XOR_STORE (in + 96, out + 96, X[3], Y[3]); return 0; } -// not neccessary for CTR mode -/* static int speck_decrypt(u64 *u, u64 *v, u64 key[]) { +int speck_expand_key (const unsigned char *k, speck_context_t *ctx) { + + u64 K[4]; + size_t i; + for(i = 0; i < numkeywords; i++) + K[i] = ((u64 *)k)[i]; + + EK (K[0], K[1], K[2], K[3], ctx->rk, ctx->key); + + return 0; +} + + +int speck_ctr (unsigned char *out, const unsigned char *in, unsigned long long inlen, + const unsigned char *n, speck_context_t ctx) { int i; - u64 x=*u,y=*v; - for (i = 33; i >= 0 ;i--) - RI (x, y, key[i]); + u64 nonce[2]; + unsigned char block[16]; + u64 * const block64 = (u64 *)block; + + if (!inlen) + return 0; + + nonce[0] = ((u64 *)n)[0]; + nonce[1] = ((u64 *)n)[1]; + + while (inlen >= 128) { + speck_encrypt_xor (out, in, nonce, ctx, 128); + in += 128; inlen -= 128; out += 128; + } + + if (inlen >= 96) { + speck_encrypt_xor (out, in, nonce, ctx, 96); + in += 96; inlen -= 96; out += 96; + } + + if (inlen >= 64) { + speck_encrypt_xor (out, in, nonce, ctx, 64); + in += 64; inlen -= 64; out += 64; + } + + if (inlen >= 32) { + speck_encrypt_xor (out, in, nonce, ctx, 32); + in += 32; inlen -= 32; out += 32; + } + + if (inlen >= 16) { + speck_encrypt_xor (block, in, nonce, ctx, 16); + ((u64 *)out)[0] = block64[0] ^ ((u64 *)in)[0]; + ((u64 *)out)[1] = block64[1] ^ ((u64 *)in)[1]; + in += 16; inlen -= 16; out += 16; + } + + if (inlen > 0) { + speck_encrypt_xor (block, in, nonce, ctx, 16); + for (i = 0; i < inlen; i++) + out[i] = block[i] ^ in[i]; + } + + return 0; +} + + +#else // (close to) C reference code -------------------------------------------- + + +#define u64 uint64_t + +#define ROR64(x,r) (((x)>>(r))|((x)<<(64-(r)))) +#define ROL64(x,r) (((x)<<(r))|((x)>>(64-(r)))) +#define R(x,y,k) (x=ROR64(x,8), x+=y, x^=k, y=ROL64(y,3), y^=x) + +typedef struct { + u64 key[34]; +} speck_context_t; + + +static int speck_encrypt (u64 *u, u64 *v, speck_context_t ctx) { + + u64 i, x = *u, y = *v; + + for (i = 0; i < 34; i++) + R (x, y, ctx.key[i]); *u = x; *v = y; return 0; -} */ +} -int speck_ctr (unsigned char *out, const unsigned char *in, - unsigned long long inlen, - const unsigned char *n, - u64 rk[]) { +int speck_ctr (unsigned char *out, const unsigned char *in, unsigned long long inlen, + const unsigned char *n, speck_context_t ctx) { u64 i, nonce[2], x, y, t; unsigned char *block = malloc (16); @@ -54,15 +235,13 @@ int speck_ctr (unsigned char *out, const unsigned char *in, free (block); return 0; } -// !!! htole64 !!! nonce[0] = htole64 ( ((u64*)n)[0] ); nonce[1] = htole64 ( ((u64*)n)[1] ); t=0; - while(inlen >= 16) { + while (inlen >= 16) { x = nonce[1]; y = nonce[0]; nonce[0]++; - speck_encrypt (&x, &y, rk); -// !!! htole64 !!! + speck_encrypt (&x, &y, ctx); ((u64 *)out)[1+t] = htole64 (x ^ ((u64 *)in)[1+t]); ((u64 *)out)[0+t] = htole64 (y ^ ((u64 *)in)[0+t]); t += 2; @@ -70,42 +249,41 @@ int speck_ctr (unsigned char *out, const unsigned char *in, } if (inlen > 0) { x = nonce[1]; y = nonce[0]; - speck_encrypt (&x, &y, rk); -// !!! htole64 !!! + speck_encrypt (&x, &y, ctx); ((u64 *)block)[1] = htole64 (x); ((u64 *)block)[0] = htole64 (y); - for (i=0; i < inlen; i++) + for (i = 0; i < inlen; i++) out[i + 8*t] = block[i] ^ in[i + 8*t]; } free (block); - return 0; } -int speck_expand_key (const unsigned char *k, u64 rk[]) { +int speck_expand_key (const unsigned char *k, speck_context_t * ctx) { u64 K[4]; u64 i; - for (i=0; i < 4; i++) -// !!! htole64 !!! + for (i = 0; i < 4; i++) K[i] = htole64 ( ((u64 *)k)[i] ); - - u64 D = K[3], C = K[2], B = K[1], A = K[0]; - for (i = 0; i < 33; i += 3) { - rk[i ] = A; R (B, A, i ); - rk[i+1] = A; R (C, A, i + 1); - rk[i+2] = A; R (D, A, i + 2); + ctx->key[i ] = K[0]; + R (K[1], K[0], i ); + ctx->key[i+1] = K[0]; + R (K[2], K[0], i + 1); + ctx->key[i+2] = K[0]; + R (K[3], K[0], i + 2); } - rk[33] = A; - + ctx->key[33] = K[0]; return 1; } +#endif // SSE, C ref + + int speck_test () { uint8_t key[32] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, @@ -123,18 +301,22 @@ int speck_test () { uint8_t ct[16] = { 0x43, 0x8f, 0x18, 0x9c, 0x8d, 0xb4, 0xee, 0x4e, 0x3e, 0xf5, 0xc0, 0x05, 0x04, 0x01, 0x09, 0x41 }; - u64 round_keys[34]; - speck_expand_key (key, round_keys); - speck_ctr (pt, pt, 16, iv, round_keys); -fprintf (stderr, "rk00: %016lx\n", round_keys[0]); -fprintf (stderr, "rk33: %016lx\n", round_keys[33]); + speck_context_t ctx; + + speck_expand_key (key, &ctx); + + speck_ctr (pt, pt, 16, iv, ctx); + + u64 i; +fprintf (stderr, "rk00: %016llx\n", ctx.key[0]); +fprintf (stderr, "rk33: %016llx\n", ctx.key[33]); fprintf (stderr, "out : %016lx\n", *(uint64_t*)pt); -fprintf (stderr, "mem : " ); for (int i=0; i < 16; i++) fprintf (stderr, "%02x ", pt[i]); fprintf (stderr, "\n"); +fprintf (stderr, "mem : " ); for (i=0; i < 16; i++) fprintf (stderr, "%02x ", pt[i]); fprintf (stderr, "\n"); int ret = 1; - for (int i=0; i < 16; i++) + for (i=0; i < 16; i++) if (pt[i] != ct[i]) ret = 0; return (ret); @@ -146,4 +328,3 @@ int main (int argc, char* argv[]) { fprintf (stdout, "SPECK SELF TEST RESULT: %u\n", speck_test (0,NULL)); } */ - diff --git a/transform_speck.c b/transform_speck.c index bc299ac..cde6394 100644 --- a/transform_speck.c +++ b/transform_speck.c @@ -32,7 +32,7 @@ typedef unsigned char n2n_speck_ivec_t[N2N_SPECK_IVEC_SIZE]; typedef struct transop_speck { - uint64_t rk[34]; /* the round keys for payload encryption & decryption */ + speck_context_t ctx; /* the round keys for payload encryption & decryption */ } transop_speck_t; /* ****************************************************** */ @@ -99,7 +99,7 @@ static int transop_encode_speck(n2n_trans_op_t * arg, which is (in this case) identical to original packet lentgh */ len = in_len; - speck_ctr (outbuf + TRANSOP_SPECK_PREAMBLE_SIZE, inbuf, in_len, enc_ivec, priv->rk); + speck_ctr (outbuf + TRANSOP_SPECK_PREAMBLE_SIZE, inbuf, in_len, enc_ivec, priv->ctx); traceEvent(TRACE_DEBUG, "encode_speck: encrypted %u bytes.\n", in_len); len += TRANSOP_SPECK_PREAMBLE_SIZE; /* size of data carried in UDP. */ @@ -145,7 +145,7 @@ static int transop_decode_speck(n2n_trans_op_t * arg, htobe64(*(uint64_t*)&dec_ivec[0]), htobe64(*(uint64_t*)&dec_ivec[8]) ); - speck_ctr (outbuf, inbuf + TRANSOP_SPECK_PREAMBLE_SIZE, len, dec_ivec, priv->rk); + speck_ctr (outbuf, inbuf + TRANSOP_SPECK_PREAMBLE_SIZE, len, dec_ivec, priv->ctx); traceEvent(TRACE_DEBUG, "decode_speck: decrypted %u bytes.\n", len); } else @@ -163,7 +163,7 @@ static int setup_speck_key(transop_speck_t *priv, const uint8_t *key, ssize_t ke uint8_t key_mat_buf[32] = { 0x00 }; /* Clear out any old possibly longer key matter. */ - memset(&(priv->rk), 0, sizeof(priv->rk) ); + memset(&(priv->ctx), 0, sizeof(priv->ctx) ); /* TODO: The input key always gets hashed to make a more unpredictable and more complete use of the key space */ // REVISIT: Hash the key to keymat (formerly used: SHA) @@ -173,7 +173,7 @@ static int setup_speck_key(transop_speck_t *priv, const uint8_t *key, ssize_t ke // FOR NOW: USE KEY ITSELF memcpy (key_mat_buf, key, ((key_size>32)?32:key_size) ); - speck_expand_key (key_mat_buf, priv->rk); + speck_expand_key (key_mat_buf, priv->ctx); traceEvent(TRACE_DEBUG, "Speck key setup completed\n"); return(0);