From 11c806231ea9ab494dcf876c55ff9b020d46b0c1 Mon Sep 17 00:00:00 2001 From: Logan007 Date: Thu, 4 Jun 2020 14:52:44 +0545 Subject: [PATCH] added NEON support (and passed ctx by reference) --- speck.c | 219 +++++++++++++++++++++++++++++++++++++++++----- speck.h | 11 ++- transform_speck.c | 4 +- 3 files changed, 209 insertions(+), 25 deletions(-) diff --git a/speck.c b/speck.c index fbda3a9..996e4ab 100644 --- a/speck.c +++ b/speck.c @@ -6,7 +6,7 @@ #include #include -#ifdef __SSE4_2__ // SSE support ---------------------------------------------------- +#if defined (__SSE4_2__)// SSE support ---------------------------------------------------- #include @@ -89,14 +89,14 @@ typedef struct { } speck_context_t; -static int speck_encrypt_xor (unsigned char *out, const unsigned char *in, u64 nonce[], speck_context_t ctx, int numbytes) { +static int speck_encrypt_xor (unsigned char *out, const unsigned char *in, u64 nonce[], speck_context_t *ctx, int numbytes) { u64 x[2], y[2]; u128 X[4], Y[4], Z[4]; if (numbytes == 16) { x[0] = nonce[1]; y[0] = nonce[0]; nonce[0]++; - Encrypt (x, y, ctx.key, 1); + Encrypt (x, y, ctx->key, 1); ((u64 *)out)[1] = x[0]; ((u64 *)out)[0] = y[0]; return 0; } @@ -104,18 +104,18 @@ static int speck_encrypt_xor (unsigned char *out, const unsigned char *in, u64 n SET1 (X[0], nonce[1]); SET2 (Y[0], nonce[0]); if (numbytes == 32) - Encrypt (X, Y, ctx.rk, 2); + Encrypt (X, Y, ctx->rk, 2); else { X[1] = X[0]; Y[1] = ADD (Y[0], _two); if (numbytes == 64) - Encrypt (X, Y, ctx.rk, 4); + Encrypt (X, Y, ctx->rk, 4); else { X[2] = X[0]; Y[2] = ADD (Y[1], _two); if (numbytes == 96) - Encrypt (X, Y, ctx.rk, 6); + Encrypt (X, Y, ctx->rk, 6); else { X[3] = X[0]; Y[3] = ADD (Y[2], _two); - Encrypt (X, Y, ctx.rk, 8); + Encrypt (X, Y, ctx->rk, 8); } } } @@ -148,7 +148,7 @@ int speck_expand_key (const unsigned char *k, speck_context_t *ctx) { int speck_ctr (unsigned char *out, const unsigned char *in, unsigned long long inlen, - const unsigned char *n, speck_context_t ctx) { + const unsigned char *n, speck_context_t *ctx) { int i; u64 nonce[2]; @@ -198,7 +198,186 @@ int speck_ctr (unsigned char *out, const unsigned char *in, unsigned long long i } -#else // (close to) C reference code -------------------------------------------- +#elif defined (__ARM_NEON) // NEON support ------------------------------------------- + + +#include + +#define u32 uint32_t +#define u64 uint64_t +#define u128 uint64x2_t + +#define LCS(x,r) (((x)<>(64-r))) +#define RCS(x,r) (((x)>>r)|((x)<<(64-r))) + +#define XOR veorq_u64 +#define AND vandq_u64 +#define ADD vaddq_u64 +#define SL vshlq_n_u64 +#define SR vshrq_n_u64 + +#define SET(a,b) vcombine_u64((uint64x1_t)(a),(uint64x1_t)(b)) +#define SET1(X,c) (X=SET(c,c)) +#define SET2(X,c) (SET1(X,c), X=ADD(X,SET(0x1ll,0x0ll)),c+=2) + +#define LOW(Z) vgetq_lane_u64(Z,0) +#define HIGH(Z) vgetq_lane_u64(Z,1) +#define STORE(ip,X,Y) (((u64 *)(ip))[0]=HIGH(Y), ((u64 *)(ip))[1]=HIGH(X), ((u64 *)(ip))[2]=LOW(Y), ((u64 *)(ip))[3]=LOW(X)) +#define XOR_STORE(in,out,X,Y) (Y=XOR(Y,SET(((u64 *)(in))[2],((u64 *)(in))[0])), X=XOR(X,SET(((u64 *)(in))[3],((u64 *)(in))[1])), STORE(out,X,Y)) + +#define ROR(X,r) vsriq_n_u64(SL(X,(64-r)),X,r) +#define ROL(X,r) ROR(X,(64-r)) + +#define tableR vcreate_u8(0x0007060504030201LL) +#define tableL vcreate_u8(0x0605040302010007LL) +#define ROR8(X) SET(vtbl1_u8((uint8x8_t)vget_low_u64(X),tableR), vtbl1_u8((uint8x8_t)vget_high_u64(X),tableR)) +#define ROL8(X) SET(vtbl1_u8((uint8x8_t)vget_low_u64(X),tableL), vtbl1_u8((uint8x8_t)vget_high_u64(X),tableL)) + +#define numrounds 34 +#define numkeywords 4 + +#define R(X,Y,k) (X=XOR(ADD(ROR8(X),Y),k), Y=XOR(ROL(Y,3),X)) + +#define Rx2(X,Y,k) (R(X[0],Y[0],k)) + +#define Rx4(X,Y,k) (R(X[0],Y[0],k), R(X[1],Y[1],k)) +#define Rx6(X,Y,k) (R(X[0],Y[0],k), R(X[1],Y[1],k), R(X[2],Y[2],k)) +#define Rx8(X,Y,k) (X[0]=ROR8(X[0]), X[0]=ADD(X[0],Y[0]), X[0]=XOR(X[0],k), X[1]=ROR8(X[1]), X[1]=ADD(X[1],Y[1]), X[1]=XOR(X[1],k), \ + X[2]=ROR8(X[2]), X[2]=ADD(X[2],Y[2]), X[2]=XOR(X[2],k), X[3]=ROR8(X[3]), X[3]=ADD(X[3],Y[3]), X[3]=XOR(X[3],k), \ + Z[0]=SL(Y[0],3), Z[1]=SL(Y[1],3), Z[2]=SL(Y[2],3), Z[3]=SL(Y[3],3), \ + Y[0]=SR(Y[0],61), Y[1]=SR(Y[1],61), Y[2]=SR(Y[2],61), Y[3]=SR(Y[3],61), \ + Y[0]=XOR(Y[0],Z[0]), Y[1]=XOR(Y[1],Z[1]), Y[2]=XOR(Y[2],Z[2]), Y[3]=XOR(Y[3],Z[3]), \ + Y[0]=XOR(X[0],Y[0]), Y[1]=XOR(X[1],Y[1]), Y[2]=XOR(X[2],Y[2]), Y[3]=XOR(X[3],Y[3])) + + +#define Rx1(x,y,k) (x[0]=RCS(x[0],8), x[0]+=y[0], x[0]^=k, y[0]=LCS(y[0],3), y[0]^=x[0]) + +#define Rx1b(x,y,k) (x=RCS(x,8), x+=y, x^=k, y=LCS(y,3), y^=x) + +#define Enc(X,Y,k,n) (Rx##n(X,Y,k[0]), Rx##n(X,Y,k[1]), Rx##n(X,Y,k[2]), Rx##n(X,Y,k[3]), Rx##n(X,Y,k[4]), Rx##n(X,Y,k[5]), Rx##n(X,Y,k[6]), Rx##n(X,Y,k[7]), \ + Rx##n(X,Y,k[8]), Rx##n(X,Y,k[9]), Rx##n(X,Y,k[10]), Rx##n(X,Y,k[11]), Rx##n(X,Y,k[12]), Rx##n(X,Y,k[13]), Rx##n(X,Y,k[14]), Rx##n(X,Y,k[15]), \ + Rx##n(X,Y,k[16]), Rx##n(X,Y,k[17]), Rx##n(X,Y,k[18]), Rx##n(X,Y,k[19]), Rx##n(X,Y,k[20]), Rx##n(X,Y,k[21]), Rx##n(X,Y,k[22]), Rx##n(X,Y,k[23]), \ + Rx##n(X,Y,k[24]), Rx##n(X,Y,k[25]), Rx##n(X,Y,k[26]), Rx##n(X,Y,k[27]), Rx##n(X,Y,k[28]), Rx##n(X,Y,k[29]), Rx##n(X,Y,k[30]), Rx##n(X,Y,k[31]), \ + Rx##n(X,Y,k[32]), Rx##n(X,Y,k[33])) + +#define RK(X,Y,k,key,i) (SET1(k[i],Y), key[i]=Y, X=RCS(X,8), X+=Y, X^=i, Y=LCS(Y,3), Y^=X) + +#define EK(A,B,C,D,k,key) (RK(B,A,k,key,0), RK(C,A,k,key,1), RK(D,A,k,key,2), RK(B,A,k,key,3), RK(C,A,k,key,4), RK(D,A,k,key,5), RK(B,A,k,key,6), \ + RK(C,A,k,key,7), RK(D,A,k,key,8), RK(B,A,k,key,9), RK(C,A,k,key,10), RK(D,A,k,key,11), RK(B,A,k,key,12), RK(C,A,k,key,13), \ + RK(D,A,k,key,14), RK(B,A,k,key,15), RK(C,A,k,key,16), RK(D,A,k,key,17), RK(B,A,k,key,18), RK(C,A,k,key,19), RK(D,A,k,key,20), \ + RK(B,A,k,key,21), RK(C,A,k,key,22), RK(D,A,k,key,23), RK(B,A,k,key,24), RK(C,A,k,key,25), RK(D,A,k,key,26), RK(B,A,k,key,27), \ + RK(C,A,k,key,28), RK(D,A,k,key,29), RK(B,A,k,key,30), RK(C,A,k,key,31), RK(D,A,k,key,32), RK(B,A,k,key,33)) + +typedef struct { + u128 rk[34]; + u64 key[34]; +} speck_context_t; + + +int Encrypt_Xor(unsigned char *out, const unsigned char *in, u64 nonce[], speck_context_t *ctx, int numbytes) +{ + + u64 x[2],y[2]; + u128 X[4],Y[4],Z[4]; + + if (numbytes==16){ + x[0]=nonce[1]; y[0]=nonce[0]; nonce[0]++; + Enc(x,y,ctx->key,1); + ((u64 *)out)[1]=x[0]; ((u64 *)out)[0]=y[0]; + + return 0; + } + + + SET1(X[0],nonce[1]); SET2(Y[0],nonce[0]); + + if (numbytes==32) Enc(X,Y,ctx->rk,2); + else{ + X[1]=X[0]; SET2(Y[1],nonce[0]); + if (numbytes==64) Enc(X,Y,ctx->rk,4); + else{ + X[2]=X[0]; SET2(Y[2],nonce[0]); + if (numbytes==96) Enc(X,Y,ctx->rk,6); + else{ + X[3]=X[0]; SET2(Y[3],nonce[0]); + Enc(X,Y,ctx->rk,8); + } + } + } + + XOR_STORE(in,out,X[0],Y[0]); + if (numbytes>=64) XOR_STORE(in+32,out+32,X[1],Y[1]); + if (numbytes>=96) XOR_STORE(in+64,out+64,X[2],Y[2]); + if (numbytes>=128) XOR_STORE(in+96,out+96,X[3],Y[3]); + + return 0; +} + + +int speck_expand_key (const unsigned char *k, speck_context_t *ctx) { + + u64 K[4]; + size_t i; + for(i = 0; i < numkeywords; i++) + K[i] = ((u64 *)k)[i]; + + EK (K[0], K[1], K[2], K[3], ctx->rk, ctx->key); + + return 0; +} + + +int speck_ctr (unsigned char *out, const unsigned char *in, unsigned long long inlen, + const unsigned char *n, speck_context_t *ctx) { + + int i; + u64 nonce[2],K[4],key[34],A,B,C,D,x,y; + unsigned char block[16]; + u64 *const block64=(u64 *)block; + u128 rk[34]; + + if (!inlen) return 0; + + nonce[0]=((u64 *)n)[0]; + nonce[1]=((u64 *)n)[1]; + + while(inlen>=128){ + Encrypt_Xor(out,in,nonce,ctx,128); + in+=128; inlen-=128; out+=128; + } + + if (inlen>=96){ + Encrypt_Xor(out,in,nonce,ctx,96); + in+=96; inlen-=96; out+=96; + } + + if (inlen>=64){ + Encrypt_Xor(out,in,nonce,ctx,64); + in+=64; inlen-=64; out+=64; + } + + if (inlen>=32){ + Encrypt_Xor(out,in,nonce,ctx,32); + in+=32; inlen-=32; out+=32; + } + + if (inlen>=16){ + Encrypt_Xor(block,in,nonce,ctx,16); + ((u64 *)out)[0]=block64[0]^((u64 *)in)[0]; + ((u64 *)out)[1]=block64[1]^((u64 *)in)[1]; + in+=16; inlen-=16; out+=16; + } + + if (inlen>0){ + Encrypt_Xor(block,in,nonce,ctx,16); + for(i=0;ikey[i]); *u = x; *v = y; @@ -226,7 +405,7 @@ static int speck_encrypt (u64 *u, u64 *v, speck_context_t ctx) { int speck_ctr (unsigned char *out, const unsigned char *in, unsigned long long inlen, - const unsigned char *n, speck_context_t ctx) { + const unsigned char *n, speck_context_t *ctx) { u64 i, nonce[2], x, y, t; unsigned char *block = malloc (16); @@ -260,7 +439,7 @@ int speck_ctr (unsigned char *out, const unsigned char *in, unsigned long long i } -int speck_expand_key (const unsigned char *k, speck_context_t * ctx) { +int speck_expand_key (const unsigned char *k, speck_context_t *ctx) { u64 K[4]; u64 i; @@ -281,7 +460,7 @@ int speck_expand_key (const unsigned char *k, speck_context_t * ctx) { } -#endif // SSE, C ref +#endif // SSE, NEON, plain C int speck_test () { @@ -301,19 +480,17 @@ int speck_test () { uint8_t ct[16] = { 0x43, 0x8f, 0x18, 0x9c, 0x8d, 0xb4, 0xee, 0x4e, 0x3e, 0xf5, 0xc0, 0x05, 0x04, 0x01, 0x09, 0x41 }; - - speck_context_t ctx; speck_expand_key (key, &ctx); - speck_ctr (pt, pt, 16, iv, ctx); + speck_ctr (pt, pt, 16, iv, &ctx); u64 i; -fprintf (stderr, "rk00: %016llx\n", ctx.key[0]); -fprintf (stderr, "rk33: %016llx\n", ctx.key[33]); -fprintf (stderr, "out : %016lx\n", *(uint64_t*)pt); -fprintf (stderr, "mem : " ); for (i=0; i < 16; i++) fprintf (stderr, "%02x ", pt[i]); fprintf (stderr, "\n"); +//fprintf (stderr, "rk00: %016llx\n", ctx.key[0]); +//fprintf (stderr, "rk33: %016llx\n", ctx.key[33]); +//fprintf (stderr, "out : %016lx\n", *(uint64_t*)pt); +//fprintf (stderr, "mem : " ); for (i=0; i < 16; i++) fprintf (stderr, "%02x ", pt[i]); fprintf (stderr, "\n"); int ret = 1; for (i=0; i < 16; i++) diff --git a/speck.h b/speck.h index d92d455..a73d6a9 100644 --- a/speck.h +++ b/speck.h @@ -1,13 +1,20 @@ #define u64 uint64_t -#ifdef __SSE4_2__ +#if defined (__SSE4_2__) #include #define u128 __m128i typedef struct { u128 rk[34]; u64 key[34]; } speck_context_t; +#elif defined (__ARM_NEON) + #include + #define u128 uint64x2_t +typedef struct { + u128 rk[34]; + u64 key[34]; +} speck_context_t; #else typedef struct { u64 key[34]; @@ -17,6 +24,6 @@ typedef struct { int speck_ctr (unsigned char *out, const unsigned char *in, unsigned long long inlen, const unsigned char *n, - speck_context_t ctx); + speck_context_t *ctx); int speck_expand_key (const unsigned char *k, speck_context_t *ctx); diff --git a/transform_speck.c b/transform_speck.c index d411a41..3cb712a 100644 --- a/transform_speck.c +++ b/transform_speck.c @@ -99,7 +99,7 @@ static int transop_encode_speck(n2n_trans_op_t * arg, which is (in this case) identical to original packet lentgh */ len = in_len; - speck_ctr (outbuf + TRANSOP_SPECK_PREAMBLE_SIZE, inbuf, in_len, enc_ivec, priv->ctx); + speck_ctr (outbuf + TRANSOP_SPECK_PREAMBLE_SIZE, inbuf, in_len, enc_ivec, &(priv->ctx)); traceEvent(TRACE_DEBUG, "encode_speck: encrypted %u bytes.\n", in_len); len += TRANSOP_SPECK_PREAMBLE_SIZE; /* size of data carried in UDP. */ @@ -145,7 +145,7 @@ static int transop_decode_speck(n2n_trans_op_t * arg, htobe64(*(uint64_t*)&dec_ivec[0]), htobe64(*(uint64_t*)&dec_ivec[8]) ); - speck_ctr (outbuf, inbuf + TRANSOP_SPECK_PREAMBLE_SIZE, len, dec_ivec, priv->ctx); + speck_ctr (outbuf, inbuf + TRANSOP_SPECK_PREAMBLE_SIZE, len, dec_ivec, &(priv->ctx)); traceEvent(TRACE_DEBUG, "decode_speck: decrypted %u bytes.\n", len); } else