readability code clean-up (#530)

4 years ago · 9fbe941511
2 changed files with 424 additions and 405 deletions
--- a/include/tf.h
+++ b/include/tf.h
@ -22,42 +22,44 @@
 // published on github/drewcsillag/twofish
-/*
+/**
-The MIT License (MIT)
+ * The MIT License (MIT)
-
+ *
-Copyright (c) 2015 Andrew T. Csillag
+ * Copyright (c) 2015 Andrew T. Csillag
-
+ *
-Permission is hereby granted, free of charge, to any person obtaining a copy
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
+ * of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
+ * in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
+ * copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
+ * furnished to do so, subject to the following conditions:
-
+ *
-The above copyright notice and this permission notice shall be included in
+ * The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
+ * all copies or substantial portions of the Software.
-
+ *
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
+ * THE SOFTWARE.
-*/
+ */
 #ifndef TF_H
 #define TF_H
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include "portable_endian.h"
-#define TF_BLOCK_SIZE           16
+#define TF_BLOCK_SIZE     16
-#define TF_IV_SIZE             (TF_BLOCK_SIZE)
+#define TF_IV_SIZE       (TF_BLOCK_SIZE)
 typedef struct tf_context_t {
@ -82,4 +84,4 @@ int tf_init (const unsigned char *key, size_t key_size, tf_context_t **ctx);
 int tf_deinit (tf_context_t *ctx);
-#endif    // TF_H
+#endif // TF_H
--- a/src/tf.c
+++ b/src/tf.c
@ -22,29 +22,29 @@
 // published on github/drewcsillag/twofish
-/*
+/**
-The MIT License (MIT)
+ * The MIT License (MIT)
-
+ *
-Copyright (c) 2015 Andrew T. Csillag
+ * Copyright (c) 2015 Andrew T. Csillag
-
+ *
-Permission is hereby granted, free of charge, to any person obtaining a copy
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
+ * of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
+ * in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
+ * copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
+ * furnished to do so, subject to the following conditions:
-
+ *
-The above copyright notice and this permission notice shall be included in
+ * The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
+ * all copies or substantial portions of the Software.
-
+ *
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
+ * THE SOFTWARE.
-*/
+ */
 #include "tf.h"
@ -123,6 +123,7 @@ const uint8_t multEF[] = { 0x00, 0xEF, 0xB7, 0x58, 0x07, 0xE8, 0xB0, 0x5F, 0x0E,
                           0xA8, 0x47, 0x1F, 0xF0, 0xAF, 0x40, 0x18, 0xF7, 0xA6, 0x49, 0x11, 0xFE, 0xA1, 0x4E, 0x16, 0xF9,
                           0xB4, 0x5B, 0x03, 0xEC, 0xB3, 0x5C, 0x04, 0xEB, 0xBA, 0x55, 0x0D, 0xE2, 0xBD, 0x52, 0x0A, 0xE5 };
 #define RS_MOD 0x14D
 #define RHO 0x01010101L
@ -140,463 +141,479 @@ const uint8_t multEF[] = { 0x00, 0xEF, 0xB7, 0x58, 0x07, 0xE8, 0xB0, 0x5F, 0x0E,
 #define U8S_TO_U32(r0, r1, r2, r3) ((r0 << 24) ^ (r1 << 16) ^ (r2 << 8) ^ r3)
-/* multiply two polynomials represented as u32's, actually called with bytes */
+// multiply two polynomials represented as u32's, actually called with bytes
 uint32_t polyMult(uint32_t a, uint32_t b) {
-  uint32_t t=0;
+    uint32_t t=0;
-  while(a) {
+    while(a) {
-    if(a&1) t^=b;
+        if(a & 1)
-    b <<= 1;
+            t^=b;
-    a >>= 1;
+        b <<= 1;
-  }
+        a >>= 1;
    }
-  return t;
+    return t;
 }
-/* take the polynomial t and return the t % modulus in GF(256) */
+// take the polynomial t and return t % modulus in GF(256)
 uint32_t gfMod(uint32_t t, uint32_t modulus) {
-  int i;
+    int i;
-  uint32_t tt;
+    uint32_t tt;
-  modulus <<= 7;
+    modulus <<= 7;
-  for(i = 0; i < 8; i++) {
+    for(i = 0; i < 8; i++) {
-    tt = t ^ modulus;
+        tt = t ^ modulus;
-    if(tt < t) t = tt;
+        if(tt < t)
-    modulus >>= 1;
+             t = tt;
-  }
+        modulus >>= 1;
    }
-  return t;
+    return t;
 }
-/*multiply a and b and return the modulus */
+// multiply a and b and return the product reduced by the modulus
 #define gfMult(a, b, modulus) gfMod(polyMult(a, b), modulus)
-/* return a u32 containing the result of multiplying the RS Code matrix by the sd matrix */
+// return a u32 containing the result of multiplying the RS Code matrix by the sd matrix
 uint32_t RSMatrixMultiply(uint8_t sd[8]) {
-  int j, k;
+    int j, k;
-  uint8_t t;
+    uint8_t t;
-  uint8_t result[4];
+    uint8_t result[4];
-  for(j = 0; j < 4; j++) {
+    for(j = 0; j < 4; j++) {
-    t = 0;
+        t = 0;
-    for(k = 0; k < 8; k++) {
+        for(k = 0; k < 8; k++) {
-      t ^= gfMult(RS[j][k], sd[k], RS_MOD);
+            t ^= gfMult(RS[j][k], sd[k], RS_MOD);
        }
        result[3-j] = t;
    }
    result[3-j] = t;
  }
-  return U8ARRAY_TO_U32(result);
+    return U8ARRAY_TO_U32(result);
 }
-/* the Zero-keyed h function (used by the key setup routine) */
+// the Zero-keyed h function (used by the key setup routine)
 uint32_t h(uint32_t X, uint32_t L[4], int k) {
-  uint8_t y0, y1, y2, y3;
+    uint8_t y0, y1, y2, y3;
-  uint8_t z0, z1, z2, z3;
+    uint8_t z0, z1, z2, z3;
-
+
-  y0 = b0(X);
+    y0 = b0(X);
-  y1 = b1(X);
+    y1 = b1(X);
-  y2 = b2(X);
+    y2 = b2(X);
-  y3 = b3(X);
+    y3 = b3(X);
-
+
-  switch(k) {
+    switch(k) {
-    case 4:
+        case 4:
-      y0 = Q1[y0] ^ b0(L[3]);
+            y0 = Q1[y0] ^ b0(L[3]);
-      y1 = Q0[y1] ^ b1(L[3]);
+            y1 = Q0[y1] ^ b1(L[3]);
-      y2 = Q0[y2] ^ b2(L[3]);
+            y2 = Q0[y2] ^ b2(L[3]);
-      y3 = Q1[y3] ^ b3(L[3]);
+            y3 = Q1[y3] ^ b3(L[3]);
-    case 3:
+        case 3:
-      y0 = Q1[y0] ^ b0(L[2]);
+            y0 = Q1[y0] ^ b0(L[2]);
-      y1 = Q1[y1] ^ b1(L[2]);
+            y1 = Q1[y1] ^ b1(L[2]);
-      y2 = Q0[y2] ^ b2(L[2]);
+            y2 = Q0[y2] ^ b2(L[2]);
-      y3 = Q0[y3] ^ b3(L[2]);
+            y3 = Q0[y3] ^ b3(L[2]);
-    case 2:
+        case 2:
-      y0 = Q1[  Q0 [ Q0[y0] ^ b0(L[1]) ] ^ b0(L[0]) ];
+            y0 = Q1[  Q0 [ Q0[y0] ^ b0(L[1]) ] ^ b0(L[0]) ];
-      y1 = Q0[  Q0 [ Q1[y1] ^ b1(L[1]) ] ^ b1(L[0]) ];
+            y1 = Q0[  Q0 [ Q1[y1] ^ b1(L[1]) ] ^ b1(L[0]) ];
-      y2 = Q1[  Q1 [ Q0[y2] ^ b2(L[1]) ] ^ b2(L[0]) ];
+            y2 = Q1[  Q1 [ Q0[y2] ^ b2(L[1]) ] ^ b2(L[0]) ];
-      y3 = Q0[  Q1 [ Q1[y3] ^ b3(L[1]) ] ^ b3(L[0]) ];
+            y3 = Q0[  Q1 [ Q1[y3] ^ b3(L[1]) ] ^ b3(L[0]) ];
-  }
+    }
-
+
-  /* inline the MDS matrix multiply */
+    // inline the MDS matrix multiply
-  z0 = multEF[y0] ^ y1 ^         multEF[y2] ^ mult5B[y3];
+    z0 = multEF[y0] ^ y1 ^         multEF[y2] ^ mult5B[y3];
-  z1 = multEF[y0] ^ mult5B[y1] ^ y2 ^         multEF[y3];
+    z1 = multEF[y0] ^ mult5B[y1] ^ y2 ^         multEF[y3];
-  z2 = mult5B[y0] ^ multEF[y1] ^ multEF[y2] ^ y3;
+    z2 = mult5B[y0] ^ multEF[y1] ^ multEF[y2] ^ y3;
-  z3 = y0 ^         multEF[y1] ^ mult5B[y2] ^ mult5B[y3];
+    z3 = y0 ^         multEF[y1] ^ mult5B[y2] ^ mult5B[y3];
-
+
-  return U8S_TO_U32(z0, z1, z2, z3);
+    return U8S_TO_U32(z0, z1, z2, z3);
 }
-/* given the Sbox keys, create the fully keyed QF */
+// given the Sbox keys, create the fully keyed QF
 void fullKey(uint32_t L[4], int k, uint32_t QF[4][256]) {
-  uint8_t y0, y1, y2, y3;
+    uint8_t y0, y1, y2, y3;
-  int i;
+    int i;
-
+
-  /* for all input values to the Q permutations */
+    // for all input values to the Q permutations
-  for(i=0; i<256; i++) {
+    for(i = 0; i < 256; i++) {
-    /* run the Q permutations */
+        // run the Q permutations
-    y0 = i; y1=i; y2=i; y3=i;
+        y0 = i; y1 = i; y2 = i; y3 = i;
-    switch(k) {
+        switch(k) {
-      case 4:
+            case 4:
-        y0 = Q1[y0] ^ b0(L[3]);
+                y0 = Q1[y0] ^ b0(L[3]);
-        y1 = Q0[y1] ^ b1(L[3]);
+                y1 = Q0[y1] ^ b1(L[3]);
-        y2 = Q0[y2] ^ b2(L[3]);
+                y2 = Q0[y2] ^ b2(L[3]);
-        y3 = Q1[y3] ^ b3(L[3]);
+                y3 = Q1[y3] ^ b3(L[3]);
-      case 3:
+            case 3:
-        y0 = Q1[y0] ^ b0(L[2]);
+                y0 = Q1[y0] ^ b0(L[2]);
-        y1 = Q1[y1] ^ b1(L[2]);
+                y1 = Q1[y1] ^ b1(L[2]);
-        y2 = Q0[y2] ^ b2(L[2]);
+                y2 = Q0[y2] ^ b2(L[2]);
-        y3 = Q0[y3] ^ b3(L[2]);
+                y3 = Q0[y3] ^ b3(L[2]);
-      case 2:
+            case 2:
-        y0 = Q1[  Q0 [ Q0[y0] ^ b0(L[1]) ] ^ b0(L[0]) ];
+                y0 = Q1[  Q0 [ Q0[y0] ^ b0(L[1]) ] ^ b0(L[0]) ];
-        y1 = Q0[  Q0 [ Q1[y1] ^ b1(L[1]) ] ^ b1(L[0]) ];
+                y1 = Q0[  Q0 [ Q1[y1] ^ b1(L[1]) ] ^ b1(L[0]) ];
-        y2 = Q1[  Q1 [ Q0[y2] ^ b2(L[1]) ] ^ b2(L[0]) ];
+                y2 = Q1[  Q1 [ Q0[y2] ^ b2(L[1]) ] ^ b2(L[0]) ];
-        y3 = Q0[  Q1 [ Q1[y3] ^ b3(L[1]) ] ^ b3(L[0]) ];
+                y3 = Q0[  Q1 [ Q1[y3] ^ b3(L[1]) ] ^ b3(L[0]) ];
        }
        // now do the partial MDS matrix multiplies
        QF[0][i] = ((multEF[y0] << 24)
                 | (multEF[y0] << 16)
                 | (mult5B[y0] << 8)
                 | y0);
        QF[1][i] = ((y1 << 24)
                 | (mult5B[y1] << 16)
                 | (multEF[y1] << 8)
                 | multEF[y1]);
        QF[2][i] = ((multEF[y2] << 24)
                 | (y2 << 16)
                 | (multEF[y2] << 8)
                 | mult5B[y2]);
        QF[3][i] = ((mult5B[y3] << 24)
                 | (multEF[y3] << 16)
                 | (y3 << 8)
                 | mult5B[y3]);
    }
    /* now do the partial MDS matrix multiplies */
    QF[0][i] = ((multEF[y0] << 24)
             | (multEF[y0] << 16)
             | (mult5B[y0] << 8)
             | y0);
    QF[1][i] = ((y1 << 24)
             | (mult5B[y1] << 16)
             | (multEF[y1] << 8)
             | multEF[y1]);
    QF[2][i] = ((multEF[y2] << 24)
             | (y2 << 16)
             | (multEF[y2] << 8)
             | mult5B[y2]);
    QF[3][i] = ((mult5B[y3] << 24)
             | (multEF[y3] << 16)
             | (y3 << 8)
             | mult5B[y3]);
  }
 }
-// -------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------------------------------
-/* fully keyed h (aka g) function */
+// fully keyed h (aka g) function
 #define fkh(X) (ctx->QF[0][b0(X)]^ctx->QF[1][b1(X)]^ctx->QF[2][b2(X)]^ctx->QF[3][b3(X)])
 // -------------------------------------------------------------------------------------
-/* one encryption round */
+// ----------------------------------------------------------------------------------------------------------------
 // one encryption round
 #define ENC_ROUND(R0, R1, R2, R3, round) \
-  T0 = fkh(R0); \
+    T0 = fkh(R0); \
-  T1 = fkh(ROL(R1, 8)); \
+    T1 = fkh(ROL(R1, 8)); \
-  R2 = ROR(R2 ^ (T1 + T0 + ctx->K[2*round+8]), 1); \
+    R2 = ROR(R2 ^ (T1 + T0 + ctx->K[2*round+8]), 1); \
-  R3 = ROL(R3, 1) ^ (2*T1 + T0 + ctx->K[2*round+9]);
+    R3 = ROL(R3, 1) ^ (2*T1 + T0 + ctx->K[2*round+9]);
 void twofish_internal_encrypt(uint8_t PT[16], tf_context_t *ctx) {
-  uint32_t R0, R1, R2, R3;
+    uint32_t R0, R1, R2, R3;
-  uint32_t T0, T1;
+    uint32_t T0, T1;
-
+
-  /* load/byteswap/whiten input */
+    // load/byteswap/whiten input
-  R3 = ctx->K[3] ^ le32toh(((uint32_t*)PT)[3]);
+    R3 = ctx->K[3] ^ le32toh(((uint32_t*)PT)[3]);
-  R2 = ctx->K[2] ^ le32toh(((uint32_t*)PT)[2]);
+    R2 = ctx->K[2] ^ le32toh(((uint32_t*)PT)[2]);
-  R1 = ctx->K[1] ^ le32toh(((uint32_t*)PT)[1]);
+    R1 = ctx->K[1] ^ le32toh(((uint32_t*)PT)[1]);
-  R0 = ctx->K[0] ^ le32toh(((uint32_t*)PT)[0]);
+    R0 = ctx->K[0] ^ le32toh(((uint32_t*)PT)[0]);
-
+
-  ENC_ROUND(R0, R1, R2, R3, 0);
+    ENC_ROUND(R0, R1, R2, R3,  0);
-  ENC_ROUND(R2, R3, R0, R1, 1);
+    ENC_ROUND(R2, R3, R0, R1,  1);
-  ENC_ROUND(R0, R1, R2, R3, 2);
+    ENC_ROUND(R0, R1, R2, R3,  2);
-  ENC_ROUND(R2, R3, R0, R1, 3);
+    ENC_ROUND(R2, R3, R0, R1,  3);
-  ENC_ROUND(R0, R1, R2, R3, 4);
+    ENC_ROUND(R0, R1, R2, R3,  4);
-  ENC_ROUND(R2, R3, R0, R1, 5);
+    ENC_ROUND(R2, R3, R0, R1,  5);
-  ENC_ROUND(R0, R1, R2, R3, 6);
+    ENC_ROUND(R0, R1, R2, R3,  6);
-  ENC_ROUND(R2, R3, R0, R1, 7);
+    ENC_ROUND(R2, R3, R0, R1,  7);
-  ENC_ROUND(R0, R1, R2, R3, 8);
+    ENC_ROUND(R0, R1, R2, R3,  8);
-  ENC_ROUND(R2, R3, R0, R1, 9);
+    ENC_ROUND(R2, R3, R0, R1,  9);
-  ENC_ROUND(R0, R1, R2, R3, 10);
+    ENC_ROUND(R0, R1, R2, R3, 10);
-  ENC_ROUND(R2, R3, R0, R1, 11);
+    ENC_ROUND(R2, R3, R0, R1, 11);
-  ENC_ROUND(R0, R1, R2, R3, 12);
+    ENC_ROUND(R0, R1, R2, R3, 12);
-  ENC_ROUND(R2, R3, R0, R1, 13);
+    ENC_ROUND(R2, R3, R0, R1, 13);
-  ENC_ROUND(R0, R1, R2, R3, 14);
+    ENC_ROUND(R0, R1, R2, R3, 14);
-  ENC_ROUND(R2, R3, R0, R1, 15);
+    ENC_ROUND(R2, R3, R0, R1, 15);
-
+
-  /* load/byteswap/whiten output */
+    // whiten/byteswap/store output
-  ((uint32_t*)PT)[3] = htole32(R1 ^ ctx->K[7]);
+    ((uint32_t*)PT)[3] = htole32(R1 ^ ctx->K[7]);
-  ((uint32_t*)PT)[2] = htole32(R0 ^ ctx->K[6]);
+    ((uint32_t*)PT)[2] = htole32(R0 ^ ctx->K[6]);
-  ((uint32_t*)PT)[1] = htole32(R3 ^ ctx->K[5]);
+    ((uint32_t*)PT)[1] = htole32(R3 ^ ctx->K[5]);
-  ((uint32_t*)PT)[0] = htole32(R2 ^ ctx->K[4]);
+    ((uint32_t*)PT)[0] = htole32(R2 ^ ctx->K[4]);
 }
 // -------------------------------------------------------------------------------------
-/* one decryption round */
+// ----------------------------------------------------------------------------------------------------------------
 // one decryption round
 #define DEC_ROUND(R0, R1, R2, R3, round) \
-  T0 = fkh(R0); \
+    T0 = fkh(R0); \
-  T1 = fkh(ROL(R1, 8)); \
+    T1 = fkh(ROL(R1, 8)); \
-  R2 = ROL(R2, 1) ^ (T0 + T1 + ctx->K[2*round+8]); \
+    R2 = ROL(R2, 1) ^ (T0 + T1 + ctx->K[2*round+8]); \
-  R3 = ROR(R3 ^ (T0 + 2*T1 + ctx->K[2*round+9]), 1);
+    R3 = ROR(R3 ^ (T0 + 2*T1 + ctx->K[2*round+9]), 1);
 void twofish_internal_decrypt(uint8_t PT[16], const uint8_t CT[16], tf_context_t *ctx) {
-  uint32_t T0, T1;
+    uint32_t T0, T1;
-  uint32_t R0, R1, R2, R3;
+    uint32_t R0, R1, R2, R3;
-
+
-  /* load/byteswap/whiten input */
+    // load/byteswap/whiten input
-  R3 = ctx->K[7] ^ le32toh(((uint32_t*)CT)[3]);
+    R3 = ctx->K[7] ^ le32toh(((uint32_t*)CT)[3]);
-  R2 = ctx->K[6] ^ le32toh(((uint32_t*)CT)[2]);
+    R2 = ctx->K[6] ^ le32toh(((uint32_t*)CT)[2]);
-  R1 = ctx->K[5] ^ le32toh(((uint32_t*)CT)[1]);
+    R1 = ctx->K[5] ^ le32toh(((uint32_t*)CT)[1]);
-  R0 = ctx->K[4] ^ le32toh(((uint32_t*)CT)[0]);
+    R0 = ctx->K[4] ^ le32toh(((uint32_t*)CT)[0]);
-
+
-  DEC_ROUND(R0, R1, R2, R3, 15);
+    DEC_ROUND(R0, R1, R2, R3, 15);
-  DEC_ROUND(R2, R3, R0, R1, 14);
+    DEC_ROUND(R2, R3, R0, R1, 14);
-  DEC_ROUND(R0, R1, R2, R3, 13);
+    DEC_ROUND(R0, R1, R2, R3, 13);
-  DEC_ROUND(R2, R3, R0, R1, 12);
+    DEC_ROUND(R2, R3, R0, R1, 12);
-  DEC_ROUND(R0, R1, R2, R3, 11);
+    DEC_ROUND(R0, R1, R2, R3, 11);
-  DEC_ROUND(R2, R3, R0, R1, 10);
+    DEC_ROUND(R2, R3, R0, R1, 10);
-  DEC_ROUND(R0, R1, R2, R3, 9);
+    DEC_ROUND(R0, R1, R2, R3,  9);
-  DEC_ROUND(R2, R3, R0, R1, 8);
+    DEC_ROUND(R2, R3, R0, R1,  8);
-  DEC_ROUND(R0, R1, R2, R3, 7);
+    DEC_ROUND(R0, R1, R2, R3,  7);
-  DEC_ROUND(R2, R3, R0, R1, 6);
+    DEC_ROUND(R2, R3, R0, R1,  6);
-  DEC_ROUND(R0, R1, R2, R3, 5);
+    DEC_ROUND(R0, R1, R2, R3,  5);
-  DEC_ROUND(R2, R3, R0, R1, 4);
+    DEC_ROUND(R2, R3, R0, R1,  4);
-  DEC_ROUND(R0, R1, R2, R3, 3);
+    DEC_ROUND(R0, R1, R2, R3,  3);
-  DEC_ROUND(R2, R3, R0, R1, 2);
+    DEC_ROUND(R2, R3, R0, R1,  2);
-  DEC_ROUND(R0, R1, R2, R3, 1);
+    DEC_ROUND(R0, R1, R2, R3,  1);
-  DEC_ROUND(R2, R3, R0, R1, 0);
+    DEC_ROUND(R2, R3, R0, R1,  0);
-
+
-  /* load/byteswap/whiten output */
+    // whiten/byteswap/store output
-  ((uint32_t*)PT)[3] = htole32(R1 ^ ctx->K[3]);
+    ((uint32_t*)PT)[3] = htole32(R1 ^ ctx->K[3]);
-  ((uint32_t*)PT)[2] = htole32(R0 ^ ctx->K[2]);
+    ((uint32_t*)PT)[2] = htole32(R0 ^ ctx->K[2]);
-  ((uint32_t*)PT)[1] = htole32(R3 ^ ctx->K[1]);
+    ((uint32_t*)PT)[1] = htole32(R3 ^ ctx->K[1]);
-  ((uint32_t*)PT)[0] = htole32(R2 ^ ctx->K[0]);
+    ((uint32_t*)PT)[0] = htole32(R2 ^ ctx->K[0]);
 }
 // -------------------------------------------------------------------------------------
-/* the key schedule routine */
+
 // the key schedule routine
 void keySched(const uint8_t M[], int N, uint32_t **S, uint32_t K[40], int *k) {
-  uint32_t Mo[4], Me[4];
+    uint32_t Mo[4], Me[4];
-  int i, j;
+    int i, j;
-  uint8_t vector[8];
+    uint8_t vector[8];
-  uint32_t A, B;
+    uint32_t A, B;
-
+
-  *k = (N + 63) / 64;
+    *k = (N + 63) / 64;
-  *S = (uint32_t*)malloc(sizeof(uint32_t) * (*k));
+    *S = (uint32_t*)malloc(sizeof(uint32_t) * (*k));
-
+
-  for(i = 0; i < *k; i++) {
+    for(i = 0; i < *k; i++) {
-    Me[i] = le32toh(((uint32_t*)M)[2*i]);
+        Me[i] = le32toh(((uint32_t*)M)[2*i]);
-    Mo[i] = le32toh(((uint32_t*)M)[2*i+1]);
+        Mo[i] = le32toh(((uint32_t*)M)[2*i+1]);
-  }
+    }
-
+
-  for(i = 0; i < *k; i++) {
+    for(i = 0; i < *k; i++) {
-    for(j = 0; j < 4; j++)
+        for(j = 0; j < 4; j++)
-      vector[j] = _b(Me[i], j);
+            vector[j] = _b(Me[i], j);
-    for(j = 0; j < 4; j++)
+        for(j = 0; j < 4; j++)
-      vector[j+4] = _b(Mo[i], j);
+            vector[j+4] = _b(Mo[i], j);
-    (*S)[(*k)-i-1] = RSMatrixMultiply(vector);
+        (*S)[(*k)-i-1] = RSMatrixMultiply(vector);
-  }
+    }
-  for(i = 0; i < 20; i++) {
+
-    A = h(2*i*RHO, Me, *k);
+    for(i = 0; i < 20; i++) {
-    B = ROL(h(2*i*RHO + RHO, Mo, *k), 8);
+        A = h(2*i*RHO, Me, *k);
-    K[2*i] = A+B;
+        B = ROL(h(2*i*RHO + RHO, Mo, *k), 8);
-    K[2*i+1] = ROL(A + 2*B, 9);
+        K[2*i] = A+B;
-  }
+        K[2*i+1] = ROL(A + 2*B, 9);
    }
 }
-// -------------------------------------------------------------------------------------
+
 // ----------------------------------------------------------------------------------------------------------------
 #define fix_xor(target, source) *(uint32_t*)&(target)[0] = *(uint32_t*)&(target)[0] ^ *(uint32_t*)&(source)[0]; *(uint32_t*)&(target)[4] = *(uint32_t*)&(target)[4] ^ *(uint32_t*)&(source)[4]; \
                                *(uint32_t*)&(target)[8] = *(uint32_t*)&(target)[8] ^ *(uint32_t*)&(source)[8]; *(uint32_t*)&(target)[12] = *(uint32_t*)&(target)[12] ^ *(uint32_t*)&(source)[12];
-// -------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------------------------------
-/** public API **/
+// public API
 int tf_ecb_decrypt (unsigned char *out, const unsigned char *in, tf_context_t *ctx) {
-  twofish_internal_decrypt(out, in, ctx);
+    twofish_internal_decrypt(out, in, ctx);
-  return TF_BLOCK_SIZE;
+
    return TF_BLOCK_SIZE;
 }
 // not used
 int tf_ecb_encrypt (unsigned char *out, const unsigned char *in, tf_context_t *ctx) {
-  memcpy (out, in, TF_BLOCK_SIZE);
+    memcpy(out, in, TF_BLOCK_SIZE);
-  twofish_internal_encrypt(out, ctx);
+    twofish_internal_encrypt(out, ctx);
-  return TF_BLOCK_SIZE;
+
    return TF_BLOCK_SIZE;
 }
 int tf_cbc_encrypt (unsigned char *out, const unsigned char *in, size_t in_len,
                    const unsigned char *iv, tf_context_t *ctx) {
-  uint8_t tmp[TF_BLOCK_SIZE];
+    uint8_t tmp[TF_BLOCK_SIZE];
-  size_t i;
+    size_t i;
-  size_t n;
+    size_t n;
    memcpy(tmp, iv, TF_BLOCK_SIZE);
-  memcpy(tmp, iv, TF_BLOCK_SIZE);
+    n = in_len / TF_BLOCK_SIZE;
    for(i = 0; i < n; i++) {
        fix_xor(tmp, &in[i * TF_BLOCK_SIZE]);
        twofish_internal_encrypt(tmp, ctx);
        memcpy(&out[i * TF_BLOCK_SIZE], tmp, TF_BLOCK_SIZE);
    }
-  n = in_len / TF_BLOCK_SIZE;
+    return n * TF_BLOCK_SIZE;
  for(i=0; i < n; i++) {
    fix_xor(tmp, &in[i * TF_BLOCK_SIZE]);
    twofish_internal_encrypt(tmp, ctx);
    memcpy(&out[i * TF_BLOCK_SIZE], tmp, TF_BLOCK_SIZE);
  }
  return n * TF_BLOCK_SIZE;
 }
 int tf_cbc_decrypt (unsigned char *out, const unsigned char *in, size_t in_len,
                    const unsigned char *iv, tf_context_t *ctx) {
-  int n;                       // number of blocks
+    int n;                       /* number of blocks */
-  int ret = (int)in_len & 15;  // remainder
+    int ret = (int)in_len & 15;  /* remainder        */
-
+
-  uint8_t ivec[TF_BLOCK_SIZE]; // the ivec/old handling might be optimized if we
+    uint8_t ivec[TF_BLOCK_SIZE]; /* the ivec/old handling might be optimized if we */
-  uint8_t old[TF_BLOCK_SIZE];  // can be sure that in != out
+    uint8_t old[TF_BLOCK_SIZE];  /* can be sure that in != out                     */
-
+
-  memcpy(ivec, iv, TF_BLOCK_SIZE);
+    memcpy(ivec, iv, TF_BLOCK_SIZE);
-
+
-  for(n = in_len / TF_BLOCK_SIZE; n > 2; n -=3) {
+    // 3 parallel rails of twofish decryption
    for(n = in_len / TF_BLOCK_SIZE; n > 2; n -=3) {
        memcpy(old, in + 2 * TF_BLOCK_SIZE, TF_BLOCK_SIZE);
        uint32_t T0, T1;
        uint32_t Q0, Q1, Q2, Q3, R0, R1, R2, R3, S0, S1, S2, S3;
        // load/byteswap/whiten input/iv
        Q3 = ctx->K[7] ^ le32toh(((uint32_t*)in)[3]);
        Q2 = ctx->K[6] ^ le32toh(((uint32_t*)in)[2]);
        Q1 = ctx->K[5] ^ le32toh(((uint32_t*)in)[1]);
        Q0 = ctx->K[4] ^ le32toh(((uint32_t*)in)[0]);
        R3 = ctx->K[7] ^ le32toh(((uint32_t*)in)[7]);
        R2 = ctx->K[6] ^ le32toh(((uint32_t*)in)[6]);
        R1 = ctx->K[5] ^ le32toh(((uint32_t*)in)[5]);
        R0 = ctx->K[4] ^ le32toh(((uint32_t*)in)[4]);
        S3 = ctx->K[7] ^ le32toh(((uint32_t*)in)[11]);
        S2 = ctx->K[6] ^ le32toh(((uint32_t*)in)[10]);
        S1 = ctx->K[5] ^ le32toh(((uint32_t*)in)[9]);
        S0 = ctx->K[4] ^ le32toh(((uint32_t*)in)[8]);
        DEC_ROUND(Q0, Q1, Q2, Q3, 15); DEC_ROUND(R0, R1, R2, R3, 15); DEC_ROUND(S0, S1, S2, S3, 15);
        DEC_ROUND(Q2, Q3, Q0, Q1, 14); DEC_ROUND(R2, R3, R0, R1, 14); DEC_ROUND(S2, S3, S0, S1, 14);
        DEC_ROUND(Q0, Q1, Q2, Q3, 13); DEC_ROUND(R0, R1, R2, R3, 13); DEC_ROUND(S0, S1, S2, S3, 13);
        DEC_ROUND(Q2, Q3, Q0, Q1, 12); DEC_ROUND(R2, R3, R0, R1, 12); DEC_ROUND(S2, S3, S0, S1, 12);
        DEC_ROUND(Q0, Q1, Q2, Q3, 11); DEC_ROUND(R0, R1, R2, R3, 11); DEC_ROUND(S0, S1, S2, S3, 11);
        DEC_ROUND(Q2, Q3, Q0, Q1, 10); DEC_ROUND(R2, R3, R0, R1, 10); DEC_ROUND(S2, S3, S0, S1, 10);
        DEC_ROUND(Q0, Q1, Q2, Q3,  9); DEC_ROUND(R0, R1, R2, R3,  9); DEC_ROUND(S0, S1, S2, S3,  9);
        DEC_ROUND(Q2, Q3, Q0, Q1,  8); DEC_ROUND(R2, R3, R0, R1,  8); DEC_ROUND(S2, S3, S0, S1,  8);
        DEC_ROUND(Q0, Q1, Q2, Q3,  7); DEC_ROUND(R0, R1, R2, R3,  7); DEC_ROUND(S0, S1, S2, S3,  7);
        DEC_ROUND(Q2, Q3, Q0, Q1,  6); DEC_ROUND(R2, R3, R0, R1,  6); DEC_ROUND(S2, S3, S0, S1,  6);
        DEC_ROUND(Q0, Q1, Q2, Q3,  5); DEC_ROUND(R0, R1, R2, R3,  5); DEC_ROUND(S0, S1, S2, S3,  5);
        DEC_ROUND(Q2, Q3, Q0, Q1,  4); DEC_ROUND(R2, R3, R0, R1,  4); DEC_ROUND(S2, S3, S0, S1,  4);
        DEC_ROUND(Q0, Q1, Q2, Q3,  3); DEC_ROUND(R0, R1, R2, R3,  3); DEC_ROUND(S0, S1, S2, S3,  3);
        DEC_ROUND(Q2, Q3, Q0, Q1,  2); DEC_ROUND(R2, R3, R0, R1,  2); DEC_ROUND(S2, S3, S0, S1,  2);
        DEC_ROUND(Q0, Q1, Q2, Q3,  1); DEC_ROUND(R0, R1, R2, R3,  1); DEC_ROUND(S0, S1, S2, S3,  1);
        DEC_ROUND(Q2, Q3, Q0, Q1,  0); DEC_ROUND(R2, R3, R0, R1,  0); DEC_ROUND(S2, S3, S0, S1,  0);
        // whiten/byteswap/store output/iv
        ((uint32_t*)out)[11] = htole32(S1 ^ ctx->K[3] ^ ((uint32_t*)in)[7]);
        ((uint32_t*)out)[10] = htole32(S0 ^ ctx->K[2] ^ ((uint32_t*)in)[6]);
        ((uint32_t*)out)[9]  = htole32(S3 ^ ctx->K[1] ^ ((uint32_t*)in)[5]);
        ((uint32_t*)out)[8]  = htole32(S2 ^ ctx->K[0] ^ ((uint32_t*)in)[4]);
        ((uint32_t*)out)[7]  = htole32(R1 ^ ctx->K[3] ^ ((uint32_t*)in)[3]);
        ((uint32_t*)out)[6]  = htole32(R0 ^ ctx->K[2] ^ ((uint32_t*)in)[2]);
        ((uint32_t*)out)[5]  = htole32(R3 ^ ctx->K[1] ^ ((uint32_t*)in)[1]);
        ((uint32_t*)out)[4]  = htole32(R2 ^ ctx->K[0] ^ ((uint32_t*)in)[0]);
        ((uint32_t*)out)[3]  = htole32(Q1 ^ ctx->K[3] ^ ((uint32_t*)ivec)[3]);
        ((uint32_t*)out)[2]  = htole32(Q0 ^ ctx->K[2] ^ ((uint32_t*)ivec)[2]);
        ((uint32_t*)out)[1]  = htole32(Q3 ^ ctx->K[1] ^ ((uint32_t*)ivec)[1]);
        ((uint32_t*)out)[0]  = htole32(Q2 ^ ctx->K[0] ^ ((uint32_t*)ivec)[0]);
        in += 3 * TF_BLOCK_SIZE; out += 3 * TF_BLOCK_SIZE;
        memcpy(ivec, old, TF_BLOCK_SIZE);
    }
-    memcpy(old, in + 2 * TF_BLOCK_SIZE, TF_BLOCK_SIZE);
+    // handle the two or fewer remaining blocks on a single rail
    for(; n != 0; n--) {
        uint32_t T0, T1;
        uint32_t Q0, Q1, Q2, Q3;
        memcpy(old, in, TF_BLOCK_SIZE);
        // load/byteswap/whiten input
        Q3 = ctx->K[7] ^ le32toh(((uint32_t*)in)[3]);
        Q2 = ctx->K[6] ^ le32toh(((uint32_t*)in)[2]);
        Q1 = ctx->K[5] ^ le32toh(((uint32_t*)in)[1]);
        Q0 = ctx->K[4] ^ le32toh(((uint32_t*)in)[0]);
        DEC_ROUND(Q0, Q1, Q2, Q3, 15);
        DEC_ROUND(Q2, Q3, Q0, Q1, 14);
        DEC_ROUND(Q0, Q1, Q2, Q3, 13);
        DEC_ROUND(Q2, Q3, Q0, Q1, 12);
        DEC_ROUND(Q0, Q1, Q2, Q3, 11);
        DEC_ROUND(Q2, Q3, Q0, Q1, 10);
        DEC_ROUND(Q0, Q1, Q2, Q3,  9);
        DEC_ROUND(Q2, Q3, Q0, Q1,  8);
        DEC_ROUND(Q0, Q1, Q2, Q3,  7);
        DEC_ROUND(Q2, Q3, Q0, Q1,  6);
        DEC_ROUND(Q0, Q1, Q2, Q3,  5);
        DEC_ROUND(Q2, Q3, Q0, Q1,  4);
        DEC_ROUND(Q0, Q1, Q2, Q3,  3);
        DEC_ROUND(Q2, Q3, Q0, Q1,  2);
        DEC_ROUND(Q0, Q1, Q2, Q3,  1);
        DEC_ROUND(Q2, Q3, Q0, Q1,  0);
        // whiten/byteswap/store output/iv
        ((uint32_t*)out)[3] = htole32(Q1 ^ ctx->K[3] ^ ((uint32_t*)ivec)[3]);
        ((uint32_t*)out)[2] = htole32(Q0 ^ ctx->K[2] ^ ((uint32_t*)ivec)[2]);
        ((uint32_t*)out)[1] = htole32(Q3 ^ ctx->K[1] ^ ((uint32_t*)ivec)[1]);
        ((uint32_t*)out)[0] = htole32(Q2 ^ ctx->K[0] ^ ((uint32_t*)ivec)[0]);
        in += TF_BLOCK_SIZE; out+= TF_BLOCK_SIZE;
        memcpy(ivec, old, TF_BLOCK_SIZE);
    }
-    uint32_t T0, T1;
+    return n * TF_BLOCK_SIZE;
    uint32_t Q0, Q1, Q2, Q3, R0, R1, R2, R3, S0, S1, S2, S3;
    /* load/byteswap/whiten input/iv */
    Q3 = ctx->K[7] ^ le32toh(((uint32_t*)in)[3]);
    Q2 = ctx->K[6] ^ le32toh(((uint32_t*)in)[2]);
    Q1 = ctx->K[5] ^ le32toh(((uint32_t*)in)[1]);
    Q0 = ctx->K[4] ^ le32toh(((uint32_t*)in)[0]);
    R3 = ctx->K[7] ^ le32toh(((uint32_t*)in)[7]);
    R2 = ctx->K[6] ^ le32toh(((uint32_t*)in)[6]);
    R1 = ctx->K[5] ^ le32toh(((uint32_t*)in)[5]);
    R0 = ctx->K[4] ^ le32toh(((uint32_t*)in)[4]);
    S3 = ctx->K[7] ^ le32toh(((uint32_t*)in)[11]);
    S2 = ctx->K[6] ^ le32toh(((uint32_t*)in)[10]);
    S1 = ctx->K[5] ^ le32toh(((uint32_t*)in)[9]);
    S0 = ctx->K[4] ^ le32toh(((uint32_t*)in)[8]);
    DEC_ROUND(Q0, Q1, Q2, Q3, 15); DEC_ROUND(R0, R1, R2, R3, 15); DEC_ROUND(S0, S1, S2, S3, 15);
    DEC_ROUND(Q2, Q3, Q0, Q1, 14); DEC_ROUND(R2, R3, R0, R1, 14); DEC_ROUND(S2, S3, S0, S1, 14);
    DEC_ROUND(Q0, Q1, Q2, Q3, 13); DEC_ROUND(R0, R1, R2, R3, 13); DEC_ROUND(S0, S1, S2, S3, 13);
    DEC_ROUND(Q2, Q3, Q0, Q1, 12); DEC_ROUND(R2, R3, R0, R1, 12); DEC_ROUND(S2, S3, S0, S1, 12);
    DEC_ROUND(Q0, Q1, Q2, Q3, 11); DEC_ROUND(R0, R1, R2, R3, 11); DEC_ROUND(S0, S1, S2, S3, 11);
    DEC_ROUND(Q2, Q3, Q0, Q1, 10); DEC_ROUND(R2, R3, R0, R1, 10); DEC_ROUND(S2, S3, S0, S1, 10);
    DEC_ROUND(Q0, Q1, Q2, Q3,  9); DEC_ROUND(R0, R1, R2, R3,  9); DEC_ROUND(S0, S1, S2, S3,  9);
    DEC_ROUND(Q2, Q3, Q0, Q1,  8); DEC_ROUND(R2, R3, R0, R1,  8); DEC_ROUND(S2, S3, S0, S1,  8);
    DEC_ROUND(Q0, Q1, Q2, Q3,  7); DEC_ROUND(R0, R1, R2, R3,  7); DEC_ROUND(S0, S1, S2, S3,  7);
    DEC_ROUND(Q2, Q3, Q0, Q1,  6); DEC_ROUND(R2, R3, R0, R1,  6); DEC_ROUND(S2, S3, S0, S1,  6);
    DEC_ROUND(Q0, Q1, Q2, Q3,  5); DEC_ROUND(R0, R1, R2, R3,  5); DEC_ROUND(S0, S1, S2, S3,  5);
    DEC_ROUND(Q2, Q3, Q0, Q1,  4); DEC_ROUND(R2, R3, R0, R1,  4); DEC_ROUND(S2, S3, S0, S1,  4);
    DEC_ROUND(Q0, Q1, Q2, Q3,  3); DEC_ROUND(R0, R1, R2, R3,  3); DEC_ROUND(S0, S1, S2, S3,  3);
    DEC_ROUND(Q2, Q3, Q0, Q1,  2); DEC_ROUND(R2, R3, R0, R1,  2); DEC_ROUND(S2, S3, S0, S1,  2);
    DEC_ROUND(Q0, Q1, Q2, Q3,  1); DEC_ROUND(R0, R1, R2, R3,  1); DEC_ROUND(S0, S1, S2, S3,  1);
    DEC_ROUND(Q2, Q3, Q0, Q1,  0); DEC_ROUND(R2, R3, R0, R1,  0); DEC_ROUND(S2, S3, S0, S1,  0);
    /* load/byteswap/whiten output/iv */
    ((uint32_t*)out)[11] = htole32(S1 ^ ctx->K[3] ^ ((uint32_t*)in)[7]);
    ((uint32_t*)out)[10] = htole32(S0 ^ ctx->K[2] ^ ((uint32_t*)in)[6]);
    ((uint32_t*)out)[9]  = htole32(S3 ^ ctx->K[1] ^ ((uint32_t*)in)[5]);
    ((uint32_t*)out)[8]  = htole32(S2 ^ ctx->K[0] ^ ((uint32_t*)in)[4]);
    ((uint32_t*)out)[7]  = htole32(R1 ^ ctx->K[3] ^ ((uint32_t*)in)[3]);
    ((uint32_t*)out)[6]  = htole32(R0 ^ ctx->K[2] ^ ((uint32_t*)in)[2]);
    ((uint32_t*)out)[5]  = htole32(R3 ^ ctx->K[1] ^ ((uint32_t*)in)[1]);
    ((uint32_t*)out)[4]  = htole32(R2 ^ ctx->K[0] ^ ((uint32_t*)in)[0]);
    ((uint32_t*)out)[3]  = htole32(Q1 ^ ctx->K[3] ^ ((uint32_t*)ivec)[3]);
    ((uint32_t*)out)[2]  = htole32(Q0 ^ ctx->K[2] ^ ((uint32_t*)ivec)[2]);
    ((uint32_t*)out)[1]  = htole32(Q3 ^ ctx->K[1] ^ ((uint32_t*)ivec)[1]);
    ((uint32_t*)out)[0]  = htole32(Q2 ^ ctx->K[0] ^ ((uint32_t*)ivec)[0]);
    in += 3 * TF_BLOCK_SIZE; out += 3 * TF_BLOCK_SIZE;
    memcpy(ivec, old, TF_BLOCK_SIZE);
  }
  for(; n != 0; n--) {
    uint32_t T0, T1;
    uint32_t Q0, Q1, Q2, Q3;
    memcpy (old, in, TF_BLOCK_SIZE);
    /* load/byteswap/whiten input */
    Q3 = ctx->K[7] ^ le32toh(((uint32_t*)in)[3]);
    Q2 = ctx->K[6] ^ le32toh(((uint32_t*)in)[2]);
    Q1 = ctx->K[5] ^ le32toh(((uint32_t*)in)[1]);
    Q0 = ctx->K[4] ^ le32toh(((uint32_t*)in)[0]);
    DEC_ROUND(Q0, Q1, Q2, Q3, 15);
    DEC_ROUND(Q2, Q3, Q0, Q1, 14);
    DEC_ROUND(Q0, Q1, Q2, Q3, 13);
    DEC_ROUND(Q2, Q3, Q0, Q1, 12);
    DEC_ROUND(Q0, Q1, Q2, Q3, 11);
    DEC_ROUND(Q2, Q3, Q0, Q1, 10);
    DEC_ROUND(Q0, Q1, Q2, Q3,  9);
    DEC_ROUND(Q2, Q3, Q0, Q1,  8);
    DEC_ROUND(Q0, Q1, Q2, Q3,  7);
    DEC_ROUND(Q2, Q3, Q0, Q1,  6);
    DEC_ROUND(Q0, Q1, Q2, Q3,  5);
    DEC_ROUND(Q2, Q3, Q0, Q1,  4);
    DEC_ROUND(Q0, Q1, Q2, Q3,  3);
    DEC_ROUND(Q2, Q3, Q0, Q1,  2);
    DEC_ROUND(Q0, Q1, Q2, Q3,  1);
    DEC_ROUND(Q2, Q3, Q0, Q1,  0);
    /* load/byteswap/whiten output/iv */
    ((uint32_t*)out)[3] = htole32(Q1 ^ ctx->K[3] ^ ((uint32_t*)ivec)[3]);
    ((uint32_t*)out)[2] = htole32(Q0 ^ ctx->K[2] ^ ((uint32_t*)ivec)[2]);
    ((uint32_t*)out)[1] = htole32(Q3 ^ ctx->K[1] ^ ((uint32_t*)ivec)[1]);
    ((uint32_t*)out)[0] = htole32(Q2 ^ ctx->K[0] ^ ((uint32_t*)ivec)[0]);
    in += TF_BLOCK_SIZE; out+= TF_BLOCK_SIZE;
    memcpy (ivec, old, TF_BLOCK_SIZE);
  }
  return n * TF_BLOCK_SIZE;
 }
-/**
+
- * By definition twofish can only accept key up to 256 bit
+// by definition twofish can only accept keys of up to 256 bits;
- * we wont do any checking here and will assume user already
+// we won't do any checking here and will assume the user already
- * know about it. Twofish is undefined for key larger than 256 bit
+// knows about it. twofish is undefined for keys larger than 256 bits
 */
 int tf_init (const unsigned char *key, size_t key_size, tf_context_t **ctx) {
-  int k;
+    int k;
-  uint32_t *S;
+    uint32_t *S;
    *ctx = calloc(1, sizeof(tf_context_t));
    if(!(*ctx)) {
        return -1;
    }
-  *ctx = calloc(1, sizeof(tf_context_t));
+    (*ctx)->N = key_size;
-  if(!(*ctx)) {
+    keySched(key, key_size, &S, (*ctx)->K, &k);
-    return -1;
+    fullKey(S, k, (*ctx)->QF);
-  }
+    free(S); /* allocated in keySched(...) */
  (*ctx)->N = key_size;
  keySched(key, key_size, &S, (*ctx)->K, &k);
  fullKey(S, k, (*ctx)->QF);
  free(S);    // allocated in keySched(...)
-  return 0;
+    return 0;
 }
 int tf_deinit (tf_context_t *ctx) {
-  if (ctx) free (ctx);
+    if(ctx) free(ctx);
-  return 0;
+    return 0;
 }