Botan  1.11.15
src/lib/block/threefish_avx2/threefish_avx2.cpp
Go to the documentation of this file.
00001 /*
00002 * Threefish-512 using AVX2
00003 * (C) 2013 Jack Lloyd
00004 *
00005 * Botan is released under the Simplified BSD License (see license.txt)
00006 */
00007 
00008 #include <botan/internal/block_utils.h>
00009 #include <botan/threefish_avx2.h>
00010 #include <botan/cpuid.h>
00011 #include <immintrin.h>
00012 
00013 namespace Botan {
00014 
// Presumably registers Threefish_512_AVX2 as the "avx2" provider of the
// "Threefish-512" block cipher, conditional on CPUID::has_avx2().
// NOTE(review): the trailing 64 looks like a provider preference/priority value;
// confirm against the definition of BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF.
00015 BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF(CPUID::has_avx2(), Threefish_512_AVX2, "Threefish-512", "avx2", 64);
00016 
00017 namespace {
00018 
/*
* Split eight consecutive qwords held in (X0, X1) into even- and odd-indexed
* halves, in place:
*   (w0,w1,w2,w3),(w4,w5,w6,w7) -> (w0,w2,w4,w6),(w1,w3,w5,w7)
*/
inline void interleave_epi64(__m256i& X0, __m256i& X1)
   {
   // High qword of each 128-bit half: (w1,w5,w3,w7).
   // Taken first because X0 is overwritten on the next line.
   const __m256i odd_words = _mm256_unpackhi_epi64(X0, X1);

   // Low qword of each 128-bit half is (w0,w4,w2,w6); swapping the two
   // middle lanes puts the words in ascending order.
   X0 = _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(X0, X1), _MM_SHUFFLE(3,1,2,0));
   X1 = _mm256_permute4x64_epi64(odd_words, _MM_SHUFFLE(3,1,2,0));
   }
00030 
/*
* Inverse of interleave_epi64: merge even/odd word vectors back into
* consecutive order, in place:
*   (w0,w2,w4,w6),(w1,w3,w5,w7) -> (w0,w1,w2,w3),(w4,w5,w6,w7)
*/
inline void deinterleave_epi64(__m256i& X0, __m256i& X1)
   {
   // Swap the two middle lanes of each vector so that the per-128-bit
   // unpack below pairs neighbouring words.
   X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(3,1,2,0));
   X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(3,1,2,0));

   // Both unpacks read the permuted X0 and X1, so keep the low result in
   // a temporary until the high one has been computed.
   const __m256i first_four = _mm256_unpacklo_epi64(X0, X1);
   X1 = _mm256_unpackhi_epi64(X0, X1);
   X0 = first_four;
   }
00039 
00040 }
00041 
00042 void Threefish_512_AVX2::encrypt_n(const byte in[], byte out[], size_t blocks) const
00043    {
00044    const u64bit* K = &get_K()[0];
00045    const u64bit* T_64 = &get_T()[0];
00046 
00047    const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
00048    const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
00049    const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
00050    const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
00051    const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
00052    const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
00053    const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
00054    const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);
00055 
00056 #define THREEFISH_ROUND(X0, X1, SHL)                                                \
00057    do {                                                                             \
00058       const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL);            \
00059       X0 = _mm256_add_epi64(X0, X1);                                                \
00060       X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
00061       X1 = _mm256_xor_si256(X1, X0);                                                \
00062       X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));                   \
00063       X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
00064    } while(0)
00065 
00066 #define THREEFISH_ROUND_2(X0, X1, X2, X3, SHL)                           \
00067    do {                                                                             \
00068       const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL);            \
00069       X0 = _mm256_add_epi64(X0, X1);                                                \
00070       X2 = _mm256_add_epi64(X2, X3);                                                \
00071       X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
00072       X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
00073       X1 = _mm256_xor_si256(X1, X0);                                                \
00074       X3 = _mm256_xor_si256(X3, X2);                                                \
00075       X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));                   \
00076       X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1));                   \
00077       X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
00078       X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0));                   \
00079    } while(0)
00080 
00081 #define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I)                        \
00082    do {                                                                          \
00083       const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
00084       const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
00085       X0 = _mm256_add_epi64(X0, K0);                                             \
00086       X1 = _mm256_add_epi64(X1, K1);                                             \
00087       X1 = _mm256_add_epi64(X1, R);                                              \
00088       X0 = _mm256_add_epi64(X0, T0);                                             \
00089       X1 = _mm256_add_epi64(X1, T1);                                             \
00090       R = _mm256_add_epi64(R, ONE);                                              \
00091    } while(0)
00092 
00093 #define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I)              \
00094    do {                                                                          \
00095       const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
00096       __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
00097       X0 = _mm256_add_epi64(X0, K0);                                             \
00098       X2 = _mm256_add_epi64(X2, K0);                                             \
00099       X1 = _mm256_add_epi64(X1, K1);                                             \
00100       X3 = _mm256_add_epi64(X3, K1);                                             \
00101       T1 = _mm256_add_epi64(T1, R);                                              \
00102       X0 = _mm256_add_epi64(X0, T0);                                             \
00103       X2 = _mm256_add_epi64(X2, T0);                                             \
00104       X1 = _mm256_add_epi64(X1, T1);                                             \
00105       X3 = _mm256_add_epi64(X3, T1);                                             \
00106       R = _mm256_add_epi64(R, ONE);                                              \
00107    } while(0)
00108 
00109 #define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2)        \
00110    do {                                                        \
00111       THREEFISH_ROUND(X0, X1, ROTATE_1);                       \
00112       THREEFISH_ROUND(X0, X1, ROTATE_2);                       \
00113       THREEFISH_ROUND(X0, X1, ROTATE_3);                       \
00114       THREEFISH_ROUND(X0, X1, ROTATE_4);                       \
00115       THREEFISH_INJECT_KEY(X0, X1, R, K1, K2, T0, T1);         \
00116                                                                \
00117       THREEFISH_ROUND(X0, X1, ROTATE_5);                       \
00118       THREEFISH_ROUND(X0, X1, ROTATE_6);                       \
00119       THREEFISH_ROUND(X0, X1, ROTATE_7);                       \
00120       THREEFISH_ROUND(X0, X1, ROTATE_8);                       \
00121       THREEFISH_INJECT_KEY(X0, X1, R, K2, K3, T2, T0);         \
00122    } while(0)
00123 
00124 #define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2) \
00125    do {                                                                  \
00126       THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1);                       \
00127       THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2);                       \
00128       THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3);                       \
00129       THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4);                       \
00130       THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K1, K2, T0, T1);         \
00131                                                                          \
00132       THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5);                       \
00133       THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6);                       \
00134       THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7);                       \
00135       THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8);                       \
00136       THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K2, K3, T2, T0);         \
00137    } while(0)
00138 
00139    /*
00140    v1.0 key schedule: 9 ymm registers (only need 2 or 3)
00141    (0,1,2,3),(4,5,6,7) [8]
00142    then mutating with vpermq
00143    */
00144    const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
00145    const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
00146    const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
00147    const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
00148    const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
00149    const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
00150    const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
00151    const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
00152    const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
00153 
00154    const __m256i ONE = _mm256_set_epi64x(1, 0, 0, 0);
00155 
00156    const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
00157    __m256i* out_mm = reinterpret_cast<__m256i*>(out);
00158 
00159    while(blocks >= 2)
00160       {
00161       __m256i X0 = _mm256_loadu_si256(in_mm++);
00162       __m256i X1 = _mm256_loadu_si256(in_mm++);
00163       __m256i X2 = _mm256_loadu_si256(in_mm++);
00164       __m256i X3 = _mm256_loadu_si256(in_mm++);
00165 
00166       const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
00167 
00168       __m256i R = _mm256_set_epi64x(0, 0, 0, 0);
00169 
00170       interleave_epi64(X0, X1);
00171       interleave_epi64(X2, X3);
00172 
00173       THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, 2, 3);
00174 
00175       THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K1,K2,K3, 1, 2, 3);
00176       THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K3,K4,K5, 2, 3, 1);
00177       THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K5,K6,K7, 3, 1, 2);
00178 
00179       THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K7,K8,K0, 1, 2, 3);
00180       THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0,K1,K2, 2, 3, 1);
00181       THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K2,K3,K4, 3, 1, 2);
00182 
00183       THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K4,K5,K6, 1, 2, 3);
00184       THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K6,K7,K8, 2, 3, 1);
00185       THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K8,K0,K1, 3, 1, 2);
00186 
00187       deinterleave_epi64(X0, X1);
00188       deinterleave_epi64(X2, X3);
00189 
00190       _mm256_storeu_si256(out_mm++, X0);
00191       _mm256_storeu_si256(out_mm++, X1);
00192       _mm256_storeu_si256(out_mm++, X2);
00193       _mm256_storeu_si256(out_mm++, X3);
00194 
00195       blocks -= 2;
00196       }
00197 
00198    for(size_t i = 0; i != blocks; ++i)
00199       {
00200       __m256i X0 = _mm256_loadu_si256(in_mm++);
00201       __m256i X1 = _mm256_loadu_si256(in_mm++);
00202 
00203       const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
00204 
00205       __m256i R = _mm256_set_epi64x(0, 0, 0, 0);
00206 
00207       interleave_epi64(X0, X1);
00208 
00209       THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, 2, 3);
00210 
00211       THREEFISH_ENC_8_ROUNDS(X0, X1, R, K1,K2,K3, 1, 2, 3);
00212       THREEFISH_ENC_8_ROUNDS(X0, X1, R, K3,K4,K5, 2, 3, 1);
00213       THREEFISH_ENC_8_ROUNDS(X0, X1, R, K5,K6,K7, 3, 1, 2);
00214 
00215       THREEFISH_ENC_8_ROUNDS(X0, X1, R, K7,K8,K0, 1, 2, 3);
00216       THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0,K1,K2, 2, 3, 1);
00217       THREEFISH_ENC_8_ROUNDS(X0, X1, R, K2,K3,K4, 3, 1, 2);
00218 
00219       THREEFISH_ENC_8_ROUNDS(X0, X1, R, K4,K5,K6, 1, 2, 3);
00220       THREEFISH_ENC_8_ROUNDS(X0, X1, R, K6,K7,K8, 2, 3, 1);
00221       THREEFISH_ENC_8_ROUNDS(X0, X1, R, K8,K0,K1, 3, 1, 2);
00222 
00223       deinterleave_epi64(X0, X1);
00224 
00225       _mm256_storeu_si256(out_mm++, X0);
00226       _mm256_storeu_si256(out_mm++, X1);
00227       }
00228 
00229 #undef THREEFISH_ENC_8_ROUNDS
00230 #undef THREEFISH_ROUND
00231 #undef THREEFISH_INJECT_KEY
00232 #undef THREEFISH_ENC_2_8_ROUNDS
00233 #undef THREEFISH_ROUND_2
00234 #undef THREEFISH_INJECT_KEY_2
00235    }
00236 
00237 void Threefish_512_AVX2::decrypt_n(const byte in[], byte out[], size_t blocks) const
00238    {
00239    const u64bit* K = &get_K()[0];
00240    const u64bit* T_64 = &get_T()[0];
00241 
00242    const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
00243    const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
00244    const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
00245    const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
00246    const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
00247    const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
00248    const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
00249    const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);
00250 
00251 #define THREEFISH_ROUND(X0, X1, SHR)                                                \
00252    do {                                                                             \
00253       const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR);            \
00254       X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3));                   \
00255       X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
00256       X1 = _mm256_xor_si256(X1, X0);                                                \
00257       X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
00258       X0 = _mm256_sub_epi64(X0, X1);                                                \
00259    } while(0)
00260 
00261 #define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I)                \
00262    do {                                                                          \
00263       const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
00264       const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
00265       X0 = _mm256_sub_epi64(X0, K0);                                             \
00266       X1 = _mm256_sub_epi64(X1, K1);                                             \
00267       X1 = _mm256_sub_epi64(X1, R);                                              \
00268       R = _mm256_sub_epi64(R, ONE);                                              \
00269       X0 = _mm256_sub_epi64(X0, T0);                                             \
00270       X1 = _mm256_sub_epi64(X1, T1);                                             \
00271    } while(0)
00272 
00273 #define THREEFISH_DEC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2)   \
00274    do {                                                      \
00275       THREEFISH_INJECT_KEY(X0, X1, R, K2, K3, T2, T0);       \
00276       THREEFISH_ROUND(X0, X1, ROTATE_8);                     \
00277       THREEFISH_ROUND(X0, X1, ROTATE_7);                     \
00278       THREEFISH_ROUND(X0, X1, ROTATE_6);                     \
00279       THREEFISH_ROUND(X0, X1, ROTATE_5);                     \
00280                                                              \
00281       THREEFISH_INJECT_KEY(X0, X1, R, K1, K2, T0, T1);       \
00282       THREEFISH_ROUND(X0, X1, ROTATE_4);                     \
00283       THREEFISH_ROUND(X0, X1, ROTATE_3);                     \
00284       THREEFISH_ROUND(X0, X1, ROTATE_2);                     \
00285       THREEFISH_ROUND(X0, X1, ROTATE_1);                     \
00286    } while(0)
00287 
00288    /*
00289    v1.0 key schedule: 9 ymm registers (only need 2 or 3)
00290    (0,1,2,3),(4,5,6,7) [8]
00291    then mutating with vpermq
00292    */
00293    const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
00294    const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
00295    const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
00296    const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
00297    const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
00298    const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
00299    const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
00300    const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
00301    const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
00302 
00303    const __m256i ONE = _mm256_set_epi64x(1, 0, 0, 0);
00304 
00305    const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
00306    __m256i* out_mm = reinterpret_cast<__m256i*>(out);
00307 
00308    for(size_t i = 0; i != blocks; ++i)
00309       {
00310       __m256i X0 = _mm256_loadu_si256(in_mm++);
00311       __m256i X1 = _mm256_loadu_si256(in_mm++);
00312 
00313       const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
00314 
00315       __m256i R = _mm256_set_epi64x(18, 0, 0, 0);
00316 
00317       interleave_epi64(X0, X1);
00318 
00319       THREEFISH_DEC_8_ROUNDS(X0, X1, R, K8,K0,K1, 3, 1, 2);
00320       THREEFISH_DEC_8_ROUNDS(X0, X1, R, K6,K7,K8, 2, 3, 1);
00321       THREEFISH_DEC_8_ROUNDS(X0, X1, R, K4,K5,K6, 1, 2, 3);
00322       THREEFISH_DEC_8_ROUNDS(X0, X1, R, K2,K3,K4, 3, 1, 2);
00323       THREEFISH_DEC_8_ROUNDS(X0, X1, R, K0,K1,K2, 2, 3, 1);
00324       THREEFISH_DEC_8_ROUNDS(X0, X1, R, K7,K8,K0, 1, 2, 3);
00325       THREEFISH_DEC_8_ROUNDS(X0, X1, R, K5,K6,K7, 3, 1, 2);
00326       THREEFISH_DEC_8_ROUNDS(X0, X1, R, K3,K4,K5, 2, 3, 1);
00327       THREEFISH_DEC_8_ROUNDS(X0, X1, R, K1,K2,K3, 1, 2, 3);
00328 
00329       THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, 2, 3);
00330 
00331       deinterleave_epi64(X0, X1);
00332 
00333       _mm256_storeu_si256(out_mm++, X0);
00334       _mm256_storeu_si256(out_mm++, X1);
00335       }
00336 
00337 #undef THREEFISH_DEC_8_ROUNDS
00338 #undef THREEFISH_ROUND
00339 #undef THREEFISH_INJECT_KEY
00340 #undef THREEFISH_DEC_2_8_ROUNDS
00341 #undef THREEFISH_ROUND_2
00342 #undef THREEFISH_INJECT_KEY_2
00343    }
00344 
00345 }