// Botan 1.11.15
00001 /* 00002 * Threefish-512 using AVX2 00003 * (C) 2013 Jack Lloyd 00004 * 00005 * Botan is released under the Simplified BSD License (see license.txt) 00006 */ 00007 00008 #include <botan/internal/block_utils.h> 00009 #include <botan/threefish_avx2.h> 00010 #include <botan/cpuid.h> 00011 #include <immintrin.h> 00012 00013 namespace Botan { 00014 00015 BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF(CPUID::has_avx2(), Threefish_512_AVX2, "Threefish-512", "avx2", 64); 00016 00017 namespace { 00018 00019 inline void interleave_epi64(__m256i& X0, __m256i& X1) 00020 { 00021 // interleave X0 and X1 qwords 00022 // (X0,X1,X2,X3),(X4,X5,X6,X7) -> (X0,X2,X4,X6),(X1,X3,X5,X7) 00023 00024 const __m256i T0 = _mm256_unpacklo_epi64(X0, X1); 00025 const __m256i T1 = _mm256_unpackhi_epi64(X0, X1); 00026 00027 X0 = _mm256_permute4x64_epi64(T0, _MM_SHUFFLE(3,1,2,0)); 00028 X1 = _mm256_permute4x64_epi64(T1, _MM_SHUFFLE(3,1,2,0)); 00029 } 00030 00031 inline void deinterleave_epi64(__m256i& X0, __m256i& X1) 00032 { 00033 const __m256i T0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(3,1,2,0)); 00034 const __m256i T1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(3,1,2,0)); 00035 00036 X0 = _mm256_unpacklo_epi64(T0, T1); 00037 X1 = _mm256_unpackhi_epi64(T0, T1); 00038 } 00039 00040 } 00041 00042 void Threefish_512_AVX2::encrypt_n(const byte in[], byte out[], size_t blocks) const 00043 { 00044 const u64bit* K = &get_K()[0]; 00045 const u64bit* T_64 = &get_T()[0]; 00046 00047 const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46); 00048 const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33); 00049 const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17); 00050 const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44); 00051 const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39); 00052 const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13); 00053 const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25); 00054 const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8); 00055 00056 #define 
THREEFISH_ROUND(X0, X1, SHL) \ 00057 do { \ 00058 const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \ 00059 X0 = _mm256_add_epi64(X0, X1); \ 00060 X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \ 00061 X1 = _mm256_xor_si256(X1, X0); \ 00062 X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \ 00063 X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \ 00064 } while(0) 00065 00066 #define THREEFISH_ROUND_2(X0, X1, X2, X3, SHL) \ 00067 do { \ 00068 const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \ 00069 X0 = _mm256_add_epi64(X0, X1); \ 00070 X2 = _mm256_add_epi64(X2, X3); \ 00071 X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \ 00072 X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \ 00073 X1 = _mm256_xor_si256(X1, X0); \ 00074 X3 = _mm256_xor_si256(X3, X2); \ 00075 X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \ 00076 X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1)); \ 00077 X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \ 00078 X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0)); \ 00079 } while(0) 00080 00081 #define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I) \ 00082 do { \ 00083 const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \ 00084 const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \ 00085 X0 = _mm256_add_epi64(X0, K0); \ 00086 X1 = _mm256_add_epi64(X1, K1); \ 00087 X1 = _mm256_add_epi64(X1, R); \ 00088 X0 = _mm256_add_epi64(X0, T0); \ 00089 X1 = _mm256_add_epi64(X1, T1); \ 00090 R = _mm256_add_epi64(R, ONE); \ 00091 } while(0) 00092 00093 #define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I) \ 00094 do { \ 00095 const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \ 00096 __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \ 00097 X0 = _mm256_add_epi64(X0, K0); \ 
00098 X2 = _mm256_add_epi64(X2, K0); \ 00099 X1 = _mm256_add_epi64(X1, K1); \ 00100 X3 = _mm256_add_epi64(X3, K1); \ 00101 T1 = _mm256_add_epi64(T1, R); \ 00102 X0 = _mm256_add_epi64(X0, T0); \ 00103 X2 = _mm256_add_epi64(X2, T0); \ 00104 X1 = _mm256_add_epi64(X1, T1); \ 00105 X3 = _mm256_add_epi64(X3, T1); \ 00106 R = _mm256_add_epi64(R, ONE); \ 00107 } while(0) 00108 00109 #define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2) \ 00110 do { \ 00111 THREEFISH_ROUND(X0, X1, ROTATE_1); \ 00112 THREEFISH_ROUND(X0, X1, ROTATE_2); \ 00113 THREEFISH_ROUND(X0, X1, ROTATE_3); \ 00114 THREEFISH_ROUND(X0, X1, ROTATE_4); \ 00115 THREEFISH_INJECT_KEY(X0, X1, R, K1, K2, T0, T1); \ 00116 \ 00117 THREEFISH_ROUND(X0, X1, ROTATE_5); \ 00118 THREEFISH_ROUND(X0, X1, ROTATE_6); \ 00119 THREEFISH_ROUND(X0, X1, ROTATE_7); \ 00120 THREEFISH_ROUND(X0, X1, ROTATE_8); \ 00121 THREEFISH_INJECT_KEY(X0, X1, R, K2, K3, T2, T0); \ 00122 } while(0) 00123 00124 #define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2) \ 00125 do { \ 00126 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1); \ 00127 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2); \ 00128 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3); \ 00129 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4); \ 00130 THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K1, K2, T0, T1); \ 00131 \ 00132 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5); \ 00133 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6); \ 00134 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7); \ 00135 THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8); \ 00136 THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K2, K3, T2, T0); \ 00137 } while(0) 00138 00139 /* 00140 v1.0 key schedule: 9 ymm registers (only need 2 or 3) 00141 (0,1,2,3),(4,5,6,7) [8] 00142 then mutating with vpermq 00143 */ 00144 const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]); 00145 const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]); 00146 const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]); 00147 const __m256i 
K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]); 00148 const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]); 00149 const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]); 00150 const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]); 00151 const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]); 00152 const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]); 00153 00154 const __m256i ONE = _mm256_set_epi64x(1, 0, 0, 0); 00155 00156 const __m256i* in_mm = reinterpret_cast<const __m256i*>(in); 00157 __m256i* out_mm = reinterpret_cast<__m256i*>(out); 00158 00159 while(blocks >= 2) 00160 { 00161 __m256i X0 = _mm256_loadu_si256(in_mm++); 00162 __m256i X1 = _mm256_loadu_si256(in_mm++); 00163 __m256i X2 = _mm256_loadu_si256(in_mm++); 00164 __m256i X3 = _mm256_loadu_si256(in_mm++); 00165 00166 const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0); 00167 00168 __m256i R = _mm256_set_epi64x(0, 0, 0, 0); 00169 00170 interleave_epi64(X0, X1); 00171 interleave_epi64(X2, X3); 00172 00173 THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, 2, 3); 00174 00175 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K1,K2,K3, 1, 2, 3); 00176 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K3,K4,K5, 2, 3, 1); 00177 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K5,K6,K7, 3, 1, 2); 00178 00179 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K7,K8,K0, 1, 2, 3); 00180 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0,K1,K2, 2, 3, 1); 00181 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K2,K3,K4, 3, 1, 2); 00182 00183 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K4,K5,K6, 1, 2, 3); 00184 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K6,K7,K8, 2, 3, 1); 00185 THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K8,K0,K1, 3, 1, 2); 00186 00187 deinterleave_epi64(X0, X1); 00188 deinterleave_epi64(X2, X3); 00189 00190 _mm256_storeu_si256(out_mm++, X0); 00191 _mm256_storeu_si256(out_mm++, X1); 00192 _mm256_storeu_si256(out_mm++, X2); 00193 _mm256_storeu_si256(out_mm++, X3); 00194 00195 blocks -= 
2; 00196 } 00197 00198 for(size_t i = 0; i != blocks; ++i) 00199 { 00200 __m256i X0 = _mm256_loadu_si256(in_mm++); 00201 __m256i X1 = _mm256_loadu_si256(in_mm++); 00202 00203 const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0); 00204 00205 __m256i R = _mm256_set_epi64x(0, 0, 0, 0); 00206 00207 interleave_epi64(X0, X1); 00208 00209 THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, 2, 3); 00210 00211 THREEFISH_ENC_8_ROUNDS(X0, X1, R, K1,K2,K3, 1, 2, 3); 00212 THREEFISH_ENC_8_ROUNDS(X0, X1, R, K3,K4,K5, 2, 3, 1); 00213 THREEFISH_ENC_8_ROUNDS(X0, X1, R, K5,K6,K7, 3, 1, 2); 00214 00215 THREEFISH_ENC_8_ROUNDS(X0, X1, R, K7,K8,K0, 1, 2, 3); 00216 THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0,K1,K2, 2, 3, 1); 00217 THREEFISH_ENC_8_ROUNDS(X0, X1, R, K2,K3,K4, 3, 1, 2); 00218 00219 THREEFISH_ENC_8_ROUNDS(X0, X1, R, K4,K5,K6, 1, 2, 3); 00220 THREEFISH_ENC_8_ROUNDS(X0, X1, R, K6,K7,K8, 2, 3, 1); 00221 THREEFISH_ENC_8_ROUNDS(X0, X1, R, K8,K0,K1, 3, 1, 2); 00222 00223 deinterleave_epi64(X0, X1); 00224 00225 _mm256_storeu_si256(out_mm++, X0); 00226 _mm256_storeu_si256(out_mm++, X1); 00227 } 00228 00229 #undef THREEFISH_ENC_8_ROUNDS 00230 #undef THREEFISH_ROUND 00231 #undef THREEFISH_INJECT_KEY 00232 #undef THREEFISH_ENC_2_8_ROUNDS 00233 #undef THREEFISH_ROUND_2 00234 #undef THREEFISH_INJECT_KEY_2 00235 } 00236 00237 void Threefish_512_AVX2::decrypt_n(const byte in[], byte out[], size_t blocks) const 00238 { 00239 const u64bit* K = &get_K()[0]; 00240 const u64bit* T_64 = &get_T()[0]; 00241 00242 const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46); 00243 const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33); 00244 const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17); 00245 const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44); 00246 const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39); 00247 const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13); 00248 const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25); 00249 const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8); 
00250 00251 #define THREEFISH_ROUND(X0, X1, SHR) \ 00252 do { \ 00253 const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR); \ 00254 X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3)); \ 00255 X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \ 00256 X1 = _mm256_xor_si256(X1, X0); \ 00257 X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \ 00258 X0 = _mm256_sub_epi64(X0, X1); \ 00259 } while(0) 00260 00261 #define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I) \ 00262 do { \ 00263 const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \ 00264 const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \ 00265 X0 = _mm256_sub_epi64(X0, K0); \ 00266 X1 = _mm256_sub_epi64(X1, K1); \ 00267 X1 = _mm256_sub_epi64(X1, R); \ 00268 R = _mm256_sub_epi64(R, ONE); \ 00269 X0 = _mm256_sub_epi64(X0, T0); \ 00270 X1 = _mm256_sub_epi64(X1, T1); \ 00271 } while(0) 00272 00273 #define THREEFISH_DEC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2) \ 00274 do { \ 00275 THREEFISH_INJECT_KEY(X0, X1, R, K2, K3, T2, T0); \ 00276 THREEFISH_ROUND(X0, X1, ROTATE_8); \ 00277 THREEFISH_ROUND(X0, X1, ROTATE_7); \ 00278 THREEFISH_ROUND(X0, X1, ROTATE_6); \ 00279 THREEFISH_ROUND(X0, X1, ROTATE_5); \ 00280 \ 00281 THREEFISH_INJECT_KEY(X0, X1, R, K1, K2, T0, T1); \ 00282 THREEFISH_ROUND(X0, X1, ROTATE_4); \ 00283 THREEFISH_ROUND(X0, X1, ROTATE_3); \ 00284 THREEFISH_ROUND(X0, X1, ROTATE_2); \ 00285 THREEFISH_ROUND(X0, X1, ROTATE_1); \ 00286 } while(0) 00287 00288 /* 00289 v1.0 key schedule: 9 ymm registers (only need 2 or 3) 00290 (0,1,2,3),(4,5,6,7) [8] 00291 then mutating with vpermq 00292 */ 00293 const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]); 00294 const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]); 00295 const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]); 00296 const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]); 00297 const __m256i K4 = _mm256_set_epi64x(K[1], 
K[8], K[6], K[4]); 00298 const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]); 00299 const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]); 00300 const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]); 00301 const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]); 00302 00303 const __m256i ONE = _mm256_set_epi64x(1, 0, 0, 0); 00304 00305 const __m256i* in_mm = reinterpret_cast<const __m256i*>(in); 00306 __m256i* out_mm = reinterpret_cast<__m256i*>(out); 00307 00308 for(size_t i = 0; i != blocks; ++i) 00309 { 00310 __m256i X0 = _mm256_loadu_si256(in_mm++); 00311 __m256i X1 = _mm256_loadu_si256(in_mm++); 00312 00313 const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0); 00314 00315 __m256i R = _mm256_set_epi64x(18, 0, 0, 0); 00316 00317 interleave_epi64(X0, X1); 00318 00319 THREEFISH_DEC_8_ROUNDS(X0, X1, R, K8,K0,K1, 3, 1, 2); 00320 THREEFISH_DEC_8_ROUNDS(X0, X1, R, K6,K7,K8, 2, 3, 1); 00321 THREEFISH_DEC_8_ROUNDS(X0, X1, R, K4,K5,K6, 1, 2, 3); 00322 THREEFISH_DEC_8_ROUNDS(X0, X1, R, K2,K3,K4, 3, 1, 2); 00323 THREEFISH_DEC_8_ROUNDS(X0, X1, R, K0,K1,K2, 2, 3, 1); 00324 THREEFISH_DEC_8_ROUNDS(X0, X1, R, K7,K8,K0, 1, 2, 3); 00325 THREEFISH_DEC_8_ROUNDS(X0, X1, R, K5,K6,K7, 3, 1, 2); 00326 THREEFISH_DEC_8_ROUNDS(X0, X1, R, K3,K4,K5, 2, 3, 1); 00327 THREEFISH_DEC_8_ROUNDS(X0, X1, R, K1,K2,K3, 1, 2, 3); 00328 00329 THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, 2, 3); 00330 00331 deinterleave_epi64(X0, X1); 00332 00333 _mm256_storeu_si256(out_mm++, X0); 00334 _mm256_storeu_si256(out_mm++, X1); 00335 } 00336 00337 #undef THREEFISH_DEC_8_ROUNDS 00338 #undef THREEFISH_ROUND 00339 #undef THREEFISH_INJECT_KEY 00340 #undef THREEFISH_DEC_2_8_ROUNDS 00341 #undef THREEFISH_ROUND_2 00342 #undef THREEFISH_INJECT_KEY_2 00343 } 00344 00345 }