Botan 1.11.15
/*
* AES using AES-NI instructions
* (C) 2009,2012 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/

#include <botan/internal/block_utils.h>
#include <botan/aes_ni.h>
#include <botan/cpuid.h>
#include <wmmintrin.h>

namespace Botan {

BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF(CPUID::has_aes_ni(), AES_128_NI, "AES-128", "aes_ni", 16);
BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF(CPUID::has_aes_ni(), AES_192_NI, "AES-192", "aes_ni", 16);
BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF(CPUID::has_aes_ni(), AES_256_NI, "AES-256", "aes_ni", 16);

namespace {

__m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
   {
   key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   return _mm_xor_si128(key, key_with_rcon);
   }

void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
                           u32bit out[], bool last)
   {
   __m128i key1 = *K1;
   __m128i key2 = *K2;

   key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
   key1 = _mm_xor_si128(key1, key2_with_rcon);

   *K1 = key1;
   _mm_storeu_si128(reinterpret_cast<__m128i*>(out), key1);

   if(last)
      return;

   key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
   key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3)));

   *K2 = key2;
   out[4] = _mm_cvtsi128_si32(key2);
   out[5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
   }

/*
* The second half of the AES-256 key expansion (other half same as AES-128)
*/
__m128i aes_256_key_expansion(__m128i key, __m128i key2)
   {
   __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
   key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2,2,2,2));

   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   return _mm_xor_si128(key, key_with_rcon);
   }

}

#define AES_ENC_4_ROUNDS(K)             \
   do                                   \
      {                                 \
      B0 = _mm_aesenc_si128(B0, K);     \
      B1 = _mm_aesenc_si128(B1, K);     \
      B2 = _mm_aesenc_si128(B2, K);     \
      B3 = _mm_aesenc_si128(B3, K);     \
      } while(0)

#define AES_ENC_4_LAST_ROUNDS(K)        \
   do                                   \
      {                                 \
      B0 = _mm_aesenclast_si128(B0, K); \
      B1 = _mm_aesenclast_si128(B1, K); \
      B2 = _mm_aesenclast_si128(B2, K); \
      B3 = _mm_aesenclast_si128(B3, K); \
      } while(0)

#define AES_DEC_4_ROUNDS(K)             \
   do                                   \
      {                                 \
      B0 = _mm_aesdec_si128(B0, K);     \
      B1 = _mm_aesdec_si128(B1, K);     \
      B2 = _mm_aesdec_si128(B2, K);     \
      B3 = _mm_aesdec_si128(B3, K);     \
      } while(0)

#define AES_DEC_4_LAST_ROUNDS(K)        \
   do                                   \
      {                                 \
      B0 = _mm_aesdeclast_si128(B0, K); \
      B1 = _mm_aesdeclast_si128(B1, K); \
      B2 = _mm_aesdeclast_si128(B2, K); \
      B3 = _mm_aesdeclast_si128(B3, K); \
      } while(0)
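/*
* Note: the four-block macros above run one AES round on four
* independent blocks back to back. The AESENC/AESDEC instructions have
* multi-cycle latency but are pipelined, so interleaving independent
* blocks keeps the AES unit busy and gives much better bulk throughput
* than processing a single block at a time.
*/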
/*
* AES-128 Encryption
*/
void AES_128_NI::encrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* key_mm = reinterpret_cast<const __m128i*>(&EK[0]);

   __m128i K0 = _mm_loadu_si128(key_mm);
   __m128i K1 = _mm_loadu_si128(key_mm + 1);
   __m128i K2 = _mm_loadu_si128(key_mm + 2);
   __m128i K3 = _mm_loadu_si128(key_mm + 3);
   __m128i K4 = _mm_loadu_si128(key_mm + 4);
   __m128i K5 = _mm_loadu_si128(key_mm + 5);
   __m128i K6 = _mm_loadu_si128(key_mm + 6);
   __m128i K7 = _mm_loadu_si128(key_mm + 7);
   __m128i K8 = _mm_loadu_si128(key_mm + 8);
   __m128i K9 = _mm_loadu_si128(key_mm + 9);
   __m128i K10 = _mm_loadu_si128(key_mm + 10);

   while(blocks >= 4)
      {
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
      __m128i B3 = _mm_loadu_si128(in_mm + 3);

      B0 = _mm_xor_si128(B0, K0);
      B1 = _mm_xor_si128(B1, K0);
      B2 = _mm_xor_si128(B2, K0);
      B3 = _mm_xor_si128(B3, K0);

      AES_ENC_4_ROUNDS(K1);
      AES_ENC_4_ROUNDS(K2);
      AES_ENC_4_ROUNDS(K3);
      AES_ENC_4_ROUNDS(K4);
      AES_ENC_4_ROUNDS(K5);
      AES_ENC_4_ROUNDS(K6);
      AES_ENC_4_ROUNDS(K7);
      AES_ENC_4_ROUNDS(K8);
      AES_ENC_4_ROUNDS(K9);
      AES_ENC_4_LAST_ROUNDS(K10);

      _mm_storeu_si128(out_mm + 0, B0);
      _mm_storeu_si128(out_mm + 1, B1);
      _mm_storeu_si128(out_mm + 2, B2);
      _mm_storeu_si128(out_mm + 3, B3);

      blocks -= 4;
      in_mm += 4;
      out_mm += 4;
      }

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);

      B = _mm_xor_si128(B, K0);

      B = _mm_aesenc_si128(B, K1);
      B = _mm_aesenc_si128(B, K2);
      B = _mm_aesenc_si128(B, K3);
      B = _mm_aesenc_si128(B, K4);
      B = _mm_aesenc_si128(B, K5);
      B = _mm_aesenc_si128(B, K6);
      B = _mm_aesenc_si128(B, K7);
      B = _mm_aesenc_si128(B, K8);
      B = _mm_aesenc_si128(B, K9);
      B = _mm_aesenclast_si128(B, K10);

      _mm_storeu_si128(out_mm + i, B);
      }
   }
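/*
* Note: the decrypt_n routines below mirror encrypt_n, but use
* AESDEC/AESDECLAST together with the separate decryption schedule DK
* (the "equivalent inverse cipher" schedule produced in key_schedule()).
*/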
/*
* AES-128 Decryption
*/
void AES_128_NI::decrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* key_mm = reinterpret_cast<const __m128i*>(&DK[0]);

   __m128i K0 = _mm_loadu_si128(key_mm);
   __m128i K1 = _mm_loadu_si128(key_mm + 1);
   __m128i K2 = _mm_loadu_si128(key_mm + 2);
   __m128i K3 = _mm_loadu_si128(key_mm + 3);
   __m128i K4 = _mm_loadu_si128(key_mm + 4);
   __m128i K5 = _mm_loadu_si128(key_mm + 5);
   __m128i K6 = _mm_loadu_si128(key_mm + 6);
   __m128i K7 = _mm_loadu_si128(key_mm + 7);
   __m128i K8 = _mm_loadu_si128(key_mm + 8);
   __m128i K9 = _mm_loadu_si128(key_mm + 9);
   __m128i K10 = _mm_loadu_si128(key_mm + 10);

   while(blocks >= 4)
      {
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
      __m128i B3 = _mm_loadu_si128(in_mm + 3);

      B0 = _mm_xor_si128(B0, K0);
      B1 = _mm_xor_si128(B1, K0);
      B2 = _mm_xor_si128(B2, K0);
      B3 = _mm_xor_si128(B3, K0);

      AES_DEC_4_ROUNDS(K1);
      AES_DEC_4_ROUNDS(K2);
      AES_DEC_4_ROUNDS(K3);
      AES_DEC_4_ROUNDS(K4);
      AES_DEC_4_ROUNDS(K5);
      AES_DEC_4_ROUNDS(K6);
      AES_DEC_4_ROUNDS(K7);
      AES_DEC_4_ROUNDS(K8);
      AES_DEC_4_ROUNDS(K9);
      AES_DEC_4_LAST_ROUNDS(K10);

      _mm_storeu_si128(out_mm + 0, B0);
      _mm_storeu_si128(out_mm + 1, B1);
      _mm_storeu_si128(out_mm + 2, B2);
      _mm_storeu_si128(out_mm + 3, B3);

      blocks -= 4;
      in_mm += 4;
      out_mm += 4;
      }

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);

      B = _mm_xor_si128(B, K0);

      B = _mm_aesdec_si128(B, K1);
      B = _mm_aesdec_si128(B, K2);
      B = _mm_aesdec_si128(B, K3);
      B = _mm_aesdec_si128(B, K4);
      B = _mm_aesdec_si128(B, K5);
      B = _mm_aesdec_si128(B, K6);
      B = _mm_aesdec_si128(B, K7);
      B = _mm_aesdec_si128(B, K8);
      B = _mm_aesdec_si128(B, K9);
      B = _mm_aesdeclast_si128(B, K10);

      _mm_storeu_si128(out_mm + i, B);
      }
   }

/*
* AES-128 Key Schedule
*/
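/*
* Note: EK holds 44 32-bit words, i.e. the 11 round keys of AES-128.
* _mm_aeskeygenassist_si128 supplies SubWord/RotWord of the previous
* round key combined with the round constant (0x01 ... 0x36), and
* aes_128_key_expansion() folds that into the sliding XOR chain of the
* standard key schedule. DK is the same schedule in reverse order, with
* AESIMC (InvMixColumns) applied to the inner round keys.
*/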
void AES_128_NI::key_schedule(const byte key[], size_t)
   {
   EK.resize(44);
   DK.resize(44);

#define AES_128_key_exp(K, RCON) \
   aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON))

   __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
   __m128i K1 = AES_128_key_exp(K0, 0x01);
   __m128i K2 = AES_128_key_exp(K1, 0x02);
   __m128i K3 = AES_128_key_exp(K2, 0x04);
   __m128i K4 = AES_128_key_exp(K3, 0x08);
   __m128i K5 = AES_128_key_exp(K4, 0x10);
   __m128i K6 = AES_128_key_exp(K5, 0x20);
   __m128i K7 = AES_128_key_exp(K6, 0x40);
   __m128i K8 = AES_128_key_exp(K7, 0x80);
   __m128i K9 = AES_128_key_exp(K8, 0x1B);
   __m128i K10 = AES_128_key_exp(K9, 0x36);

   __m128i* EK_mm = reinterpret_cast<__m128i*>(&EK[0]);
   _mm_storeu_si128(EK_mm , K0);
   _mm_storeu_si128(EK_mm + 1, K1);
   _mm_storeu_si128(EK_mm + 2, K2);
   _mm_storeu_si128(EK_mm + 3, K3);
   _mm_storeu_si128(EK_mm + 4, K4);
   _mm_storeu_si128(EK_mm + 5, K5);
   _mm_storeu_si128(EK_mm + 6, K6);
   _mm_storeu_si128(EK_mm + 7, K7);
   _mm_storeu_si128(EK_mm + 8, K8);
   _mm_storeu_si128(EK_mm + 9, K9);
   _mm_storeu_si128(EK_mm + 10, K10);

   // Now generate decryption keys

   __m128i* DK_mm = reinterpret_cast<__m128i*>(&DK[0]);
   _mm_storeu_si128(DK_mm , K10);
   _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9));
   _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8));
   _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7));
   _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6));
   _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5));
   _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4));
   _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3));
   _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2));
   _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1));
   _mm_storeu_si128(DK_mm + 10, K0);
   }
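/*
* Note: zap() zeroes the key schedule vectors before releasing their
* storage, so the clear() methods below actually scrub the expanded
* keys from memory rather than just discarding them.
*/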
/*
* Clear memory of sensitive data
*/
void AES_128_NI::clear()
   {
   zap(EK);
   zap(DK);
   }

/*
* AES-192 Encryption
*/
void AES_192_NI::encrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* key_mm = reinterpret_cast<const __m128i*>(&EK[0]);

   __m128i K0 = _mm_loadu_si128(key_mm);
   __m128i K1 = _mm_loadu_si128(key_mm + 1);
   __m128i K2 = _mm_loadu_si128(key_mm + 2);
   __m128i K3 = _mm_loadu_si128(key_mm + 3);
   __m128i K4 = _mm_loadu_si128(key_mm + 4);
   __m128i K5 = _mm_loadu_si128(key_mm + 5);
   __m128i K6 = _mm_loadu_si128(key_mm + 6);
   __m128i K7 = _mm_loadu_si128(key_mm + 7);
   __m128i K8 = _mm_loadu_si128(key_mm + 8);
   __m128i K9 = _mm_loadu_si128(key_mm + 9);
   __m128i K10 = _mm_loadu_si128(key_mm + 10);
   __m128i K11 = _mm_loadu_si128(key_mm + 11);
   __m128i K12 = _mm_loadu_si128(key_mm + 12);

   while(blocks >= 4)
      {
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
      __m128i B3 = _mm_loadu_si128(in_mm + 3);

      B0 = _mm_xor_si128(B0, K0);
      B1 = _mm_xor_si128(B1, K0);
      B2 = _mm_xor_si128(B2, K0);
      B3 = _mm_xor_si128(B3, K0);

      AES_ENC_4_ROUNDS(K1);
      AES_ENC_4_ROUNDS(K2);
      AES_ENC_4_ROUNDS(K3);
      AES_ENC_4_ROUNDS(K4);
      AES_ENC_4_ROUNDS(K5);
      AES_ENC_4_ROUNDS(K6);
      AES_ENC_4_ROUNDS(K7);
      AES_ENC_4_ROUNDS(K8);
      AES_ENC_4_ROUNDS(K9);
      AES_ENC_4_ROUNDS(K10);
      AES_ENC_4_ROUNDS(K11);
      AES_ENC_4_LAST_ROUNDS(K12);

      _mm_storeu_si128(out_mm + 0, B0);
      _mm_storeu_si128(out_mm + 1, B1);
      _mm_storeu_si128(out_mm + 2, B2);
      _mm_storeu_si128(out_mm + 3, B3);

      blocks -= 4;
      in_mm += 4;
      out_mm += 4;
      }

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);

      B = _mm_xor_si128(B, K0);

      B = _mm_aesenc_si128(B, K1);
      B = _mm_aesenc_si128(B, K2);
      B = _mm_aesenc_si128(B, K3);
      B = _mm_aesenc_si128(B, K4);
      B = _mm_aesenc_si128(B, K5);
      B = _mm_aesenc_si128(B, K6);
      B = _mm_aesenc_si128(B, K7);
      B = _mm_aesenc_si128(B, K8);
      B = _mm_aesenc_si128(B, K9);
      B = _mm_aesenc_si128(B, K10);
      B = _mm_aesenc_si128(B, K11);
      B = _mm_aesenclast_si128(B, K12);

      _mm_storeu_si128(out_mm + i, B);
      }
   }

/*
* AES-192 Decryption
*/
void AES_192_NI::decrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* key_mm = reinterpret_cast<const __m128i*>(&DK[0]);

   __m128i K0 = _mm_loadu_si128(key_mm);
   __m128i K1 = _mm_loadu_si128(key_mm + 1);
   __m128i K2 = _mm_loadu_si128(key_mm + 2);
   __m128i K3 = _mm_loadu_si128(key_mm + 3);
   __m128i K4 = _mm_loadu_si128(key_mm + 4);
   __m128i K5 = _mm_loadu_si128(key_mm + 5);
   __m128i K6 = _mm_loadu_si128(key_mm + 6);
   __m128i K7 = _mm_loadu_si128(key_mm + 7);
   __m128i K8 = _mm_loadu_si128(key_mm + 8);
   __m128i K9 = _mm_loadu_si128(key_mm + 9);
   __m128i K10 = _mm_loadu_si128(key_mm + 10);
   __m128i K11 = _mm_loadu_si128(key_mm + 11);
   __m128i K12 = _mm_loadu_si128(key_mm + 12);

   while(blocks >= 4)
      {
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
      __m128i B3 = _mm_loadu_si128(in_mm + 3);

      B0 = _mm_xor_si128(B0, K0);
      B1 = _mm_xor_si128(B1, K0);
      B2 = _mm_xor_si128(B2, K0);
      B3 = _mm_xor_si128(B3, K0);

      AES_DEC_4_ROUNDS(K1);
      AES_DEC_4_ROUNDS(K2);
      AES_DEC_4_ROUNDS(K3);
      AES_DEC_4_ROUNDS(K4);
      AES_DEC_4_ROUNDS(K5);
      AES_DEC_4_ROUNDS(K6);
      AES_DEC_4_ROUNDS(K7);
      AES_DEC_4_ROUNDS(K8);
      AES_DEC_4_ROUNDS(K9);
      AES_DEC_4_ROUNDS(K10);
      AES_DEC_4_ROUNDS(K11);
      AES_DEC_4_LAST_ROUNDS(K12);

      _mm_storeu_si128(out_mm + 0, B0);
      _mm_storeu_si128(out_mm + 1, B1);
      _mm_storeu_si128(out_mm + 2, B2);
      _mm_storeu_si128(out_mm + 3, B3);

      blocks -= 4;
      in_mm += 4;
      out_mm += 4;
      }

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);

      B = _mm_xor_si128(B, K0);

      B = _mm_aesdec_si128(B, K1);
      B = _mm_aesdec_si128(B, K2);
      B = _mm_aesdec_si128(B, K3);
      B = _mm_aesdec_si128(B, K4);
      B = _mm_aesdec_si128(B, K5);
      B = _mm_aesdec_si128(B, K6);
      B = _mm_aesdec_si128(B, K7);
      B = _mm_aesdec_si128(B, K8);
      B = _mm_aesdec_si128(B, K9);
      B = _mm_aesdec_si128(B, K10);
      B = _mm_aesdec_si128(B, K11);
      B = _mm_aesdeclast_si128(B, K12);

      _mm_storeu_si128(out_mm + i, B);
      }
   }

/*
* AES-192 Key Schedule
*/
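/*
* Note: AES-192 uses 12 rounds, so EK/DK hold 52 32-bit words (13 round
* keys). Each expansion step advances the 192-bit key by six words,
* which is why aes_192_key_expansion() works on a register and a half
* and writes its output straight into EK through the out[] pointer.
*/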
void AES_192_NI::key_schedule(const byte key[], size_t)
   {
   EK.resize(52);
   DK.resize(52);

   __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
   __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8));
   K1 = _mm_srli_si128(K1, 8);

   load_le(&EK[0], key, 6);

#define AES_192_key_exp(RCON, EK_OFF)                         \
   aes_192_key_expansion(&K0, &K1,                            \
                         _mm_aeskeygenassist_si128(K1, RCON), \
                         &EK[EK_OFF], EK_OFF == 48)

   AES_192_key_exp(0x01, 6);
   AES_192_key_exp(0x02, 12);
   AES_192_key_exp(0x04, 18);
   AES_192_key_exp(0x08, 24);
   AES_192_key_exp(0x10, 30);
   AES_192_key_exp(0x20, 36);
   AES_192_key_exp(0x40, 42);
   AES_192_key_exp(0x80, 48);

#undef AES_192_key_exp

   // Now generate decryption keys
   const __m128i* EK_mm = reinterpret_cast<const __m128i*>(&EK[0]);

   __m128i* DK_mm = reinterpret_cast<__m128i*>(&DK[0]);
   _mm_storeu_si128(DK_mm , _mm_loadu_si128(EK_mm + 12));
   _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
   _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
   _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
   _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
   _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
   _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
   _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
   _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
   _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
   _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
   _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
   _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
   }

/*
* Clear memory of sensitive data
*/
void AES_192_NI::clear()
   {
   zap(EK);
   zap(DK);
   }

/*
* AES-256 Encryption
*/
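/*
* Note: AES-256 runs 14 rounds, so the routines below use round keys
* K0 through K14; otherwise they follow the same structure as the
* AES-128 and AES-192 versions above.
*/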
void AES_256_NI::encrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* key_mm = reinterpret_cast<const __m128i*>(&EK[0]);

   __m128i K0 = _mm_loadu_si128(key_mm);
   __m128i K1 = _mm_loadu_si128(key_mm + 1);
   __m128i K2 = _mm_loadu_si128(key_mm + 2);
   __m128i K3 = _mm_loadu_si128(key_mm + 3);
   __m128i K4 = _mm_loadu_si128(key_mm + 4);
   __m128i K5 = _mm_loadu_si128(key_mm + 5);
   __m128i K6 = _mm_loadu_si128(key_mm + 6);
   __m128i K7 = _mm_loadu_si128(key_mm + 7);
   __m128i K8 = _mm_loadu_si128(key_mm + 8);
   __m128i K9 = _mm_loadu_si128(key_mm + 9);
   __m128i K10 = _mm_loadu_si128(key_mm + 10);
   __m128i K11 = _mm_loadu_si128(key_mm + 11);
   __m128i K12 = _mm_loadu_si128(key_mm + 12);
   __m128i K13 = _mm_loadu_si128(key_mm + 13);
   __m128i K14 = _mm_loadu_si128(key_mm + 14);

   while(blocks >= 4)
      {
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
      __m128i B3 = _mm_loadu_si128(in_mm + 3);

      B0 = _mm_xor_si128(B0, K0);
      B1 = _mm_xor_si128(B1, K0);
      B2 = _mm_xor_si128(B2, K0);
      B3 = _mm_xor_si128(B3, K0);

      AES_ENC_4_ROUNDS(K1);
      AES_ENC_4_ROUNDS(K2);
      AES_ENC_4_ROUNDS(K3);
      AES_ENC_4_ROUNDS(K4);
      AES_ENC_4_ROUNDS(K5);
      AES_ENC_4_ROUNDS(K6);
      AES_ENC_4_ROUNDS(K7);
      AES_ENC_4_ROUNDS(K8);
      AES_ENC_4_ROUNDS(K9);
      AES_ENC_4_ROUNDS(K10);
      AES_ENC_4_ROUNDS(K11);
      AES_ENC_4_ROUNDS(K12);
      AES_ENC_4_ROUNDS(K13);
      AES_ENC_4_LAST_ROUNDS(K14);

      _mm_storeu_si128(out_mm + 0, B0);
      _mm_storeu_si128(out_mm + 1, B1);
      _mm_storeu_si128(out_mm + 2, B2);
      _mm_storeu_si128(out_mm + 3, B3);

      blocks -= 4;
      in_mm += 4;
      out_mm += 4;
      }

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);

      B = _mm_xor_si128(B, K0);

      B = _mm_aesenc_si128(B, K1);
      B = _mm_aesenc_si128(B, K2);
      B = _mm_aesenc_si128(B, K3);
      B = _mm_aesenc_si128(B, K4);
      B = _mm_aesenc_si128(B, K5);
      B = _mm_aesenc_si128(B, K6);
      B = _mm_aesenc_si128(B, K7);
      B = _mm_aesenc_si128(B, K8);
      B = _mm_aesenc_si128(B, K9);
      B = _mm_aesenc_si128(B, K10);
      B = _mm_aesenc_si128(B, K11);
      B = _mm_aesenc_si128(B, K12);
      B = _mm_aesenc_si128(B, K13);
      B = _mm_aesenclast_si128(B, K14);

      _mm_storeu_si128(out_mm + i, B);
      }
   }

/*
* AES-256 Decryption
*/
void AES_256_NI::decrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* key_mm = reinterpret_cast<const __m128i*>(&DK[0]);

   __m128i K0 = _mm_loadu_si128(key_mm);
   __m128i K1 = _mm_loadu_si128(key_mm + 1);
   __m128i K2 = _mm_loadu_si128(key_mm + 2);
   __m128i K3 = _mm_loadu_si128(key_mm + 3);
   __m128i K4 = _mm_loadu_si128(key_mm + 4);
   __m128i K5 = _mm_loadu_si128(key_mm + 5);
   __m128i K6 = _mm_loadu_si128(key_mm + 6);
   __m128i K7 = _mm_loadu_si128(key_mm + 7);
   __m128i K8 = _mm_loadu_si128(key_mm + 8);
   __m128i K9 = _mm_loadu_si128(key_mm + 9);
   __m128i K10 = _mm_loadu_si128(key_mm + 10);
   __m128i K11 = _mm_loadu_si128(key_mm + 11);
   __m128i K12 = _mm_loadu_si128(key_mm + 12);
   __m128i K13 = _mm_loadu_si128(key_mm + 13);
   __m128i K14 = _mm_loadu_si128(key_mm + 14);

   while(blocks >= 4)
      {
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
      __m128i B3 = _mm_loadu_si128(in_mm + 3);

      B0 = _mm_xor_si128(B0, K0);
      B1 = _mm_xor_si128(B1, K0);
      B2 = _mm_xor_si128(B2, K0);
      B3 = _mm_xor_si128(B3, K0);

      AES_DEC_4_ROUNDS(K1);
      AES_DEC_4_ROUNDS(K2);
      AES_DEC_4_ROUNDS(K3);
      AES_DEC_4_ROUNDS(K4);
      AES_DEC_4_ROUNDS(K5);
      AES_DEC_4_ROUNDS(K6);
      AES_DEC_4_ROUNDS(K7);
      AES_DEC_4_ROUNDS(K8);
      AES_DEC_4_ROUNDS(K9);
      AES_DEC_4_ROUNDS(K10);
      AES_DEC_4_ROUNDS(K11);
      AES_DEC_4_ROUNDS(K12);
      AES_DEC_4_ROUNDS(K13);
      AES_DEC_4_LAST_ROUNDS(K14);

      _mm_storeu_si128(out_mm + 0, B0);
      _mm_storeu_si128(out_mm + 1, B1);
      _mm_storeu_si128(out_mm + 2, B2);
      _mm_storeu_si128(out_mm + 3, B3);

      blocks -= 4;
      in_mm += 4;
      out_mm += 4;
      }

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);

      B = _mm_xor_si128(B, K0);

      B = _mm_aesdec_si128(B, K1);
      B = _mm_aesdec_si128(B, K2);
      B = _mm_aesdec_si128(B, K3);
      B = _mm_aesdec_si128(B, K4);
      B = _mm_aesdec_si128(B, K5);
      B = _mm_aesdec_si128(B, K6);
      B = _mm_aesdec_si128(B, K7);
      B = _mm_aesdec_si128(B, K8);
      B = _mm_aesdec_si128(B, K9);
      B = _mm_aesdec_si128(B, K10);
      B = _mm_aesdec_si128(B, K11);
      B = _mm_aesdec_si128(B, K12);
      B = _mm_aesdec_si128(B, K13);
      B = _mm_aesdeclast_si128(B, K14);

      _mm_storeu_si128(out_mm + i, B);
      }
   }

/*
* AES-256 Key Schedule
*/
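/*
* Note: EK/DK hold 60 32-bit words (15 round keys). The expansion below
* alternates two step types: aes_128_key_expansion() with a round
* constant produces K2, K4, ..., K14, while aes_256_key_expansion()
* (SubWord only, no rotation or round constant) produces K3, K5, ...,
* K13, matching the AES-256 key schedule.
*/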
void AES_256_NI::key_schedule(const byte key[], size_t)
   {
   EK.resize(60);
   DK.resize(60);

   __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
   __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16));

   __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01));
   __m128i K3 = aes_256_key_expansion(K1, K2);

   __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02));
   __m128i K5 = aes_256_key_expansion(K3, K4);

   __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04));
   __m128i K7 = aes_256_key_expansion(K5, K6);

   __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08));
   __m128i K9 = aes_256_key_expansion(K7, K8);

   __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10));
   __m128i K11 = aes_256_key_expansion(K9, K10);

   __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20));
   __m128i K13 = aes_256_key_expansion(K11, K12);

   __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40));

   __m128i* EK_mm = reinterpret_cast<__m128i*>(&EK[0]);
   _mm_storeu_si128(EK_mm , K0);
   _mm_storeu_si128(EK_mm + 1, K1);
   _mm_storeu_si128(EK_mm + 2, K2);
   _mm_storeu_si128(EK_mm + 3, K3);
   _mm_storeu_si128(EK_mm + 4, K4);
   _mm_storeu_si128(EK_mm + 5, K5);
   _mm_storeu_si128(EK_mm + 6, K6);
   _mm_storeu_si128(EK_mm + 7, K7);
   _mm_storeu_si128(EK_mm + 8, K8);
   _mm_storeu_si128(EK_mm + 9, K9);
   _mm_storeu_si128(EK_mm + 10, K10);
   _mm_storeu_si128(EK_mm + 11, K11);
   _mm_storeu_si128(EK_mm + 12, K12);
   _mm_storeu_si128(EK_mm + 13, K13);
   _mm_storeu_si128(EK_mm + 14, K14);

   // Now generate decryption keys
   __m128i* DK_mm = reinterpret_cast<__m128i*>(&DK[0]);
   _mm_storeu_si128(DK_mm , K14);
   _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13));
   _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12));
   _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11));
   _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10));
   _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9));
   _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8));
   _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7));
   _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6));
   _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5));
   _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
   _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
   _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
   _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
   _mm_storeu_si128(DK_mm + 14, K0);
   }

/*
* Clear memory of sensitive data
*/
void AES_256_NI::clear()
   {
   zap(EK);
   zap(DK);
   }

#undef AES_ENC_4_ROUNDS
#undef AES_ENC_4_LAST_ROUNDS
#undef AES_DEC_4_ROUNDS
#undef AES_DEC_4_LAST_ROUNDS

}
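/*
* Note: the BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF registrations at the
* top of this file only take effect when CPUID::has_aes_ni() reports
* hardware support, so the generic "AES-128"/"AES-192"/"AES-256"
* lookups can select these implementations on capable CPUs.
*/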