Botan 1.11.15
/*
* AES using SSSE3
* (C) 2010 Jack Lloyd
*
* This is more or less a direct translation of public domain x86-64
* assembly written by Mike Hamburg, described in "Accelerating AES
* with Vector Permute Instructions" (CHES 2009). His original code is
* available at http://crypto.stanford.edu/vpaes/
*
* Botan is released under the Simplified BSD License (see license.txt)
*/

#include <botan/internal/block_utils.h>
#include <botan/aes_ssse3.h>
#include <botan/cpuid.h>
#include <tmmintrin.h>

namespace Botan {

BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF(CPUID::has_ssse3(), AES_128_SSSE3, "AES-128", "ssse3", 64);
BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF(CPUID::has_ssse3(), AES_192_SSSE3, "AES-192", "ssse3", 64);
BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF(CPUID::has_ssse3(), AES_256_SSSE3, "AES-256", "ssse3", 64);

namespace {

const __m128i low_nibs = _mm_set1_epi8(0x0F);

const __m128i k_ipt1 = _mm_set_epi32(
   0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);
const __m128i k_ipt2 = _mm_set_epi32(
   0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);

const __m128i k_inv1 = _mm_set_epi32(
   0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);
const __m128i k_inv2 = _mm_set_epi32(
   0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);

const __m128i sb1u = _mm_set_epi32(
   0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);
const __m128i sb1t = _mm_set_epi32(
   0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);

const __m128i mc_forward[4] = {
   _mm_set_epi32(0x0C0F0E0D, 0x080B0A09, 0x04070605, 0x00030201),
   _mm_set_epi32(0x00030201, 0x0C0F0E0D, 0x080B0A09, 0x04070605),
   _mm_set_epi32(0x04070605, 0x00030201, 0x0C0F0E0D, 0x080B0A09),
   _mm_set_epi32(0x080B0A09, 0x04070605, 0x00030201, 0x0C0F0E0D)
};

const __m128i sr[4] = {
   _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100),
   _mm_set_epi32(0x0B06010C, 0x07020D08, 0x030E0904, 0x0F0A0500),
   _mm_set_epi32(0x070E050C, 0x030A0108, 0x0F060D04, 0x0B020900),
   _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00)
};

#define mm_xor3(x, y, z) _mm_xor_si128(x, _mm_xor_si128(y, z))

__m128i aes_schedule_transform(__m128i input,
                               __m128i table_1,
                               __m128i table_2)
   {
   __m128i i_1 = _mm_and_si128(low_nibs, input);
   __m128i i_2 = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input), 4);

   return _mm_xor_si128(
      _mm_shuffle_epi8(table_1, i_1),
      _mm_shuffle_epi8(table_2, i_2));
   }

__m128i aes_schedule_mangle(__m128i k, byte round_no)
   {
   __m128i t = _mm_shuffle_epi8(_mm_xor_si128(k, _mm_set1_epi8(0x5B)),
                                mc_forward[0]);

   __m128i t2 = t;

   t = _mm_shuffle_epi8(t, mc_forward[0]);

   t2 = mm_xor3(t2, t, _mm_shuffle_epi8(t, mc_forward[0]));

   return _mm_shuffle_epi8(t2, sr[round_no % 4]);
   }

__m128i aes_schedule_192_smear(__m128i x, __m128i y)
   {
   return mm_xor3(y,
                  _mm_shuffle_epi32(x, 0xFE),
                  _mm_shuffle_epi32(y, 0x80));
   }
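/*
* Convert an encryption round key to the decryption direction. The key
* is pushed through four aes_schedule_transform() steps using the dsk[]
* table pairs, with each partial result XOR-accumulated and rotated via
* mc_forward[0]; the final sr[round_no % 4] shuffle applies the byte
* permutation that tracks ShiftRows for this round.
*/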
__m128i aes_schedule_mangle_dec(__m128i k, byte round_no)
   {
   const __m128i dsk[8] = {
      _mm_set_epi32(0x4AED9334, 0x82255BFC, 0xB6116FC8, 0x7ED9A700),
      _mm_set_epi32(0x8BB89FAC, 0xE9DAFDCE, 0x45765162, 0x27143300),
      _mm_set_epi32(0x4622EE8A, 0xADC90561, 0x27438FEB, 0xCCA86400),
      _mm_set_epi32(0x73AEE13C, 0xBD602FF2, 0x815C13CE, 0x4F92DD00),
      _mm_set_epi32(0xF83F3EF9, 0xFA3D3CFB, 0x03C4C502, 0x01C6C700),
      _mm_set_epi32(0xA5526A9D, 0x7384BC4B, 0xEE1921D6, 0x38CFF700),
      _mm_set_epi32(0xA080D3F3, 0x10306343, 0xE3C390B0, 0x53732000),
      _mm_set_epi32(0x2F45AEC4, 0x8CE60D67, 0xA0CA214B, 0x036982E8)
   };

   __m128i t = aes_schedule_transform(k, dsk[0], dsk[1]);
   __m128i output = _mm_shuffle_epi8(t, mc_forward[0]);

   t = aes_schedule_transform(t, dsk[2], dsk[3]);
   output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);

   t = aes_schedule_transform(t, dsk[4], dsk[5]);
   output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);

   t = aes_schedule_transform(t, dsk[6], dsk[7]);
   output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);

   return _mm_shuffle_epi8(output, sr[round_no % 4]);
   }

__m128i aes_schedule_mangle_last(__m128i k, byte round_no)
   {
   const __m128i out_tr1 = _mm_set_epi32(
      0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);
   const __m128i out_tr2 = _mm_set_epi32(
      0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);

   k = _mm_shuffle_epi8(k, sr[round_no % 4]);
   k = _mm_xor_si128(k, _mm_set1_epi8(0x5B));
   return aes_schedule_transform(k, out_tr1, out_tr2);
   }

__m128i aes_schedule_mangle_last_dec(__m128i k)
   {
   const __m128i deskew1 = _mm_set_epi32(
      0x1DFEB95A, 0x5DBEF91A, 0x07E4A340, 0x47A4E300);
   const __m128i deskew2 = _mm_set_epi32(
      0x2841C2AB, 0xF49D1E77, 0x5F36B5DC, 0x83EA6900);

   k = _mm_xor_si128(k, _mm_set1_epi8(0x5B));
   return aes_schedule_transform(k, deskew1, deskew2);
   }

__m128i aes_schedule_round(__m128i* rcon, __m128i input1, __m128i input2)
   {
   if(rcon)
      {
      input2 = _mm_xor_si128(_mm_alignr_epi8(_mm_setzero_si128(), *rcon, 15),
                             input2);

      *rcon = _mm_alignr_epi8(*rcon, *rcon, 15); // next rcon

      input1 = _mm_shuffle_epi32(input1, 0xFF); // rotate
      input1 = _mm_alignr_epi8(input1, input1, 1);
      }

   __m128i smeared = _mm_xor_si128(input2, _mm_slli_si128(input2, 4));
   smeared = mm_xor3(smeared, _mm_slli_si128(smeared, 8), _mm_set1_epi8(0x5B));

   __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input1), 4);

   input1 = _mm_and_si128(low_nibs, input1);

   __m128i t2 = _mm_shuffle_epi8(k_inv2, input1);

   input1 = _mm_xor_si128(input1, t);

   __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
   __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, input1));

   __m128i t5 = _mm_xor_si128(input1, _mm_shuffle_epi8(k_inv1, t3));
   __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

   return mm_xor3(_mm_shuffle_epi8(sb1u, t5),
                  _mm_shuffle_epi8(sb1t, t6),
                  smeared);
   }
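/*
* Encrypt one block using only loads, XORs and byte shuffles. Per the
* Hamburg paper cited above, each round splits the state into low and
* high nibbles, runs them through the k_inv tables (an inversion
* network over a GF(2^4) tower-field representation of the AES S-box),
* recombines via the sb1/sb2 output tables, and applies MixColumns as
* a pair of byte permutations (mc_forward/mc_backward).
*/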
__m128i aes_ssse3_encrypt(__m128i B, const __m128i* keys, size_t rounds)
   {
   const __m128i sb2u = _mm_set_epi32(
      0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);
   const __m128i sb2t = _mm_set_epi32(
      0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);

   const __m128i sbou = _mm_set_epi32(
      0x15AABF7A, 0xC502A878, 0xD0D26D17, 0x6FBDC700);
   const __m128i sbot = _mm_set_epi32(
      0x8E1E90D1, 0x412B35FA, 0xCFE474A5, 0x5FBB6A00);

   const __m128i mc_backward[4] = {
      _mm_set_epi32(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003),
      _mm_set_epi32(0x0A09080B, 0x06050407, 0x02010003, 0x0E0D0C0F),
      _mm_set_epi32(0x06050407, 0x02010003, 0x0E0D0C0F, 0x0A09080B),
      _mm_set_epi32(0x02010003, 0x0E0D0C0F, 0x0A09080B, 0x06050407)
   };

   B = mm_xor3(_mm_shuffle_epi8(k_ipt1, _mm_and_si128(low_nibs, B)),
               _mm_shuffle_epi8(k_ipt2,
                                _mm_srli_epi32(
                                   _mm_andnot_si128(low_nibs, B),
                                   4)),
               _mm_loadu_si128(keys));

   for(size_t r = 1; ; ++r)
      {
      const __m128i K = _mm_loadu_si128(keys + r);

      __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);

      B = _mm_and_si128(low_nibs, B);

      __m128i t2 = _mm_shuffle_epi8(k_inv2, B);

      B = _mm_xor_si128(B, t);

      __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
      __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));

      __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
      __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

      if(r == rounds)
         {
         B = _mm_shuffle_epi8(
            mm_xor3(_mm_shuffle_epi8(sbou, t5),
                    _mm_shuffle_epi8(sbot, t6),
                    K),
            sr[r % 4]);

         return B;
         }

      __m128i t7 = mm_xor3(_mm_shuffle_epi8(sb1t, t6),
                           _mm_shuffle_epi8(sb1u, t5),
                           K);

      __m128i t8 = mm_xor3(_mm_shuffle_epi8(sb2t, t6),
                           _mm_shuffle_epi8(sb2u, t5),
                           _mm_shuffle_epi8(t7, mc_forward[r % 4]));

      B = mm_xor3(_mm_shuffle_epi8(t8, mc_forward[r % 4]),
                  _mm_shuffle_epi8(t7, mc_backward[r % 4]),
                  t8);
      }
   }
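/*
* The inverse cipher follows the same nibble-decomposition pattern, but
* each round chains four dedicated table pairs (sb9, sbd, sbb, sbe),
* shuffling the accumulator by mc between lookups and advancing mc with
* _mm_alignr_epi8(mc, mc, 12) at the end of each round.
*/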
__m128i aes_ssse3_decrypt(__m128i B, const __m128i* keys, size_t rounds)
   {
   const __m128i k_dipt1 = _mm_set_epi32(
      0x154A411E, 0x114E451A, 0x0F505B04, 0x0B545F00);
   const __m128i k_dipt2 = _mm_set_epi32(
      0x12771772, 0xF491F194, 0x86E383E6, 0x60056500);

   const __m128i sb9u = _mm_set_epi32(
      0xCAD51F50, 0x4F994CC9, 0x851C0353, 0x9A86D600);
   const __m128i sb9t = _mm_set_epi32(
      0x725E2C9E, 0xB2FBA565, 0xC03B1789, 0xECD74900);

   const __m128i sbeu = _mm_set_epi32(
      0x22426004, 0x64B4F6B0, 0x46F29296, 0x26D4D000);
   const __m128i sbet = _mm_set_epi32(
      0x9467F36B, 0x98593E32, 0x0C55A6CD, 0xFFAAC100);

   const __m128i sbdu = _mm_set_epi32(
      0xF56E9B13, 0x882A4439, 0x7D57CCDF, 0xE6B1A200);
   const __m128i sbdt = _mm_set_epi32(
      0x2931180D, 0x15DEEFD3, 0x3CE2FAF7, 0x24C6CB00);

   const __m128i sbbu = _mm_set_epi32(
      0x602646F6, 0xB0F2D404, 0xD0226492, 0x96B44200);
   const __m128i sbbt = _mm_set_epi32(
      0xF3FF0C3E, 0x3255AA6B, 0xC19498A6, 0xCD596700);

   __m128i mc = mc_forward[3];

   __m128i t =
      _mm_shuffle_epi8(k_dipt2,
                       _mm_srli_epi32(
                          _mm_andnot_si128(low_nibs, B),
                          4));

   B = mm_xor3(t, _mm_loadu_si128(keys),
               _mm_shuffle_epi8(k_dipt1, _mm_and_si128(B, low_nibs)));

   for(size_t r = 1; ; ++r)
      {
      const __m128i K = _mm_loadu_si128(keys + r);

      t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);

      B = _mm_and_si128(low_nibs, B);

      __m128i t2 = _mm_shuffle_epi8(k_inv2, B);

      B = _mm_xor_si128(B, t);

      __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
      __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));
      __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
      __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

      if(r == rounds)
         {
         const __m128i sbou = _mm_set_epi32(
            0xC7AA6DB9, 0xD4943E2D, 0x1387EA53, 0x7EF94000);
         const __m128i sbot = _mm_set_epi32(
            0xCA4B8159, 0xD8C58E9C, 0x12D7560F, 0x93441D00);

         __m128i x = _mm_shuffle_epi8(sbou, t5);
         __m128i y = _mm_shuffle_epi8(sbot, t6);
         x = _mm_xor_si128(x, K);
         x = _mm_xor_si128(x, y);

         const u32bit which_sr = ((((rounds - 1) << 4) ^ 48) & 48) / 16;
         return _mm_shuffle_epi8(x, sr[which_sr]);
         }

      __m128i t8 = _mm_xor_si128(_mm_shuffle_epi8(sb9t, t6),
                                 _mm_xor_si128(_mm_shuffle_epi8(sb9u, t5), K));

      __m128i t9 = mm_xor3(_mm_shuffle_epi8(t8, mc),
                           _mm_shuffle_epi8(sbdu, t5),
                           _mm_shuffle_epi8(sbdt, t6));

      __m128i t12 = _mm_xor_si128(
         _mm_xor_si128(
            _mm_shuffle_epi8(t9, mc),
            _mm_shuffle_epi8(sbbu, t5)),
         _mm_shuffle_epi8(sbbt, t6));

      B = _mm_xor_si128(_mm_xor_si128(_mm_shuffle_epi8(t12, mc),
                                      _mm_shuffle_epi8(sbeu, t5)),
                        _mm_shuffle_epi8(sbet, t6));

      mc = _mm_alignr_epi8(mc, mc, 12);
      }
   }

}

/*
* AES-128 Encryption
*/
void AES_128_SSSE3::encrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&EK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 10));
      }
   }

/*
* AES-128 Decryption
*/
void AES_128_SSSE3::decrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&DK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 10));
      }
   }

/*
* AES-128 Key Schedule
*/
void AES_128_SSSE3::key_schedule(const byte keyb[], size_t)
   {
   __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
                                0x1F8391B9, 0xAF9DEEB6);

   __m128i key = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));

   EK.resize(11*4);
   DK.resize(11*4);

   __m128i* EK_mm = reinterpret_cast<__m128i*>(&EK[0]);
   __m128i* DK_mm = reinterpret_cast<__m128i*>(&DK[0]);

   _mm_storeu_si128(DK_mm + 10, _mm_shuffle_epi8(key, sr[2]));

   key = aes_schedule_transform(key, k_ipt1, k_ipt2);

   _mm_storeu_si128(EK_mm, key);

   for(size_t i = 1; i != 10; ++i)
      {
      key = aes_schedule_round(&rcon, key, key);

      _mm_storeu_si128(EK_mm + i,
                       aes_schedule_mangle(key, (12-i) % 4));

      _mm_storeu_si128(DK_mm + (10-i),
                       aes_schedule_mangle_dec(key, (10-i) % 4));
      }

   key = aes_schedule_round(&rcon, key, key);
   _mm_storeu_si128(EK_mm + 10, aes_schedule_mangle_last(key, 2));
   _mm_storeu_si128(DK_mm, aes_schedule_mangle_last_dec(key));
   }

void AES_128_SSSE3::clear()
   {
   zap(EK);
   zap(DK);
   }
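/*
* The AES-192 schedule below uses two overlapping 16-byte loads (at
* keyb and keyb + 8) to pick up the 24-byte key, and each loop
* iteration emits three round keys; aes_schedule_192_smear()
* replicates words across the vector so every aes_schedule_round()
* call sees a full 128-bit input.
*/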
/*
* AES-192 Encryption
*/
void AES_192_SSSE3::encrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&EK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 12));
      }
   }

/*
* AES-192 Decryption
*/
void AES_192_SSSE3::decrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&DK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 12));
      }
   }

/*
* AES-192 Key Schedule
*/
void AES_192_SSSE3::key_schedule(const byte keyb[], size_t)
   {
   __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
                                0x1F8391B9, 0xAF9DEEB6);

   EK.resize(13*4);
   DK.resize(13*4);

   __m128i* EK_mm = reinterpret_cast<__m128i*>(&EK[0]);
   __m128i* DK_mm = reinterpret_cast<__m128i*>(&DK[0]);

   __m128i key1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));
   __m128i key2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb + 8));

   _mm_storeu_si128(DK_mm + 12, _mm_shuffle_epi8(key1, sr[0]));

   key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
   key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);

   _mm_storeu_si128(EK_mm + 0, key1);

   // key2 with 8 high bytes masked off
   __m128i t = _mm_slli_si128(_mm_srli_si128(key2, 8), 8);

   for(size_t i = 0; i != 4; ++i)
      {
      key2 = aes_schedule_round(&rcon, key2, key1);

      _mm_storeu_si128(EK_mm + 3*i+1,
                       aes_schedule_mangle(_mm_alignr_epi8(key2, t, 8), (i+3)%4));
      _mm_storeu_si128(DK_mm + 11-3*i,
                       aes_schedule_mangle_dec(_mm_alignr_epi8(key2, t, 8), (i+3)%4));

      t = aes_schedule_192_smear(key2, t);

      _mm_storeu_si128(EK_mm + 3*i+2,
                       aes_schedule_mangle(t, (i+2)%4));
      _mm_storeu_si128(DK_mm + 10-3*i,
                       aes_schedule_mangle_dec(t, (i+2)%4));

      key2 = aes_schedule_round(&rcon, t, key2);

      if(i == 3)
         {
         _mm_storeu_si128(EK_mm + 3*i+3,
                          aes_schedule_mangle_last(key2, (i+1)%4));
         _mm_storeu_si128(DK_mm + 9-3*i,
                          aes_schedule_mangle_last_dec(key2));
         }
      else
         {
         _mm_storeu_si128(EK_mm + 3*i+3,
                          aes_schedule_mangle(key2, (i+1)%4));
         _mm_storeu_si128(DK_mm + 9-3*i,
                          aes_schedule_mangle_dec(key2, (i+1)%4));
         }

      key1 = key2;
      key2 = aes_schedule_192_smear(key2,
                                    _mm_slli_si128(_mm_srli_si128(t, 8), 8));
      t = _mm_slli_si128(_mm_srli_si128(key2, 8), 8);
      }
   }

void AES_192_SSSE3::clear()
   {
   zap(EK);
   zap(DK);
   }
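/*
* The AES-256 schedule below alternates two kinds of rounds: a normal
* aes_schedule_round() with the round constant, and a call with
* rcon == nullptr that skips the rcon/rotate step and instead feeds in
* _mm_shuffle_epi32(key2, 0xFF), mirroring the extra SubWord-only step
* of the 256-bit key expansion in FIPS-197.
*/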
/*
* AES-256 Encryption
*/
void AES_256_SSSE3::encrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&EK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 14));
      }
   }

/*
* AES-256 Decryption
*/
void AES_256_SSSE3::decrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&DK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 14));
      }
   }

/*
* AES-256 Key Schedule
*/
void AES_256_SSSE3::key_schedule(const byte keyb[], size_t)
   {
   __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
                                0x1F8391B9, 0xAF9DEEB6);

   EK.resize(15*4);
   DK.resize(15*4);

   __m128i* EK_mm = reinterpret_cast<__m128i*>(&EK[0]);
   __m128i* DK_mm = reinterpret_cast<__m128i*>(&DK[0]);

   __m128i key1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));
   __m128i key2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb + 16));

   _mm_storeu_si128(DK_mm + 14, _mm_shuffle_epi8(key1, sr[2]));

   key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
   key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);

   _mm_storeu_si128(EK_mm + 0, key1);
   _mm_storeu_si128(EK_mm + 1, aes_schedule_mangle(key2, 3));

   _mm_storeu_si128(DK_mm + 13, aes_schedule_mangle_dec(key2, 1));

   for(size_t i = 2; i != 14; i += 2)
      {
      __m128i k_t = key2;
      key1 = key2 = aes_schedule_round(&rcon, key2, key1);

      _mm_storeu_si128(EK_mm + i, aes_schedule_mangle(key2, i % 4));
      _mm_storeu_si128(DK_mm + (14-i), aes_schedule_mangle_dec(key2, (i+2) % 4));

      key2 = aes_schedule_round(nullptr, _mm_shuffle_epi32(key2, 0xFF), k_t);
      _mm_storeu_si128(EK_mm + i + 1, aes_schedule_mangle(key2, (i - 1) % 4));
      _mm_storeu_si128(DK_mm + (13-i), aes_schedule_mangle_dec(key2, (i+1) % 4));
      }

   key2 = aes_schedule_round(&rcon, key2, key1);

   _mm_storeu_si128(EK_mm + 14, aes_schedule_mangle_last(key2, 2));
   _mm_storeu_si128(DK_mm + 0, aes_schedule_mangle_last_dec(key2));
   }

void AES_256_SSSE3::clear()
   {
   zap(EK);
   zap(DK);
   }

}
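/*
* Minimal usage sketch (not part of the original file), assuming the
* standard Botan 1.11 BlockCipher interface that these classes
* implement (set_key from SymmetricAlgorithm plus the encrypt_n shown
* above). Callers should check CPUID::has_ssse3() before constructing
* the cipher directly rather than going through the algorithm registry.
*
*   #include <botan/aes_ssse3.h>
*   #include <botan/cpuid.h>
*
*   void encrypt_one_block(const Botan::byte key[16],
*                          const Botan::byte in[16],
*                          Botan::byte out[16])
*      {
*      if(!Botan::CPUID::has_ssse3())
*         return; // fall back to a portable AES-128 implementation
*
*      Botan::AES_128_SSSE3 aes;
*      aes.set_key(key, 16);
*      aes.encrypt_n(in, out, 1); // one 16-byte block
*      }
*/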