Botan  1.11.15
src/lib/block/aes_ssse3/aes_ssse3.cpp
/*
* AES using SSSE3
* (C) 2010 Jack Lloyd
*
* This is more or less a direct translation of public domain x86-64
* assembly written by Mike Hamburg, described in "Accelerating AES
* with Vector Permute Instructions" (CHES 2009). His original code is
* available at http://crypto.stanford.edu/vpaes/
*
* Botan is released under the Simplified BSD License (see license.txt)
*/
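/*
* The core idea of the vector permute ("vpaes") approach: _mm_shuffle_epi8
* acts as sixteen parallel 16-entry table lookups, so the AES S-box can be
* evaluated by splitting each byte into its low and high nibble, looking
* both up in small in-register tables, and combining the results. No
* secret-dependent memory loads are performed, which avoids the
* cache-timing leaks of conventional table-based AES implementations.
*/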

#include <botan/internal/block_utils.h>
#include <botan/aes_ssse3.h>
#include <botan/cpuid.h>
#include <tmmintrin.h>

namespace Botan {

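// Register the SSSE3 implementations with the algorithm registry under
// provider name "ssse3", but only when CPUID reports SSSE3 support at
// runtime; the trailing value appears to be the provider preference used
// when several implementations of the same cipher are available.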
BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF(CPUID::has_ssse3(), AES_128_SSSE3, "AES-128", "ssse3", 64);
BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF(CPUID::has_ssse3(), AES_192_SSSE3, "AES-192", "ssse3", 64);
BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF(CPUID::has_ssse3(), AES_256_SSSE3, "AES-256", "ssse3", 64);

namespace {

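/*
* Constants from Hamburg's vpaes code. low_nibs masks off the low four
* bits of each byte; the k_ipt tables map the input into the transformed
* field representation, k_inv1/k_inv2 drive the inversion step of the
* S-box, sb1u/sb1t are S-box output tables, mc_forward holds the byte
* rotations used for MixColumns, and sr holds the ShiftRows-style byte
* permutations indexed by round number mod 4.
*/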
const __m128i low_nibs = _mm_set1_epi8(0x0F);

const __m128i k_ipt1 = _mm_set_epi32(
   0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);
const __m128i k_ipt2 = _mm_set_epi32(
   0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);

const __m128i k_inv1 = _mm_set_epi32(
   0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);
const __m128i k_inv2 = _mm_set_epi32(
   0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);

const __m128i sb1u = _mm_set_epi32(
   0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);
const __m128i sb1t = _mm_set_epi32(
   0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);

const __m128i mc_forward[4] = {
   _mm_set_epi32(0x0C0F0E0D, 0x080B0A09, 0x04070605, 0x00030201),
   _mm_set_epi32(0x00030201, 0x0C0F0E0D, 0x080B0A09, 0x04070605),
   _mm_set_epi32(0x04070605, 0x00030201, 0x0C0F0E0D, 0x080B0A09),
   _mm_set_epi32(0x080B0A09, 0x04070605, 0x00030201, 0x0C0F0E0D)
};

const __m128i sr[4] = {
   _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100),
   _mm_set_epi32(0x0B06010C, 0x07020D08, 0x030E0904, 0x0F0A0500),
   _mm_set_epi32(0x070E050C, 0x030A0108, 0x0F060D04, 0x0B020900),
   _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00),
};

#define mm_xor3(x, y, z) _mm_xor_si128(x, _mm_xor_si128(y, z))

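/*
* Apply a linear byte transform: split each byte of the input into its low
* and high nibble, run both through _mm_shuffle_epi8 table lookups, and
* XOR the two results together.
*/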
__m128i aes_schedule_transform(__m128i input,
                               __m128i table_1,
                               __m128i table_2)
   {
   __m128i i_1 = _mm_and_si128(low_nibs, input);
   __m128i i_2 = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input), 4);

   input = _mm_and_si128(low_nibs, input);

   return _mm_xor_si128(
      _mm_shuffle_epi8(table_1, i_1),
      _mm_shuffle_epi8(table_2, i_2));
   }

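/*
* Convert a key-schedule state word into the "mangled" round-key form the
* encryption rounds consume: XOR in the 0x5B constant used throughout the
* schedule, fold in three successive byte rotations via mc_forward[0],
* then apply the ShiftRows-style permutation selected by the round number.
*/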
__m128i aes_schedule_mangle(__m128i k, byte round_no)
   {
   __m128i t = _mm_shuffle_epi8(_mm_xor_si128(k, _mm_set1_epi8(0x5B)),
                                mc_forward[0]);

   __m128i t2 = t;

   t = _mm_shuffle_epi8(t, mc_forward[0]);

   t2 = mm_xor3(t2, t, _mm_shuffle_epi8(t, mc_forward[0]));

   return _mm_shuffle_epi8(t2, sr[round_no % 4]);
   }

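/*
* Helper for the 192-bit key schedule: spread ("smear") selected 32-bit
* words of x and y across the state, since AES-192 produces round keys in
* 24-byte chunks that straddle 16-byte registers.
*/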
__m128i aes_schedule_192_smear(__m128i x, __m128i y)
   {
   return mm_xor3(y,
                  _mm_shuffle_epi32(x, 0xFE),
                  _mm_shuffle_epi32(y, 0x80));
   }

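/*
* As aes_schedule_mangle, but for decryption round keys: the dsk tables
* push the key through an inverse-MixColumns-style transform so that
* aes_ssse3_decrypt can use the stored keys directly (the usual
* "equivalent inverse cipher" arrangement).
*/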
__m128i aes_schedule_mangle_dec(__m128i k, byte round_no)
   {
   const __m128i dsk[8] = {
      _mm_set_epi32(0x4AED9334, 0x82255BFC, 0xB6116FC8, 0x7ED9A700),
      _mm_set_epi32(0x8BB89FAC, 0xE9DAFDCE, 0x45765162, 0x27143300),
      _mm_set_epi32(0x4622EE8A, 0xADC90561, 0x27438FEB, 0xCCA86400),
      _mm_set_epi32(0x73AEE13C, 0xBD602FF2, 0x815C13CE, 0x4F92DD00),
      _mm_set_epi32(0xF83F3EF9, 0xFA3D3CFB, 0x03C4C502, 0x01C6C700),
      _mm_set_epi32(0xA5526A9D, 0x7384BC4B, 0xEE1921D6, 0x38CFF700),
      _mm_set_epi32(0xA080D3F3, 0x10306343, 0xE3C390B0, 0x53732000),
      _mm_set_epi32(0x2F45AEC4, 0x8CE60D67, 0xA0CA214B, 0x036982E8)
   };

   __m128i t = aes_schedule_transform(k, dsk[0], dsk[1]);
   __m128i output = _mm_shuffle_epi8(t, mc_forward[0]);

   t = aes_schedule_transform(t, dsk[2], dsk[3]);
   output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);

   t = aes_schedule_transform(t, dsk[4], dsk[5]);
   output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);

   t = aes_schedule_transform(t, dsk[6], dsk[7]);
   output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);

   return _mm_shuffle_epi8(output, sr[round_no % 4]);
   }

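/*
* Produce the final encryption round key: apply the ShiftRows-style
* permutation, add the 0x5B constant, then use the output transform
* tables to map the key back out of the transformed representation.
* aes_schedule_mangle_last_dec does the same for the final decryption
* round key using the "deskew" tables.
*/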
__m128i aes_schedule_mangle_last(__m128i k, byte round_no)
   {
   const __m128i out_tr1 = _mm_set_epi32(
      0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);
   const __m128i out_tr2 = _mm_set_epi32(
      0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);

   k = _mm_shuffle_epi8(k, sr[round_no % 4]);
   k = _mm_xor_si128(k, _mm_set1_epi8(0x5B));
   return aes_schedule_transform(k, out_tr1, out_tr2);
   }

__m128i aes_schedule_mangle_last_dec(__m128i k)
   {
   const __m128i deskew1 = _mm_set_epi32(
      0x1DFEB95A, 0x5DBEF91A, 0x07E4A340, 0x47A4E300);
   const __m128i deskew2 = _mm_set_epi32(
      0x2841C2AB, 0xF49D1E77, 0x5F36B5DC, 0x83EA6900);

   k = _mm_xor_si128(k, _mm_set1_epi8(0x5B));
   return aes_schedule_transform(k, deskew1, deskew2);
   }

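/*
* One round of the key schedule. When rcon is non-null this is a "full"
* round: the next round constant byte is XORed in, rcon is advanced, and
* the top word of input1 is broadcast and byte-rotated (RotWord). In both
* cases the function then smears input2 across the register and XORs in
* SubWord, computed with the same nibble-lookup S-box construction used by
* the cipher rounds.
*/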
__m128i aes_schedule_round(__m128i* rcon, __m128i input1, __m128i input2)
   {
   if(rcon)
      {
      input2 = _mm_xor_si128(_mm_alignr_epi8(_mm_setzero_si128(), *rcon, 15),
                             input2);

      *rcon = _mm_alignr_epi8(*rcon, *rcon, 15); // next rcon

      input1 = _mm_shuffle_epi32(input1, 0xFF); // rotate
      input1 = _mm_alignr_epi8(input1, input1, 1);
      }

   __m128i smeared = _mm_xor_si128(input2, _mm_slli_si128(input2, 4));
   smeared = mm_xor3(smeared, _mm_slli_si128(smeared, 8), _mm_set1_epi8(0x5B));

   __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input1), 4);

   input1 = _mm_and_si128(low_nibs, input1);

   __m128i t2 = _mm_shuffle_epi8(k_inv2, input1);

   input1 = _mm_xor_si128(input1, t);

   __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
   __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, input1));

   __m128i t5 = _mm_xor_si128(input1, _mm_shuffle_epi8(k_inv1, t3));
   __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

   return mm_xor3(_mm_shuffle_epi8(sb1u, t5),
                  _mm_shuffle_epi8(sb1t, t6),
                  smeared);
   }

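/*
* Encrypt one block. The input is first mapped into the transformed
* representation (k_ipt tables) and XORed with the first round key. Each
* round then decomposes the state into nibbles, performs the S-box via
* in-register table lookups, and folds ShiftRows/MixColumns in through the
* mc_forward/mc_backward byte permutations; the last round instead uses
* the sbou/sbot output tables and the final sr permutation.
*/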
__m128i aes_ssse3_encrypt(__m128i B, const __m128i* keys, size_t rounds)
   {
   const __m128i sb2u = _mm_set_epi32(
      0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);
   const __m128i sb2t = _mm_set_epi32(
      0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);

   const __m128i sbou = _mm_set_epi32(
      0x15AABF7A, 0xC502A878, 0xD0D26D17, 0x6FBDC700);
   const __m128i sbot = _mm_set_epi32(
      0x8E1E90D1, 0x412B35FA, 0xCFE474A5, 0x5FBB6A00);

   const __m128i mc_backward[4] = {
      _mm_set_epi32(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003),
      _mm_set_epi32(0x0A09080B, 0x06050407, 0x02010003, 0x0E0D0C0F),
      _mm_set_epi32(0x06050407, 0x02010003, 0x0E0D0C0F, 0x0A09080B),
      _mm_set_epi32(0x02010003, 0x0E0D0C0F, 0x0A09080B, 0x06050407),
   };

   B = mm_xor3(_mm_shuffle_epi8(k_ipt1, _mm_and_si128(low_nibs, B)),
               _mm_shuffle_epi8(k_ipt2,
                                _mm_srli_epi32(
                                   _mm_andnot_si128(low_nibs, B),
                                   4)),
               _mm_loadu_si128(keys));

   for(size_t r = 1; ; ++r)
      {
      const __m128i K = _mm_loadu_si128(keys + r);

      __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);

      B = _mm_and_si128(low_nibs, B);

      __m128i t2 = _mm_shuffle_epi8(k_inv2, B);

      B = _mm_xor_si128(B, t);

      __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
      __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));

      __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
      __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

      if(r == rounds)
         {
         B = _mm_shuffle_epi8(
            mm_xor3(_mm_shuffle_epi8(sbou, t5),
                    _mm_shuffle_epi8(sbot, t6),
                    K),
            sr[r % 4]);

         return B;
         }

      __m128i t7 = mm_xor3(_mm_shuffle_epi8(sb1t, t6),
                           _mm_shuffle_epi8(sb1u, t5),
                           K);

      __m128i t8 = mm_xor3(_mm_shuffle_epi8(sb2t, t6),
                           _mm_shuffle_epi8(sb2u, t5),
                           _mm_shuffle_epi8(t7, mc_forward[r % 4]));

      B = mm_xor3(_mm_shuffle_epi8(t8, mc_forward[r % 4]),
                  _mm_shuffle_epi8(t7, mc_backward[r % 4]),
                  t8);
      }
   }

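/*
* Decrypt one block. Structurally the mirror image of aes_ssse3_encrypt:
* the input transform uses the k_dipt tables, the rounds use the
* decryption output tables (sb9, sbd, sbb, sbe), and the MixColumns
* permutation mc is byte-rotated after every round rather than indexed by
* r % 4.
*/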
__m128i aes_ssse3_decrypt(__m128i B, const __m128i* keys, size_t rounds)
   {
   const __m128i k_dipt1 = _mm_set_epi32(
      0x154A411E, 0x114E451A, 0x0F505B04, 0x0B545F00);
   const __m128i k_dipt2 = _mm_set_epi32(
      0x12771772, 0xF491F194, 0x86E383E6, 0x60056500);

   const __m128i sb9u = _mm_set_epi32(
      0xCAD51F50, 0x4F994CC9, 0x851C0353, 0x9A86D600);
   const __m128i sb9t = _mm_set_epi32(
      0x725E2C9E, 0xB2FBA565, 0xC03B1789, 0xECD74900);

   const __m128i sbeu = _mm_set_epi32(
      0x22426004, 0x64B4F6B0, 0x46F29296, 0x26D4D000);
   const __m128i sbet = _mm_set_epi32(
      0x9467F36B, 0x98593E32, 0x0C55A6CD, 0xFFAAC100);

   const __m128i sbdu = _mm_set_epi32(
      0xF56E9B13, 0x882A4439, 0x7D57CCDF, 0xE6B1A200);
   const __m128i sbdt = _mm_set_epi32(
      0x2931180D, 0x15DEEFD3, 0x3CE2FAF7, 0x24C6CB00);

   const __m128i sbbu = _mm_set_epi32(
      0x602646F6, 0xB0F2D404, 0xD0226492, 0x96B44200);
   const __m128i sbbt = _mm_set_epi32(
      0xF3FF0C3E, 0x3255AA6B, 0xC19498A6, 0xCD596700);

   __m128i mc = mc_forward[3];

   __m128i t =
      _mm_shuffle_epi8(k_dipt2,
                       _mm_srli_epi32(
                          _mm_andnot_si128(low_nibs, B),
                          4));

   B = mm_xor3(t, _mm_loadu_si128(keys),
               _mm_shuffle_epi8(k_dipt1, _mm_and_si128(B, low_nibs)));

   for(size_t r = 1; ; ++r)
      {
      const __m128i K = _mm_loadu_si128(keys + r);

      t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);

      B = _mm_and_si128(low_nibs, B);

      __m128i t2 = _mm_shuffle_epi8(k_inv2, B);

      B = _mm_xor_si128(B, t);

      __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
      __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));
      __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
      __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

      if(r == rounds)
         {
         const __m128i sbou = _mm_set_epi32(
            0xC7AA6DB9, 0xD4943E2D, 0x1387EA53, 0x7EF94000);
         const __m128i sbot = _mm_set_epi32(
            0xCA4B8159, 0xD8C58E9C, 0x12D7560F, 0x93441D00);

         __m128i x = _mm_shuffle_epi8(sbou, t5);
         __m128i y = _mm_shuffle_epi8(sbot, t6);
         x = _mm_xor_si128(x, K);
         x = _mm_xor_si128(x, y);

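         // Select the final ShiftRows permutation so that it matches the
         // permutation applied when the last decryption round key was
         // stored: this evaluates to sr[2] for 10 and 14 rounds and to
         // sr[0] for 12 rounds.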
         const u32bit which_sr = ((((rounds - 1) << 4) ^ 48) & 48) / 16;
         return _mm_shuffle_epi8(x, sr[which_sr]);
         }

      __m128i t8 = _mm_xor_si128(_mm_shuffle_epi8(sb9t, t6),
                                 _mm_xor_si128(_mm_shuffle_epi8(sb9u, t5), K));

      __m128i t9 = mm_xor3(_mm_shuffle_epi8(t8, mc),
                           _mm_shuffle_epi8(sbdu, t5),
                           _mm_shuffle_epi8(sbdt, t6));

      __m128i t12 = _mm_xor_si128(
         _mm_xor_si128(
            _mm_shuffle_epi8(t9, mc),
            _mm_shuffle_epi8(sbbu, t5)),
         _mm_shuffle_epi8(sbbt, t6));

      B = _mm_xor_si128(_mm_xor_si128(_mm_shuffle_epi8(t12, mc),
                                      _mm_shuffle_epi8(sbeu, t5)),
                        _mm_shuffle_epi8(sbet, t6));

      mc = _mm_alignr_epi8(mc, mc, 12);
      }
   }

}

/*
* AES-128 Encryption
*/
void AES_128_SSSE3::encrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&EK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 10));
      }
   }

/*
* AES-128 Decryption
*/
void AES_128_SSSE3::decrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&DK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 10));
      }
   }

/*
* AES-128 Key Schedule
*/
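/*
* rcon packs the round constants consumed one byte at a time by
* _mm_alignr_epi8 in aes_schedule_round. EK holds the 11 encryption round
* keys (44 words); DK holds the decryption round keys in reverse order, so
* aes_ssse3_decrypt can read them forward. The raw key, permuted by sr[2],
* serves as the last decryption round key.
*/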
void AES_128_SSSE3::key_schedule(const byte keyb[], size_t)
   {
   __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
                                0x1F8391B9, 0xAF9DEEB6);

   __m128i key = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));

   EK.resize(11*4);
   DK.resize(11*4);

   __m128i* EK_mm = reinterpret_cast<__m128i*>(&EK[0]);
   __m128i* DK_mm = reinterpret_cast<__m128i*>(&DK[0]);

   _mm_storeu_si128(DK_mm + 10, _mm_shuffle_epi8(key, sr[2]));

   key = aes_schedule_transform(key, k_ipt1, k_ipt2);

   _mm_storeu_si128(EK_mm, key);

   for(size_t i = 1; i != 10; ++i)
      {
      key = aes_schedule_round(&rcon, key, key);

      _mm_storeu_si128(EK_mm + i,
                       aes_schedule_mangle(key, (12-i) % 4));

      _mm_storeu_si128(DK_mm + (10-i),
                       aes_schedule_mangle_dec(key, (10-i) % 4));
      }

   key = aes_schedule_round(&rcon, key, key);
   _mm_storeu_si128(EK_mm + 10, aes_schedule_mangle_last(key, 2));
   _mm_storeu_si128(DK_mm, aes_schedule_mangle_last_dec(key));
   }

void AES_128_SSSE3::clear()
   {
   zap(EK);
   zap(DK);
   }

/*
* AES-192 Encryption
*/
void AES_192_SSSE3::encrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&EK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 12));
      }
   }

/*
* AES-192 Decryption
*/
void AES_192_SSSE3::decrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&DK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 12));
      }
   }

/*
* AES-192 Key Schedule
*/
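/*
* AES-192 expands a 24-byte key into 13 round keys. Because 24 bytes do
* not fill an even number of 128-bit registers, each loop iteration below
* produces three round keys from one and a half registers of schedule
* state, using aes_schedule_192_smear to carry the odd half across
* iterations.
*/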
void AES_192_SSSE3::key_schedule(const byte keyb[], size_t)
   {
   __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
                                0x1F8391B9, 0xAF9DEEB6);

   EK.resize(13*4);
   DK.resize(13*4);

   __m128i* EK_mm = reinterpret_cast<__m128i*>(&EK[0]);
   __m128i* DK_mm = reinterpret_cast<__m128i*>(&DK[0]);

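   // The second load deliberately overlaps the first: key2 covers key
   // bytes 8..23, so its upper half holds the final 8 bytes of the
   // 24-byte key.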
   __m128i key1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));
   __m128i key2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>((keyb + 8)));

   _mm_storeu_si128(DK_mm + 12, _mm_shuffle_epi8(key1, sr[0]));

   key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
   key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);

   _mm_storeu_si128(EK_mm + 0, key1);

   // key2 with 8 high bytes masked off
   __m128i t = _mm_slli_si128(_mm_srli_si128(key2, 8), 8);

   for(size_t i = 0; i != 4; ++i)
      {
      key2 = aes_schedule_round(&rcon, key2, key1);

      _mm_storeu_si128(EK_mm + 3*i+1,
                       aes_schedule_mangle(_mm_alignr_epi8(key2, t, 8), (i+3)%4));
      _mm_storeu_si128(DK_mm + 11-3*i,
                       aes_schedule_mangle_dec(_mm_alignr_epi8(key2, t, 8), (i+3)%4));

      t = aes_schedule_192_smear(key2, t);

      _mm_storeu_si128(EK_mm + 3*i+2,
                       aes_schedule_mangle(t, (i+2)%4));
      _mm_storeu_si128(DK_mm + 10-3*i,
                       aes_schedule_mangle_dec(t, (i+2)%4));

      key2 = aes_schedule_round(&rcon, t, key2);

      if(i == 3)
         {
         _mm_storeu_si128(EK_mm + 3*i+3,
                          aes_schedule_mangle_last(key2, (i+1)%4));
         _mm_storeu_si128(DK_mm + 9-3*i,
                          aes_schedule_mangle_last_dec(key2));
         }
      else
         {
         _mm_storeu_si128(EK_mm + 3*i+3,
                          aes_schedule_mangle(key2, (i+1)%4));
         _mm_storeu_si128(DK_mm + 9-3*i,
                          aes_schedule_mangle_dec(key2, (i+1)%4));
         }

      key1 = key2;
      key2 = aes_schedule_192_smear(key2,
                                    _mm_slli_si128(_mm_srli_si128(t, 8), 8));
      t = _mm_slli_si128(_mm_srli_si128(key2, 8), 8);
      }
   }

void AES_192_SSSE3::clear()
   {
   zap(EK);
   zap(DK);
   }

/*
* AES-256 Encryption
*/
void AES_256_SSSE3::encrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&EK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 14));
      }
   }

/*
* AES-256 Decryption
*/
void AES_256_SSSE3::decrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);

   const __m128i* keys = reinterpret_cast<const __m128i*>(&DK[0]);

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);
      _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 14));
      }
   }

/*
* AES-256 Key Schedule
*/
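/*
* AES-256 expands two 16-byte key halves into 15 round keys. Each loop
* iteration derives two round keys: the first through a full schedule
* round (with round constant and rotation), the second by running
* aes_schedule_round with a null rcon on the broadcast top word, which
* reproduces AES-256's SubWord-without-RotWord step.
*/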
void AES_256_SSSE3::key_schedule(const byte keyb[], size_t)
   {
   __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
                                0x1F8391B9, 0xAF9DEEB6);

   EK.resize(15*4);
   DK.resize(15*4);

   __m128i* EK_mm = reinterpret_cast<__m128i*>(&EK[0]);
   __m128i* DK_mm = reinterpret_cast<__m128i*>(&DK[0]);

   __m128i key1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));
   __m128i key2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>((keyb + 16)));

   _mm_storeu_si128(DK_mm + 14, _mm_shuffle_epi8(key1, sr[2]));

   key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
   key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);

   _mm_storeu_si128(EK_mm + 0, key1);
   _mm_storeu_si128(EK_mm + 1, aes_schedule_mangle(key2, 3));

   _mm_storeu_si128(DK_mm + 13, aes_schedule_mangle_dec(key2, 1));

   for(size_t i = 2; i != 14; i += 2)
      {
      __m128i k_t = key2;
      key1 = key2 = aes_schedule_round(&rcon, key2, key1);

      _mm_storeu_si128(EK_mm + i, aes_schedule_mangle(key2, i % 4));
      _mm_storeu_si128(DK_mm + (14-i), aes_schedule_mangle_dec(key2, (i+2) % 4));

      key2 = aes_schedule_round(nullptr, _mm_shuffle_epi32(key2, 0xFF), k_t);
      _mm_storeu_si128(EK_mm + i + 1, aes_schedule_mangle(key2, (i - 1) % 4));
      _mm_storeu_si128(DK_mm + (13-i), aes_schedule_mangle_dec(key2, (i+1) % 4));
      }

   key2 = aes_schedule_round(&rcon, key2, key1);

   _mm_storeu_si128(EK_mm + 14, aes_schedule_mangle_last(key2, 2));
   _mm_storeu_si128(DK_mm + 0, aes_schedule_mangle_last_dec(key2));
   }

void AES_256_SSSE3::clear()
   {
   zap(EK);
   zap(DK);
   }

}
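
/*
* Minimal usage sketch (not part of the original file): the SSSE3 classes
* are normally selected through Botan's algorithm registry, but they can
* also be instantiated directly when SSSE3 is available. The key and data
* values here are placeholders.
*
*    #include <botan/aes_ssse3.h>
*    #include <botan/cpuid.h>
*
*    void example()
*       {
*       if(!Botan::CPUID::has_ssse3())
*          return; // fall back to another AES implementation
*
*       Botan::AES_128_SSSE3 aes;
*
*       const Botan::byte key[16] = { 0 };  // placeholder key
*       Botan::byte block[16] = { 0 };      // one 16-byte block
*
*       aes.set_key(key, sizeof(key));
*       aes.encrypt(block, block);          // encrypt a single block in place
*       }
*/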