Botan  1.11.15
src/lib/block/aes_ni/aes_ni.cpp
Go to the documentation of this file.
00001 /*
00002 * AES using AES-NI instructions
00003 * (C) 2009,2012 Jack Lloyd
00004 *
00005 * Botan is released under the Simplified BSD License (see license.txt)
00006 */
00007 
00008 #include <botan/internal/block_utils.h>
00009 #include <botan/aes_ni.h>
00010 #include <botan/cpuid.h>
00011 #include <wmmintrin.h>
00012 
00013 namespace Botan {
00014 
// Register the three AES-NI classes under the algorithm names "AES-128",
// "AES-192" and "AES-256", each guarded by CPUID::has_aes_ni().
// NOTE(review): the macro definition is not visible in this file. Judging by
// its name the registration only takes effect when the condition is true;
// the role of the "aes_ni" string and of the trailing 16 should be confirmed
// against the macro itself.
00015 BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF(CPUID::has_aes_ni(), AES_128_NI, "AES-128", "aes_ni", 16);
00016 BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF(CPUID::has_aes_ni(), AES_192_NI, "AES-192", "aes_ni", 16);
00017 BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF(CPUID::has_aes_ni(), AES_256_NI, "AES-256", "aes_ni", 16);
00018 
00019 namespace {
00020 
/*
* One step of the AES-128 style key expansion.
*
* key is the round key being expanded. key_with_rcon is the output of
* _mm_aeskeygenassist_si128 supplied by the caller; only its highest
* 32-bit lane is used. Returns the next round key.
*/
__m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
   {
   // Three shift-and-XOR passes leave lane i holding the XOR of the
   // input lanes 0..i (a running XOR over the four 32-bit words).
   const __m128i pass1 = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   const __m128i pass2 = _mm_xor_si128(pass1, _mm_slli_si128(pass1, 4));
   const __m128i pass3 = _mm_xor_si128(pass2, _mm_slli_si128(pass2, 4));

   // Copy the top lane of the assist value into all four lanes
   const __m128i assist = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3));

   return _mm_xor_si128(pass3, assist);
   }
00029 
00030 void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
00031                            u32bit out[], bool last)
00032    {
00033    __m128i key1 = *K1;
00034    __m128i key2 = *K2;
00035 
00036    key2_with_rcon  = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));
00037    key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
00038    key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
00039    key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
00040    key1 = _mm_xor_si128(key1, key2_with_rcon);
00041 
00042    *K1 = key1;
00043    _mm_storeu_si128(reinterpret_cast<__m128i*>(out), key1);
00044 
00045    if(last)
00046       return;
00047 
00048    key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
00049    key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3)));
00050 
00051    *K2 = key2;
00052    out[4] = _mm_cvtsi128_si32(key2);
00053    out[5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
00054    }
00055 
/*
* The second half of the AES-256 key expansion (the other half is the
* same as AES-128 and uses aes_128_key_expansion).
*
* key is the round key being expanded and key2 is the round key that
* was generated immediately before this one. The assist value is
* computed here with a round constant of zero, and lane 2 of it is
* broadcast into the result.
*/
__m128i aes_256_key_expansion(__m128i key, __m128i key2)
   {
   const __m128i assist =
      _mm_shuffle_epi32(_mm_aeskeygenassist_si128(key2, 0x00), _MM_SHUFFLE(2,2,2,2));

   // Running XOR across the four 32-bit lanes of key
   const __m128i pass1 = _mm_xor_si128(key, _mm_slli_si128(key, 4));
   const __m128i pass2 = _mm_xor_si128(pass1, _mm_slli_si128(pass1, 4));
   const __m128i pass3 = _mm_xor_si128(pass2, _mm_slli_si128(pass2, 4));

   return _mm_xor_si128(pass3, assist);
   }
00069 
00070 }
00071 
// Applies _mm_aesenc_si128 with round key K to each of the four block
// variables B0..B3, which must be in scope where the macro is expanded.
// The do/while(0) wrapper makes the expansion usable as one statement
// followed by a semicolon.
00072 #define AES_ENC_4_ROUNDS(K)                     \
00073    do                                           \
00074       {                                         \
00075       B0 = _mm_aesenc_si128(B0, K);             \
00076       B1 = _mm_aesenc_si128(B1, K);             \
00077       B2 = _mm_aesenc_si128(B2, K);             \
00078       B3 = _mm_aesenc_si128(B3, K);             \
00079       } while(0)
00080 
// Applies _mm_aesenclast_si128 (the final encryption round) with round
// key K to each of the four block variables B0..B3, which must be in
// scope where the macro is expanded.
00081 #define AES_ENC_4_LAST_ROUNDS(K)                \
00082    do                                           \
00083       {                                         \
00084       B0 = _mm_aesenclast_si128(B0, K);         \
00085       B1 = _mm_aesenclast_si128(B1, K);         \
00086       B2 = _mm_aesenclast_si128(B2, K);         \
00087       B3 = _mm_aesenclast_si128(B3, K);         \
00088       } while(0)
00089 
// Applies _mm_aesdec_si128 with round key K to each of the four block
// variables B0..B3, which must be in scope where the macro is expanded.
// The do/while(0) wrapper makes the expansion usable as one statement
// followed by a semicolon.
00090 #define AES_DEC_4_ROUNDS(K)                     \
00091    do                                           \
00092       {                                         \
00093       B0 = _mm_aesdec_si128(B0, K);             \
00094       B1 = _mm_aesdec_si128(B1, K);             \
00095       B2 = _mm_aesdec_si128(B2, K);             \
00096       B3 = _mm_aesdec_si128(B3, K);             \
00097       } while(0)
00098 
// Applies _mm_aesdeclast_si128 (the final decryption round) with round
// key K to each of the four block variables B0..B3, which must be in
// scope where the macro is expanded.
00099 #define AES_DEC_4_LAST_ROUNDS(K)                \
00100    do                                           \
00101       {                                         \
00102       B0 = _mm_aesdeclast_si128(B0, K);         \
00103       B1 = _mm_aesdeclast_si128(B1, K);         \
00104       B2 = _mm_aesdeclast_si128(B2, K);         \
00105       B3 = _mm_aesdeclast_si128(B3, K);         \
00106       } while(0)
00107 
00108 /*
00109 * AES-128 Encryption
00110 */
/*
* Encrypts `blocks` 16-byte blocks read from `in` and writes the results
* to `out`, using the 11 round keys stored in EK.
*
* The buffers are accessed only through unaligned 128-bit loads and
* stores. While at least four blocks remain they are processed four at
* a time; the remaining 0-3 blocks go through the single-block loop.
*/
00111 void AES_128_NI::encrypt_n(const byte in[], byte out[], size_t blocks) const
00112    {
00113    const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
00114    __m128i* out_mm = reinterpret_cast<__m128i*>(out);
00115 
00116    const __m128i* key_mm = reinterpret_cast<const __m128i*>(&EK[0]);
00117 
   // Load every round key once, before either loop
00118    __m128i K0  = _mm_loadu_si128(key_mm);
00119    __m128i K1  = _mm_loadu_si128(key_mm + 1);
00120    __m128i K2  = _mm_loadu_si128(key_mm + 2);
00121    __m128i K3  = _mm_loadu_si128(key_mm + 3);
00122    __m128i K4  = _mm_loadu_si128(key_mm + 4);
00123    __m128i K5  = _mm_loadu_si128(key_mm + 5);
00124    __m128i K6  = _mm_loadu_si128(key_mm + 6);
00125    __m128i K7  = _mm_loadu_si128(key_mm + 7);
00126    __m128i K8  = _mm_loadu_si128(key_mm + 8);
00127    __m128i K9  = _mm_loadu_si128(key_mm + 9);
00128    __m128i K10 = _mm_loadu_si128(key_mm + 10);
00129 
   // Four-block path: all four inputs are loaded before any output is stored
00130    while(blocks >= 4)
00131       {
00132       __m128i B0 = _mm_loadu_si128(in_mm + 0);
00133       __m128i B1 = _mm_loadu_si128(in_mm + 1);
00134       __m128i B2 = _mm_loadu_si128(in_mm + 2);
00135       __m128i B3 = _mm_loadu_si128(in_mm + 3);
00136 
      // Initial round key is XORed in directly
00137       B0 = _mm_xor_si128(B0, K0);
00138       B1 = _mm_xor_si128(B1, K0);
00139       B2 = _mm_xor_si128(B2, K0);
00140       B3 = _mm_xor_si128(B3, K0);
00141 
      // Nine aesenc rounds followed by the aesenclast round
00142       AES_ENC_4_ROUNDS(K1);
00143       AES_ENC_4_ROUNDS(K2);
00144       AES_ENC_4_ROUNDS(K3);
00145       AES_ENC_4_ROUNDS(K4);
00146       AES_ENC_4_ROUNDS(K5);
00147       AES_ENC_4_ROUNDS(K6);
00148       AES_ENC_4_ROUNDS(K7);
00149       AES_ENC_4_ROUNDS(K8);
00150       AES_ENC_4_ROUNDS(K9);
00151       AES_ENC_4_LAST_ROUNDS(K10);
00152 
00153       _mm_storeu_si128(out_mm + 0, B0);
00154       _mm_storeu_si128(out_mm + 1, B1);
00155       _mm_storeu_si128(out_mm + 2, B2);
00156       _mm_storeu_si128(out_mm + 3, B3);
00157 
00158       blocks -= 4;
00159       in_mm += 4;
00160       out_mm += 4;
00161       }
00162 
   // Tail: fewer than four blocks left, handled one at a time
00163    for(size_t i = 0; i != blocks; ++i)
00164       {
00165       __m128i B = _mm_loadu_si128(in_mm + i);
00166 
00167       B = _mm_xor_si128(B, K0);
00168 
00169       B = _mm_aesenc_si128(B, K1);
00170       B = _mm_aesenc_si128(B, K2);
00171       B = _mm_aesenc_si128(B, K3);
00172       B = _mm_aesenc_si128(B, K4);
00173       B = _mm_aesenc_si128(B, K5);
00174       B = _mm_aesenc_si128(B, K6);
00175       B = _mm_aesenc_si128(B, K7);
00176       B = _mm_aesenc_si128(B, K8);
00177       B = _mm_aesenc_si128(B, K9);
00178       B = _mm_aesenclast_si128(B, K10);
00179 
00180       _mm_storeu_si128(out_mm + i, B);
00181       }
00182    }
00183 
00184 /*
00185 * AES-128 Decryption
00186 */
/*
* Decrypts `blocks` 16-byte blocks read from `in` and writes the results
* to `out`, using the 11 round keys stored in DK (filled in by
* key_schedule in the order they are applied here).
*
* The buffers are accessed only through unaligned 128-bit loads and
* stores. While at least four blocks remain they are processed four at
* a time; the remaining 0-3 blocks go through the single-block loop.
*/
00187 void AES_128_NI::decrypt_n(const byte in[], byte out[], size_t blocks) const
00188    {
00189    const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
00190    __m128i* out_mm = reinterpret_cast<__m128i*>(out);
00191 
00192    const __m128i* key_mm = reinterpret_cast<const __m128i*>(&DK[0]);
00193 
   // Load every decryption round key once, before either loop
00194    __m128i K0  = _mm_loadu_si128(key_mm);
00195    __m128i K1  = _mm_loadu_si128(key_mm + 1);
00196    __m128i K2  = _mm_loadu_si128(key_mm + 2);
00197    __m128i K3  = _mm_loadu_si128(key_mm + 3);
00198    __m128i K4  = _mm_loadu_si128(key_mm + 4);
00199    __m128i K5  = _mm_loadu_si128(key_mm + 5);
00200    __m128i K6  = _mm_loadu_si128(key_mm + 6);
00201    __m128i K7  = _mm_loadu_si128(key_mm + 7);
00202    __m128i K8  = _mm_loadu_si128(key_mm + 8);
00203    __m128i K9  = _mm_loadu_si128(key_mm + 9);
00204    __m128i K10 = _mm_loadu_si128(key_mm + 10);
00205 
   // Four-block path: all four inputs are loaded before any output is stored
00206    while(blocks >= 4)
00207       {
00208       __m128i B0 = _mm_loadu_si128(in_mm + 0);
00209       __m128i B1 = _mm_loadu_si128(in_mm + 1);
00210       __m128i B2 = _mm_loadu_si128(in_mm + 2);
00211       __m128i B3 = _mm_loadu_si128(in_mm + 3);
00212 
      // Initial round key is XORed in directly
00213       B0 = _mm_xor_si128(B0, K0);
00214       B1 = _mm_xor_si128(B1, K0);
00215       B2 = _mm_xor_si128(B2, K0);
00216       B3 = _mm_xor_si128(B3, K0);
00217 
      // Nine aesdec rounds followed by the aesdeclast round
00218       AES_DEC_4_ROUNDS(K1);
00219       AES_DEC_4_ROUNDS(K2);
00220       AES_DEC_4_ROUNDS(K3);
00221       AES_DEC_4_ROUNDS(K4);
00222       AES_DEC_4_ROUNDS(K5);
00223       AES_DEC_4_ROUNDS(K6);
00224       AES_DEC_4_ROUNDS(K7);
00225       AES_DEC_4_ROUNDS(K8);
00226       AES_DEC_4_ROUNDS(K9);
00227       AES_DEC_4_LAST_ROUNDS(K10);
00228 
00229       _mm_storeu_si128(out_mm + 0, B0);
00230       _mm_storeu_si128(out_mm + 1, B1);
00231       _mm_storeu_si128(out_mm + 2, B2);
00232       _mm_storeu_si128(out_mm + 3, B3);
00233 
00234       blocks -= 4;
00235       in_mm += 4;
00236       out_mm += 4;
00237       }
00238 
   // Tail: fewer than four blocks left, handled one at a time
00239    for(size_t i = 0; i != blocks; ++i)
00240       {
00241       __m128i B = _mm_loadu_si128(in_mm + i);
00242 
00243       B = _mm_xor_si128(B, K0);
00244 
00245       B = _mm_aesdec_si128(B, K1);
00246       B = _mm_aesdec_si128(B, K2);
00247       B = _mm_aesdec_si128(B, K3);
00248       B = _mm_aesdec_si128(B, K4);
00249       B = _mm_aesdec_si128(B, K5);
00250       B = _mm_aesdec_si128(B, K6);
00251       B = _mm_aesdec_si128(B, K7);
00252       B = _mm_aesdec_si128(B, K8);
00253       B = _mm_aesdec_si128(B, K9);
00254       B = _mm_aesdeclast_si128(B, K10);
00255 
00256       _mm_storeu_si128(out_mm + i, B);
00257       }
00258    }
00259 
00260 /*
00261 * AES-128 Key Schedule
00262 */
00263 void AES_128_NI::key_schedule(const byte key[], size_t)
00264    {
00265    EK.resize(44);
00266    DK.resize(44);
00267 
00268    #define AES_128_key_exp(K, RCON) \
00269       aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON))
00270 
00271    __m128i K0  = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
00272    __m128i K1  = AES_128_key_exp(K0, 0x01);
00273    __m128i K2  = AES_128_key_exp(K1, 0x02);
00274    __m128i K3  = AES_128_key_exp(K2, 0x04);
00275    __m128i K4  = AES_128_key_exp(K3, 0x08);
00276    __m128i K5  = AES_128_key_exp(K4, 0x10);
00277    __m128i K6  = AES_128_key_exp(K5, 0x20);
00278    __m128i K7  = AES_128_key_exp(K6, 0x40);
00279    __m128i K8  = AES_128_key_exp(K7, 0x80);
00280    __m128i K9  = AES_128_key_exp(K8, 0x1B);
00281    __m128i K10 = AES_128_key_exp(K9, 0x36);
00282 
00283    __m128i* EK_mm = reinterpret_cast<__m128i*>(&EK[0]);
00284    _mm_storeu_si128(EK_mm     , K0);
00285    _mm_storeu_si128(EK_mm +  1, K1);
00286    _mm_storeu_si128(EK_mm +  2, K2);
00287    _mm_storeu_si128(EK_mm +  3, K3);
00288    _mm_storeu_si128(EK_mm +  4, K4);
00289    _mm_storeu_si128(EK_mm +  5, K5);
00290    _mm_storeu_si128(EK_mm +  6, K6);
00291    _mm_storeu_si128(EK_mm +  7, K7);
00292    _mm_storeu_si128(EK_mm +  8, K8);
00293    _mm_storeu_si128(EK_mm +  9, K9);
00294    _mm_storeu_si128(EK_mm + 10, K10);
00295 
00296    // Now generate decryption keys
00297 
00298    __m128i* DK_mm = reinterpret_cast<__m128i*>(&DK[0]);
00299    _mm_storeu_si128(DK_mm     , K10);
00300    _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(K9));
00301    _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(K8));
00302    _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(K7));
00303    _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(K6));
00304    _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(K5));
00305    _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(K4));
00306    _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(K3));
00307    _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(K2));
00308    _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(K1));
00309    _mm_storeu_si128(DK_mm + 10, K0);
00310    }
00311 
00312 /*
00313 * Clear memory of sensitive data
00314 */
00315 void AES_128_NI::clear()
00316    {
00317    zap(EK);
00318    zap(DK);
00319    }
00320 
00321 /*
00322 * AES-192 Encryption
00323 */
/*
* Encrypts `blocks` 16-byte blocks read from `in` and writes the results
* to `out`, using the 13 round keys stored in EK.
*
* The buffers are accessed only through unaligned 128-bit loads and
* stores. While at least four blocks remain they are processed four at
* a time; the remaining 0-3 blocks go through the single-block loop.
*/
00324 void AES_192_NI::encrypt_n(const byte in[], byte out[], size_t blocks) const
00325    {
00326    const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
00327    __m128i* out_mm = reinterpret_cast<__m128i*>(out);
00328 
00329    const __m128i* key_mm = reinterpret_cast<const __m128i*>(&EK[0]);
00330 
   // Load every round key once, before either loop
00331    __m128i K0  = _mm_loadu_si128(key_mm);
00332    __m128i K1  = _mm_loadu_si128(key_mm + 1);
00333    __m128i K2  = _mm_loadu_si128(key_mm + 2);
00334    __m128i K3  = _mm_loadu_si128(key_mm + 3);
00335    __m128i K4  = _mm_loadu_si128(key_mm + 4);
00336    __m128i K5  = _mm_loadu_si128(key_mm + 5);
00337    __m128i K6  = _mm_loadu_si128(key_mm + 6);
00338    __m128i K7  = _mm_loadu_si128(key_mm + 7);
00339    __m128i K8  = _mm_loadu_si128(key_mm + 8);
00340    __m128i K9  = _mm_loadu_si128(key_mm + 9);
00341    __m128i K10 = _mm_loadu_si128(key_mm + 10);
00342    __m128i K11 = _mm_loadu_si128(key_mm + 11);
00343    __m128i K12 = _mm_loadu_si128(key_mm + 12);
00344 
   // Four-block path: all four inputs are loaded before any output is stored
00345    while(blocks >= 4)
00346       {
00347       __m128i B0 = _mm_loadu_si128(in_mm + 0);
00348       __m128i B1 = _mm_loadu_si128(in_mm + 1);
00349       __m128i B2 = _mm_loadu_si128(in_mm + 2);
00350       __m128i B3 = _mm_loadu_si128(in_mm + 3);
00351 
      // Initial round key is XORed in directly
00352       B0 = _mm_xor_si128(B0, K0);
00353       B1 = _mm_xor_si128(B1, K0);
00354       B2 = _mm_xor_si128(B2, K0);
00355       B3 = _mm_xor_si128(B3, K0);
00356 
      // Eleven aesenc rounds followed by the aesenclast round
00357       AES_ENC_4_ROUNDS(K1);
00358       AES_ENC_4_ROUNDS(K2);
00359       AES_ENC_4_ROUNDS(K3);
00360       AES_ENC_4_ROUNDS(K4);
00361       AES_ENC_4_ROUNDS(K5);
00362       AES_ENC_4_ROUNDS(K6);
00363       AES_ENC_4_ROUNDS(K7);
00364       AES_ENC_4_ROUNDS(K8);
00365       AES_ENC_4_ROUNDS(K9);
00366       AES_ENC_4_ROUNDS(K10);
00367       AES_ENC_4_ROUNDS(K11);
00368       AES_ENC_4_LAST_ROUNDS(K12);
00369 
00370       _mm_storeu_si128(out_mm + 0, B0);
00371       _mm_storeu_si128(out_mm + 1, B1);
00372       _mm_storeu_si128(out_mm + 2, B2);
00373       _mm_storeu_si128(out_mm + 3, B3);
00374 
00375       blocks -= 4;
00376       in_mm += 4;
00377       out_mm += 4;
00378       }
00379 
   // Tail: fewer than four blocks left, handled one at a time
00380    for(size_t i = 0; i != blocks; ++i)
00381       {
00382       __m128i B = _mm_loadu_si128(in_mm + i);
00383 
00384       B = _mm_xor_si128(B, K0);
00385 
00386       B = _mm_aesenc_si128(B, K1);
00387       B = _mm_aesenc_si128(B, K2);
00388       B = _mm_aesenc_si128(B, K3);
00389       B = _mm_aesenc_si128(B, K4);
00390       B = _mm_aesenc_si128(B, K5);
00391       B = _mm_aesenc_si128(B, K6);
00392       B = _mm_aesenc_si128(B, K7);
00393       B = _mm_aesenc_si128(B, K8);
00394       B = _mm_aesenc_si128(B, K9);
00395       B = _mm_aesenc_si128(B, K10);
00396       B = _mm_aesenc_si128(B, K11);
00397       B = _mm_aesenclast_si128(B, K12);
00398 
00399       _mm_storeu_si128(out_mm + i, B);
00400       }
00401    }
00402 
00403 /*
00404 * AES-192 Decryption
00405 */
/*
* Decrypts `blocks` 16-byte blocks read from `in` and writes the results
* to `out`, using the 13 round keys stored in DK (filled in by
* key_schedule in the order they are applied here).
*
* The buffers are accessed only through unaligned 128-bit loads and
* stores. While at least four blocks remain they are processed four at
* a time; the remaining 0-3 blocks go through the single-block loop.
*/
00406 void AES_192_NI::decrypt_n(const byte in[], byte out[], size_t blocks) const
00407    {
00408    const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
00409    __m128i* out_mm = reinterpret_cast<__m128i*>(out);
00410 
00411    const __m128i* key_mm = reinterpret_cast<const __m128i*>(&DK[0]);
00412 
   // Load every decryption round key once, before either loop
00413    __m128i K0  = _mm_loadu_si128(key_mm);
00414    __m128i K1  = _mm_loadu_si128(key_mm + 1);
00415    __m128i K2  = _mm_loadu_si128(key_mm + 2);
00416    __m128i K3  = _mm_loadu_si128(key_mm + 3);
00417    __m128i K4  = _mm_loadu_si128(key_mm + 4);
00418    __m128i K5  = _mm_loadu_si128(key_mm + 5);
00419    __m128i K6  = _mm_loadu_si128(key_mm + 6);
00420    __m128i K7  = _mm_loadu_si128(key_mm + 7);
00421    __m128i K8  = _mm_loadu_si128(key_mm + 8);
00422    __m128i K9  = _mm_loadu_si128(key_mm + 9);
00423    __m128i K10 = _mm_loadu_si128(key_mm + 10);
00424    __m128i K11 = _mm_loadu_si128(key_mm + 11);
00425    __m128i K12 = _mm_loadu_si128(key_mm + 12);
00426 
   // Four-block path: all four inputs are loaded before any output is stored
00427    while(blocks >= 4)
00428       {
00429       __m128i B0 = _mm_loadu_si128(in_mm + 0);
00430       __m128i B1 = _mm_loadu_si128(in_mm + 1);
00431       __m128i B2 = _mm_loadu_si128(in_mm + 2);
00432       __m128i B3 = _mm_loadu_si128(in_mm + 3);
00433 
      // Initial round key is XORed in directly
00434       B0 = _mm_xor_si128(B0, K0);
00435       B1 = _mm_xor_si128(B1, K0);
00436       B2 = _mm_xor_si128(B2, K0);
00437       B3 = _mm_xor_si128(B3, K0);
00438 
      // Eleven aesdec rounds followed by the aesdeclast round
00439       AES_DEC_4_ROUNDS(K1);
00440       AES_DEC_4_ROUNDS(K2);
00441       AES_DEC_4_ROUNDS(K3);
00442       AES_DEC_4_ROUNDS(K4);
00443       AES_DEC_4_ROUNDS(K5);
00444       AES_DEC_4_ROUNDS(K6);
00445       AES_DEC_4_ROUNDS(K7);
00446       AES_DEC_4_ROUNDS(K8);
00447       AES_DEC_4_ROUNDS(K9);
00448       AES_DEC_4_ROUNDS(K10);
00449       AES_DEC_4_ROUNDS(K11);
00450       AES_DEC_4_LAST_ROUNDS(K12);
00451 
00452       _mm_storeu_si128(out_mm + 0, B0);
00453       _mm_storeu_si128(out_mm + 1, B1);
00454       _mm_storeu_si128(out_mm + 2, B2);
00455       _mm_storeu_si128(out_mm + 3, B3);
00456 
00457       blocks -= 4;
00458       in_mm += 4;
00459       out_mm += 4;
00460       }
00461 
   // Tail: fewer than four blocks left, handled one at a time
00462    for(size_t i = 0; i != blocks; ++i)
00463       {
00464       __m128i B = _mm_loadu_si128(in_mm + i);
00465 
00466       B = _mm_xor_si128(B, K0);
00467 
00468       B = _mm_aesdec_si128(B, K1);
00469       B = _mm_aesdec_si128(B, K2);
00470       B = _mm_aesdec_si128(B, K3);
00471       B = _mm_aesdec_si128(B, K4);
00472       B = _mm_aesdec_si128(B, K5);
00473       B = _mm_aesdec_si128(B, K6);
00474       B = _mm_aesdec_si128(B, K7);
00475       B = _mm_aesdec_si128(B, K8);
00476       B = _mm_aesdec_si128(B, K9);
00477       B = _mm_aesdec_si128(B, K10);
00478       B = _mm_aesdec_si128(B, K11);
00479       B = _mm_aesdeclast_si128(B, K12);
00480 
00481       _mm_storeu_si128(out_mm + i, B);
00482       }
00483    }
00484 
00485 /*
00486 * AES-192 Key Schedule
00487 */
00488 void AES_192_NI::key_schedule(const byte key[], size_t)
00489    {
00490    EK.resize(52);
00491    DK.resize(52);
00492 
00493    __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
00494    __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8));
00495    K1 = _mm_srli_si128(K1, 8);
00496 
00497    load_le(&EK[0], key, 6);
00498 
00499    #define AES_192_key_exp(RCON, EK_OFF)                         \
00500      aes_192_key_expansion(&K0, &K1,                             \
00501                            _mm_aeskeygenassist_si128(K1, RCON),  \
00502                            &EK[EK_OFF], EK_OFF == 48)
00503 
00504    AES_192_key_exp(0x01, 6);
00505    AES_192_key_exp(0x02, 12);
00506    AES_192_key_exp(0x04, 18);
00507    AES_192_key_exp(0x08, 24);
00508    AES_192_key_exp(0x10, 30);
00509    AES_192_key_exp(0x20, 36);
00510    AES_192_key_exp(0x40, 42);
00511    AES_192_key_exp(0x80, 48);
00512 
00513    #undef AES_192_key_exp
00514 
00515    // Now generate decryption keys
00516    const __m128i* EK_mm = reinterpret_cast<const __m128i*>(&EK[0]);
00517 
00518    __m128i* DK_mm = reinterpret_cast<__m128i*>(&DK[0]);
00519    _mm_storeu_si128(DK_mm     , _mm_loadu_si128(EK_mm + 12));
00520    _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11)));
00521    _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10)));
00522    _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9)));
00523    _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8)));
00524    _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7)));
00525    _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6)));
00526    _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5)));
00527    _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4)));
00528    _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3)));
00529    _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2)));
00530    _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1)));
00531    _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0));
00532    }
00533 
00534 /*
00535 * Clear memory of sensitive data
00536 */
00537 void AES_192_NI::clear()
00538    {
00539    zap(EK);
00540    zap(DK);
00541    }
00542 
00543 /*
00544 * AES-256 Encryption
00545 */
/*
* Encrypts `blocks` 16-byte blocks read from `in` and writes the results
* to `out`, using the 15 round keys stored in EK.
*
* The buffers are accessed only through unaligned 128-bit loads and
* stores. While at least four blocks remain they are processed four at
* a time; the remaining 0-3 blocks go through the single-block loop.
*/
00546 void AES_256_NI::encrypt_n(const byte in[], byte out[], size_t blocks) const
00547    {
00548    const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
00549    __m128i* out_mm = reinterpret_cast<__m128i*>(out);
00550 
00551    const __m128i* key_mm = reinterpret_cast<const __m128i*>(&EK[0]);
00552 
   // Load every round key once, before either loop
00553    __m128i K0  = _mm_loadu_si128(key_mm);
00554    __m128i K1  = _mm_loadu_si128(key_mm + 1);
00555    __m128i K2  = _mm_loadu_si128(key_mm + 2);
00556    __m128i K3  = _mm_loadu_si128(key_mm + 3);
00557    __m128i K4  = _mm_loadu_si128(key_mm + 4);
00558    __m128i K5  = _mm_loadu_si128(key_mm + 5);
00559    __m128i K6  = _mm_loadu_si128(key_mm + 6);
00560    __m128i K7  = _mm_loadu_si128(key_mm + 7);
00561    __m128i K8  = _mm_loadu_si128(key_mm + 8);
00562    __m128i K9  = _mm_loadu_si128(key_mm + 9);
00563    __m128i K10 = _mm_loadu_si128(key_mm + 10);
00564    __m128i K11 = _mm_loadu_si128(key_mm + 11);
00565    __m128i K12 = _mm_loadu_si128(key_mm + 12);
00566    __m128i K13 = _mm_loadu_si128(key_mm + 13);
00567    __m128i K14 = _mm_loadu_si128(key_mm + 14);
00568 
   // Four-block path: all four inputs are loaded before any output is stored
00569    while(blocks >= 4)
00570       {
00571       __m128i B0 = _mm_loadu_si128(in_mm + 0);
00572       __m128i B1 = _mm_loadu_si128(in_mm + 1);
00573       __m128i B2 = _mm_loadu_si128(in_mm + 2);
00574       __m128i B3 = _mm_loadu_si128(in_mm + 3);
00575 
      // Initial round key is XORed in directly
00576       B0 = _mm_xor_si128(B0, K0);
00577       B1 = _mm_xor_si128(B1, K0);
00578       B2 = _mm_xor_si128(B2, K0);
00579       B3 = _mm_xor_si128(B3, K0);
00580 
      // Thirteen aesenc rounds followed by the aesenclast round
00581       AES_ENC_4_ROUNDS(K1);
00582       AES_ENC_4_ROUNDS(K2);
00583       AES_ENC_4_ROUNDS(K3);
00584       AES_ENC_4_ROUNDS(K4);
00585       AES_ENC_4_ROUNDS(K5);
00586       AES_ENC_4_ROUNDS(K6);
00587       AES_ENC_4_ROUNDS(K7);
00588       AES_ENC_4_ROUNDS(K8);
00589       AES_ENC_4_ROUNDS(K9);
00590       AES_ENC_4_ROUNDS(K10);
00591       AES_ENC_4_ROUNDS(K11);
00592       AES_ENC_4_ROUNDS(K12);
00593       AES_ENC_4_ROUNDS(K13);
00594       AES_ENC_4_LAST_ROUNDS(K14);
00595 
00596       _mm_storeu_si128(out_mm + 0, B0);
00597       _mm_storeu_si128(out_mm + 1, B1);
00598       _mm_storeu_si128(out_mm + 2, B2);
00599       _mm_storeu_si128(out_mm + 3, B3);
00600 
00601       blocks -= 4;
00602       in_mm += 4;
00603       out_mm += 4;
00604       }
00605 
   // Tail: fewer than four blocks left, handled one at a time
00606    for(size_t i = 0; i != blocks; ++i)
00607       {
00608       __m128i B = _mm_loadu_si128(in_mm + i);
00609 
00610       B = _mm_xor_si128(B, K0);
00611 
00612       B = _mm_aesenc_si128(B, K1);
00613       B = _mm_aesenc_si128(B, K2);
00614       B = _mm_aesenc_si128(B, K3);
00615       B = _mm_aesenc_si128(B, K4);
00616       B = _mm_aesenc_si128(B, K5);
00617       B = _mm_aesenc_si128(B, K6);
00618       B = _mm_aesenc_si128(B, K7);
00619       B = _mm_aesenc_si128(B, K8);
00620       B = _mm_aesenc_si128(B, K9);
00621       B = _mm_aesenc_si128(B, K10);
00622       B = _mm_aesenc_si128(B, K11);
00623       B = _mm_aesenc_si128(B, K12);
00624       B = _mm_aesenc_si128(B, K13);
00625       B = _mm_aesenclast_si128(B, K14);
00626 
00627       _mm_storeu_si128(out_mm + i, B);
00628       }
00629    }
00630 
00631 /*
00632 * AES-256 Decryption
00633 */
/*
* Decrypts `blocks` 16-byte blocks read from `in` and writes the results
* to `out`, using the 15 round keys stored in DK (filled in by
* key_schedule in the order they are applied here).
*
* The buffers are accessed only through unaligned 128-bit loads and
* stores. While at least four blocks remain they are processed four at
* a time; the remaining 0-3 blocks go through the single-block loop.
*/
00634 void AES_256_NI::decrypt_n(const byte in[], byte out[], size_t blocks) const
00635    {
00636    const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
00637    __m128i* out_mm = reinterpret_cast<__m128i*>(out);
00638 
00639    const __m128i* key_mm = reinterpret_cast<const __m128i*>(&DK[0]);
00640 
   // Load every decryption round key once, before either loop
00641    __m128i K0  = _mm_loadu_si128(key_mm);
00642    __m128i K1  = _mm_loadu_si128(key_mm + 1);
00643    __m128i K2  = _mm_loadu_si128(key_mm + 2);
00644    __m128i K3  = _mm_loadu_si128(key_mm + 3);
00645    __m128i K4  = _mm_loadu_si128(key_mm + 4);
00646    __m128i K5  = _mm_loadu_si128(key_mm + 5);
00647    __m128i K6  = _mm_loadu_si128(key_mm + 6);
00648    __m128i K7  = _mm_loadu_si128(key_mm + 7);
00649    __m128i K8  = _mm_loadu_si128(key_mm + 8);
00650    __m128i K9  = _mm_loadu_si128(key_mm + 9);
00651    __m128i K10 = _mm_loadu_si128(key_mm + 10);
00652    __m128i K11 = _mm_loadu_si128(key_mm + 11);
00653    __m128i K12 = _mm_loadu_si128(key_mm + 12);
00654    __m128i K13 = _mm_loadu_si128(key_mm + 13);
00655    __m128i K14 = _mm_loadu_si128(key_mm + 14);
00656 
   // Four-block path: all four inputs are loaded before any output is stored
00657    while(blocks >= 4)
00658       {
00659       __m128i B0 = _mm_loadu_si128(in_mm + 0);
00660       __m128i B1 = _mm_loadu_si128(in_mm + 1);
00661       __m128i B2 = _mm_loadu_si128(in_mm + 2);
00662       __m128i B3 = _mm_loadu_si128(in_mm + 3);
00663 
      // Initial round key is XORed in directly
00664       B0 = _mm_xor_si128(B0, K0);
00665       B1 = _mm_xor_si128(B1, K0);
00666       B2 = _mm_xor_si128(B2, K0);
00667       B3 = _mm_xor_si128(B3, K0);
00668 
      // Thirteen aesdec rounds followed by the aesdeclast round
00669       AES_DEC_4_ROUNDS(K1);
00670       AES_DEC_4_ROUNDS(K2);
00671       AES_DEC_4_ROUNDS(K3);
00672       AES_DEC_4_ROUNDS(K4);
00673       AES_DEC_4_ROUNDS(K5);
00674       AES_DEC_4_ROUNDS(K6);
00675       AES_DEC_4_ROUNDS(K7);
00676       AES_DEC_4_ROUNDS(K8);
00677       AES_DEC_4_ROUNDS(K9);
00678       AES_DEC_4_ROUNDS(K10);
00679       AES_DEC_4_ROUNDS(K11);
00680       AES_DEC_4_ROUNDS(K12);
00681       AES_DEC_4_ROUNDS(K13);
00682       AES_DEC_4_LAST_ROUNDS(K14);
00683 
00684       _mm_storeu_si128(out_mm + 0, B0);
00685       _mm_storeu_si128(out_mm + 1, B1);
00686       _mm_storeu_si128(out_mm + 2, B2);
00687       _mm_storeu_si128(out_mm + 3, B3);
00688 
00689       blocks -= 4;
00690       in_mm += 4;
00691       out_mm += 4;
00692       }
00693 
   // Tail: fewer than four blocks left, handled one at a time
00694    for(size_t i = 0; i != blocks; ++i)
00695       {
00696       __m128i B = _mm_loadu_si128(in_mm + i);
00697 
00698       B = _mm_xor_si128(B, K0);
00699 
00700       B = _mm_aesdec_si128(B, K1);
00701       B = _mm_aesdec_si128(B, K2);
00702       B = _mm_aesdec_si128(B, K3);
00703       B = _mm_aesdec_si128(B, K4);
00704       B = _mm_aesdec_si128(B, K5);
00705       B = _mm_aesdec_si128(B, K6);
00706       B = _mm_aesdec_si128(B, K7);
00707       B = _mm_aesdec_si128(B, K8);
00708       B = _mm_aesdec_si128(B, K9);
00709       B = _mm_aesdec_si128(B, K10);
00710       B = _mm_aesdec_si128(B, K11);
00711       B = _mm_aesdec_si128(B, K12);
00712       B = _mm_aesdec_si128(B, K13);
00713       B = _mm_aesdeclast_si128(B, K14);
00714 
00715       _mm_storeu_si128(out_mm + i, B);
00716       }
00717    }
00718 
00719 /*
00720 * AES-256 Key Schedule
00721 */
00722 void AES_256_NI::key_schedule(const byte key[], size_t)
00723    {
00724    EK.resize(60);
00725    DK.resize(60);
00726 
00727    __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key));
00728    __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16));
00729 
00730    __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01));
00731    __m128i K3 = aes_256_key_expansion(K1, K2);
00732 
00733    __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02));
00734    __m128i K5 = aes_256_key_expansion(K3, K4);
00735 
00736    __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04));
00737    __m128i K7 = aes_256_key_expansion(K5, K6);
00738 
00739    __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08));
00740    __m128i K9 = aes_256_key_expansion(K7, K8);
00741 
00742    __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10));
00743    __m128i K11 = aes_256_key_expansion(K9, K10);
00744 
00745    __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20));
00746    __m128i K13 = aes_256_key_expansion(K11, K12);
00747 
00748    __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40));
00749 
00750    __m128i* EK_mm = reinterpret_cast<__m128i*>(&EK[0]);
00751    _mm_storeu_si128(EK_mm     , K0);
00752    _mm_storeu_si128(EK_mm +  1, K1);
00753    _mm_storeu_si128(EK_mm +  2, K2);
00754    _mm_storeu_si128(EK_mm +  3, K3);
00755    _mm_storeu_si128(EK_mm +  4, K4);
00756    _mm_storeu_si128(EK_mm +  5, K5);
00757    _mm_storeu_si128(EK_mm +  6, K6);
00758    _mm_storeu_si128(EK_mm +  7, K7);
00759    _mm_storeu_si128(EK_mm +  8, K8);
00760    _mm_storeu_si128(EK_mm +  9, K9);
00761    _mm_storeu_si128(EK_mm + 10, K10);
00762    _mm_storeu_si128(EK_mm + 11, K11);
00763    _mm_storeu_si128(EK_mm + 12, K12);
00764    _mm_storeu_si128(EK_mm + 13, K13);
00765    _mm_storeu_si128(EK_mm + 14, K14);
00766 
00767    // Now generate decryption keys
00768    __m128i* DK_mm = reinterpret_cast<__m128i*>(&DK[0]);
00769    _mm_storeu_si128(DK_mm     , K14);
00770    _mm_storeu_si128(DK_mm +  1, _mm_aesimc_si128(K13));
00771    _mm_storeu_si128(DK_mm +  2, _mm_aesimc_si128(K12));
00772    _mm_storeu_si128(DK_mm +  3, _mm_aesimc_si128(K11));
00773    _mm_storeu_si128(DK_mm +  4, _mm_aesimc_si128(K10));
00774    _mm_storeu_si128(DK_mm +  5, _mm_aesimc_si128(K9));
00775    _mm_storeu_si128(DK_mm +  6, _mm_aesimc_si128(K8));
00776    _mm_storeu_si128(DK_mm +  7, _mm_aesimc_si128(K7));
00777    _mm_storeu_si128(DK_mm +  8, _mm_aesimc_si128(K6));
00778    _mm_storeu_si128(DK_mm +  9, _mm_aesimc_si128(K5));
00779    _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
00780    _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
00781    _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
00782    _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
00783    _mm_storeu_si128(DK_mm + 14, K0);
00784    }
00785 
00786 /*
00787 * Clear memory of sensitive data
00788 */
00789 void AES_256_NI::clear()
00790    {
00791    zap(EK);
00792    zap(DK);
00793    }
00794 
// The four-block round helper macros are only meant for the encrypt_n /
// decrypt_n bodies above; remove them so they do not leak past this file's
// implementation.
00795 #undef AES_ENC_4_ROUNDS
00796 #undef AES_ENC_4_LAST_ROUNDS
00797 #undef AES_DEC_4_ROUNDS
00798 #undef AES_DEC_4_LAST_ROUNDS
00799 
00800 }