Botan  1.11.15
src/lib/block/idea_sse2/idea_sse2.cpp
Go to the documentation of this file.
00001 /*
00002 * IDEA in SSE2
00003 * (C) 2009 Jack Lloyd
00004 *
00005 * Botan is released under the Simplified BSD License (see license.txt)
00006 */
00007 
00008 #include <botan/internal/block_utils.h>
00009 #include <botan/idea_sse2.h>
00010 #include <botan/cpuid.h>
00011 #include <emmintrin.h>
00012 
00013 namespace Botan {
00014 
// Registration of the IDEA_SSE2 class, guarded by the runtime check
// CPUID::has_sse2().
// NOTE(review): the macro is defined outside this file; presumably "IDEA" is
// the algorithm name, "sse2" the provider name and 64 a preference value used
// when choosing between providers - confirm against the macro definition.
BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF(CPUID::has_sse2(), IDEA_SSE2, "IDEA", "sse2", 64);
00016 
00017 namespace {
00018 
00019 inline __m128i mul(__m128i X, u16bit K_16)
00020    {
00021    const __m128i zeros = _mm_set1_epi16(0);
00022    const __m128i ones = _mm_set1_epi16(1);
00023 
00024    const __m128i K = _mm_set1_epi16(K_16);
00025 
00026    const __m128i X_is_zero = _mm_cmpeq_epi16(X, zeros);
00027    const __m128i K_is_zero = _mm_cmpeq_epi16(K, zeros);
00028 
00029    const __m128i mul_lo = _mm_mullo_epi16(X, K);
00030    const __m128i mul_hi = _mm_mulhi_epu16(X, K);
00031 
00032    __m128i T = _mm_sub_epi16(mul_lo, mul_hi);
00033 
00034    // Unsigned compare; cmp = 1 if mul_lo < mul_hi else 0
00035    const __m128i subs = _mm_subs_epu16(mul_hi, mul_lo);
00036    const __m128i cmp = _mm_min_epu8(
00037      _mm_or_si128(subs, _mm_srli_epi16(subs, 8)), ones);
00038 
00039    T = _mm_add_epi16(T, cmp);
00040 
00041    /* Selection: if X[i] is zero then assign 1-K
00042                  if K is zero then assign 1-X[i]
00043 
00044       Could if() off value of K_16 for the second, but this gives a
00045       constant time implementation which is a nice bonus.
00046    */
00047 
00048    T = _mm_or_si128(
00049       _mm_andnot_si128(X_is_zero, T),
00050       _mm_and_si128(_mm_sub_epi16(ones, K), X_is_zero));
00051 
00052    T = _mm_or_si128(
00053       _mm_andnot_si128(K_is_zero, T),
00054       _mm_and_si128(_mm_sub_epi16(ones, X), K_is_zero));
00055 
00056    return T;
00057    }
00058 
/*
* Word shuffle applied to each intermediate register of transpose_in:
* permute the four 16-bit words of the high half and of the low half
* with the same pattern, then exchange the two middle 32-bit lanes.
*/
inline __m128i transpose_in_shuffle(__m128i V)
   {
   V = _mm_shufflehi_epi16(V, _MM_SHUFFLE(1, 3, 0, 2));
   V = _mm_shufflelo_epi16(V, _MM_SHUFFLE(1, 3, 0, 2));
   return _mm_shuffle_epi32(V, _MM_SHUFFLE(3, 1, 2, 0));
   }

/*
* 4x8 matrix transpose
*
* On entry B0..B3 hold 32 consecutive 16-bit words, i.e. eight groups
* of four words. On exit register Bk holds word k of each group:
* lane j of Bk is input word 4*j+k.
*
* FIXME: why is the extra round of unpack_epi32 needed here? The
* inverse in transpose_out does not need it. Something with the
* shuffle? Removing that extra unpack could easily save 3-4 cycles per
* block, and would also help a lot with register pressure on 32-bit x86
*/
void transpose_in(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
   {
   // First round: interleave 32-bit lanes of each register pair
   const __m128i hi01 = _mm_unpackhi_epi32(B0, B1);
   const __m128i lo01 = _mm_unpacklo_epi32(B0, B1);
   const __m128i hi23 = _mm_unpackhi_epi32(B2, B3);
   const __m128i lo23 = _mm_unpacklo_epi32(B2, B3);

   // Second round of interleaving followed by the word shuffle. After
   // this, w01_* hold words 0 and 1 and w23_* hold words 2 and 3;
   // the _a registers cover groups 0-3 and the _b registers groups 4-7.
   const __m128i w01_a = transpose_in_shuffle(_mm_unpacklo_epi32(hi01, lo01));
   const __m128i w23_a = transpose_in_shuffle(_mm_unpackhi_epi32(hi01, lo01));
   const __m128i w01_b = transpose_in_shuffle(_mm_unpacklo_epi32(hi23, lo23));
   const __m128i w23_b = transpose_in_shuffle(_mm_unpackhi_epi32(hi23, lo23));

   // Join the 64-bit halves so each output holds a single word index
   B0 = _mm_unpacklo_epi64(w01_a, w01_b);
   B1 = _mm_unpackhi_epi64(w01_a, w01_b);
   B2 = _mm_unpacklo_epi64(w23_a, w23_b);
   B3 = _mm_unpackhi_epi64(w23_a, w23_b);
   }
00099 
/*
* Word shuffle applied to each intermediate register of transpose_out:
* exchange the two middle 32-bit lanes, then exchange the two middle
* 16-bit words within the high half and within the low half.
*/
inline __m128i transpose_out_shuffle(__m128i V)
   {
   V = _mm_shuffle_epi32(V, _MM_SHUFFLE(3, 1, 2, 0));
   V = _mm_shufflehi_epi16(V, _MM_SHUFFLE(3, 1, 2, 0));
   return _mm_shufflelo_epi16(V, _MM_SHUFFLE(3, 1, 2, 0));
   }

/*
* 4x8 matrix transpose (reverse)
*
* Undoes transpose_in: on entry lane j of the k-th argument is word k
* of group j; on exit the four registers, read in order, hold the 32
* words as eight consecutive groups of four (output word 4*j+k is
* lane j of the k-th argument).
*/
void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
   {
   // Pair up the 64-bit halves (lanes 0-3 / lanes 4-7) and interleave
   // the 16-bit words of each pair
   const __m128i lo01 = transpose_out_shuffle(_mm_unpacklo_epi64(B0, B1));
   const __m128i lo23 = transpose_out_shuffle(_mm_unpacklo_epi64(B2, B3));
   const __m128i hi01 = transpose_out_shuffle(_mm_unpackhi_epi64(B0, B1));
   const __m128i hi23 = transpose_out_shuffle(_mm_unpackhi_epi64(B2, B3));

   // Interleave 32-bit lanes so each group's four words become adjacent
   B0 = _mm_unpacklo_epi32(lo01, lo23);
   B1 = _mm_unpackhi_epi32(lo01, lo23);
   B2 = _mm_unpacklo_epi32(hi01, hi23);
   B3 = _mm_unpackhi_epi32(hi01, hi23);
   }
00130 
00131 /*
00132 * IDEA encryption/decryption in SSE2
00133 */
00134 void idea_op_8(const byte in[64], byte out[64], const u16bit EK[52])
00135    {
00136    const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
00137 
00138    __m128i B0 = _mm_loadu_si128(in_mm + 0);
00139    __m128i B1 = _mm_loadu_si128(in_mm + 1);
00140    __m128i B2 = _mm_loadu_si128(in_mm + 2);
00141    __m128i B3 = _mm_loadu_si128(in_mm + 3);
00142 
00143    transpose_in(B0, B1, B2, B3);
00144 
00145    // byte swap
00146    B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
00147    B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
00148    B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
00149    B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
00150 
00151    for(size_t i = 0; i != 8; ++i)
00152       {
00153       B0 = mul(B0, EK[6*i+0]);
00154       B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6*i+1]));
00155       B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6*i+2]));
00156       B3 = mul(B3, EK[6*i+3]);
00157 
00158       __m128i T0 = B2;
00159 
00160       B2 = _mm_xor_si128(B2, B0);
00161       B2 = mul(B2, EK[6*i+4]);
00162 
00163       __m128i T1 = B1;
00164 
00165       B1 = _mm_xor_si128(B1, B3);
00166       B1 = _mm_add_epi16(B1, B2);
00167       B1 = mul(B1, EK[6*i+5]);
00168 
00169       B2 = _mm_add_epi16(B2, B1);
00170 
00171       B0 = _mm_xor_si128(B0, B1);
00172       B1 = _mm_xor_si128(B1, T0);
00173       B3 = _mm_xor_si128(B3, B2);
00174       B2 = _mm_xor_si128(B2, T1);
00175       }
00176 
00177    B0 = mul(B0, EK[48]);
00178    B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50]));
00179    B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49]));
00180    B3 = mul(B3, EK[51]);
00181 
00182    // byte swap
00183    B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
00184    B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
00185    B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
00186    B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
00187 
00188    transpose_out(B0, B2, B1, B3);
00189 
00190    __m128i* out_mm = reinterpret_cast<__m128i*>(out);
00191 
00192    _mm_storeu_si128(out_mm + 0, B0);
00193    _mm_storeu_si128(out_mm + 1, B2);
00194    _mm_storeu_si128(out_mm + 2, B1);
00195    _mm_storeu_si128(out_mm + 3, B3);
00196    }
00197 
00198 }
00199 
00200 /*
00201 * IDEA Encryption
00202 */
00203 void IDEA_SSE2::encrypt_n(const byte in[], byte out[], size_t blocks) const
00204    {
00205    const u16bit* KS = &this->get_EK()[0];
00206 
00207    while(blocks >= 8)
00208       {
00209       idea_op_8(in, out, KS);
00210       in += 8 * BLOCK_SIZE;
00211       out += 8 * BLOCK_SIZE;
00212       blocks -= 8;
00213       }
00214 
00215    if(blocks)
00216      IDEA::encrypt_n(in, out, blocks);
00217    }
00218 
00219 /*
00220 * IDEA Decryption
00221 */
00222 void IDEA_SSE2::decrypt_n(const byte in[], byte out[], size_t blocks) const
00223    {
00224    const u16bit* KS = &this->get_DK()[0];
00225 
00226    while(blocks >= 8)
00227       {
00228       idea_op_8(in, out, KS);
00229       in += 8 * BLOCK_SIZE;
00230       out += 8 * BLOCK_SIZE;
00231       blocks -= 8;
00232       }
00233 
00234    if(blocks)
00235      IDEA::decrypt_n(in, out, blocks);
00236    }
00237 
00238 }