// Botan 1.11.15
00001 /* 00002 * IDEA in SSE2 00003 * (C) 2009 Jack Lloyd 00004 * 00005 * Botan is released under the Simplified BSD License (see license.txt) 00006 */ 00007 00008 #include <botan/internal/block_utils.h> 00009 #include <botan/idea_sse2.h> 00010 #include <botan/cpuid.h> 00011 #include <emmintrin.h> 00012 00013 namespace Botan { 00014 00015 BOTAN_REGISTER_BLOCK_CIPHER_NOARGS_IF(CPUID::has_sse2(), IDEA_SSE2, "IDEA", "sse2", 64); 00016 00017 namespace { 00018 00019 inline __m128i mul(__m128i X, u16bit K_16) 00020 { 00021 const __m128i zeros = _mm_set1_epi16(0); 00022 const __m128i ones = _mm_set1_epi16(1); 00023 00024 const __m128i K = _mm_set1_epi16(K_16); 00025 00026 const __m128i X_is_zero = _mm_cmpeq_epi16(X, zeros); 00027 const __m128i K_is_zero = _mm_cmpeq_epi16(K, zeros); 00028 00029 const __m128i mul_lo = _mm_mullo_epi16(X, K); 00030 const __m128i mul_hi = _mm_mulhi_epu16(X, K); 00031 00032 __m128i T = _mm_sub_epi16(mul_lo, mul_hi); 00033 00034 // Unsigned compare; cmp = 1 if mul_lo < mul_hi else 0 00035 const __m128i subs = _mm_subs_epu16(mul_hi, mul_lo); 00036 const __m128i cmp = _mm_min_epu8( 00037 _mm_or_si128(subs, _mm_srli_epi16(subs, 8)), ones); 00038 00039 T = _mm_add_epi16(T, cmp); 00040 00041 /* Selection: if X[i] is zero then assign 1-K 00042 if K is zero then assign 1-X[i] 00043 00044 Could if() off value of K_16 for the second, but this gives a 00045 constant time implementation which is a nice bonus. 00046 */ 00047 00048 T = _mm_or_si128( 00049 _mm_andnot_si128(X_is_zero, T), 00050 _mm_and_si128(_mm_sub_epi16(ones, K), X_is_zero)); 00051 00052 T = _mm_or_si128( 00053 _mm_andnot_si128(K_is_zero, T), 00054 _mm_and_si128(_mm_sub_epi16(ones, X), K_is_zero)); 00055 00056 return T; 00057 } 00058 00059 /* 00060 * 4x8 matrix transpose 00061 * 00062 * FIXME: why do I need the extra set of unpack_epi32 here? Inverse in 00063 * transpose_out doesn't need it. Something with the shuffle? 
Removing 00064 * that extra unpack could easily save 3-4 cycles per block, and would 00065 * also help a lot with register pressure on 32-bit x86 00066 */ 00067 void transpose_in(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) 00068 { 00069 __m128i T0 = _mm_unpackhi_epi32(B0, B1); 00070 __m128i T1 = _mm_unpacklo_epi32(B0, B1); 00071 __m128i T2 = _mm_unpackhi_epi32(B2, B3); 00072 __m128i T3 = _mm_unpacklo_epi32(B2, B3); 00073 00074 __m128i T4 = _mm_unpacklo_epi32(T0, T1); 00075 __m128i T5 = _mm_unpackhi_epi32(T0, T1); 00076 __m128i T6 = _mm_unpacklo_epi32(T2, T3); 00077 __m128i T7 = _mm_unpackhi_epi32(T2, T3); 00078 00079 T0 = _mm_shufflehi_epi16(T4, _MM_SHUFFLE(1, 3, 0, 2)); 00080 T1 = _mm_shufflehi_epi16(T5, _MM_SHUFFLE(1, 3, 0, 2)); 00081 T2 = _mm_shufflehi_epi16(T6, _MM_SHUFFLE(1, 3, 0, 2)); 00082 T3 = _mm_shufflehi_epi16(T7, _MM_SHUFFLE(1, 3, 0, 2)); 00083 00084 T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(1, 3, 0, 2)); 00085 T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(1, 3, 0, 2)); 00086 T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(1, 3, 0, 2)); 00087 T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(1, 3, 0, 2)); 00088 00089 T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0)); 00090 T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0)); 00091 T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0)); 00092 T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0)); 00093 00094 B0 = _mm_unpacklo_epi64(T0, T2); 00095 B1 = _mm_unpackhi_epi64(T0, T2); 00096 B2 = _mm_unpacklo_epi64(T1, T3); 00097 B3 = _mm_unpackhi_epi64(T1, T3); 00098 } 00099 00100 /* 00101 * 4x8 matrix transpose (reverse) 00102 */ 00103 void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) 00104 { 00105 __m128i T0 = _mm_unpacklo_epi64(B0, B1); 00106 __m128i T1 = _mm_unpacklo_epi64(B2, B3); 00107 __m128i T2 = _mm_unpackhi_epi64(B0, B1); 00108 __m128i T3 = _mm_unpackhi_epi64(B2, B3); 00109 00110 T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0)); 00111 T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0)); 00112 
T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0)); 00113 T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0)); 00114 00115 T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0)); 00116 T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0)); 00117 T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0)); 00118 T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0)); 00119 00120 T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0)); 00121 T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0)); 00122 T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0)); 00123 T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0)); 00124 00125 B0 = _mm_unpacklo_epi32(T0, T1); 00126 B1 = _mm_unpackhi_epi32(T0, T1); 00127 B2 = _mm_unpacklo_epi32(T2, T3); 00128 B3 = _mm_unpackhi_epi32(T2, T3); 00129 } 00130 00131 /* 00132 * IDEA encryption/decryption in SSE2 00133 */ 00134 void idea_op_8(const byte in[64], byte out[64], const u16bit EK[52]) 00135 { 00136 const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); 00137 00138 __m128i B0 = _mm_loadu_si128(in_mm + 0); 00139 __m128i B1 = _mm_loadu_si128(in_mm + 1); 00140 __m128i B2 = _mm_loadu_si128(in_mm + 2); 00141 __m128i B3 = _mm_loadu_si128(in_mm + 3); 00142 00143 transpose_in(B0, B1, B2, B3); 00144 00145 // byte swap 00146 B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8)); 00147 B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8)); 00148 B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8)); 00149 B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8)); 00150 00151 for(size_t i = 0; i != 8; ++i) 00152 { 00153 B0 = mul(B0, EK[6*i+0]); 00154 B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6*i+1])); 00155 B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6*i+2])); 00156 B3 = mul(B3, EK[6*i+3]); 00157 00158 __m128i T0 = B2; 00159 00160 B2 = _mm_xor_si128(B2, B0); 00161 B2 = mul(B2, EK[6*i+4]); 00162 00163 __m128i T1 = B1; 00164 00165 B1 = _mm_xor_si128(B1, B3); 00166 B1 = _mm_add_epi16(B1, B2); 00167 B1 = mul(B1, 
EK[6*i+5]); 00168 00169 B2 = _mm_add_epi16(B2, B1); 00170 00171 B0 = _mm_xor_si128(B0, B1); 00172 B1 = _mm_xor_si128(B1, T0); 00173 B3 = _mm_xor_si128(B3, B2); 00174 B2 = _mm_xor_si128(B2, T1); 00175 } 00176 00177 B0 = mul(B0, EK[48]); 00178 B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50])); 00179 B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49])); 00180 B3 = mul(B3, EK[51]); 00181 00182 // byte swap 00183 B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8)); 00184 B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8)); 00185 B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8)); 00186 B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8)); 00187 00188 transpose_out(B0, B2, B1, B3); 00189 00190 __m128i* out_mm = reinterpret_cast<__m128i*>(out); 00191 00192 _mm_storeu_si128(out_mm + 0, B0); 00193 _mm_storeu_si128(out_mm + 1, B2); 00194 _mm_storeu_si128(out_mm + 2, B1); 00195 _mm_storeu_si128(out_mm + 3, B3); 00196 } 00197 00198 } 00199 00200 /* 00201 * IDEA Encryption 00202 */ 00203 void IDEA_SSE2::encrypt_n(const byte in[], byte out[], size_t blocks) const 00204 { 00205 const u16bit* KS = &this->get_EK()[0]; 00206 00207 while(blocks >= 8) 00208 { 00209 idea_op_8(in, out, KS); 00210 in += 8 * BLOCK_SIZE; 00211 out += 8 * BLOCK_SIZE; 00212 blocks -= 8; 00213 } 00214 00215 if(blocks) 00216 IDEA::encrypt_n(in, out, blocks); 00217 } 00218 00219 /* 00220 * IDEA Decryption 00221 */ 00222 void IDEA_SSE2::decrypt_n(const byte in[], byte out[], size_t blocks) const 00223 { 00224 const u16bit* KS = &this->get_DK()[0]; 00225 00226 while(blocks >= 8) 00227 { 00228 idea_op_8(in, out, KS); 00229 in += 8 * BLOCK_SIZE; 00230 out += 8 * BLOCK_SIZE; 00231 blocks -= 8; 00232 } 00233 00234 if(blocks) 00235 IDEA::decrypt_n(in, out, blocks); 00236 } 00237 00238 }