Botan  1.11.15
src/lib/hash/sha1_sse2/sha1_sse2.cpp
/*
* SHA-1 using SSE2
* Based on public domain code by Dean Gaudet
*    (http://arctic.org/~dean/crypto/sha1.html)
* (C) 2009-2011 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/

#include <botan/internal/hash_utils.h>
#include <botan/sha1_sse2.h>
#include <botan/cpuid.h>
#include <emmintrin.h>

namespace Botan {

BOTAN_REGISTER_HASH_NOARGS_IF(CPUID::has_sse2(), SHA_160_SSE2, "SHA-160", "sse2", 64);

namespace SHA1_SSE2_F {

namespace {

/*
* The first 16 words just need byte swapping. Preparing just means
* adding in the round constants.
*/

#define prep00_15(P, W)                                      \
   do {                                                      \
      W = _mm_shufflehi_epi16(W, _MM_SHUFFLE(2, 3, 0, 1));   \
      W = _mm_shufflelo_epi16(W, _MM_SHUFFLE(2, 3, 0, 1));   \
      W = _mm_or_si128(_mm_slli_epi16(W, 8),                 \
                       _mm_srli_epi16(W, 8));                \
      P.u128 = _mm_add_epi32(W, K00_19);                     \
   } while(0)
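
/*
* For reference, a hedged scalar sketch of what prep00_15 computes in
* each 32-bit lane: byte-swap the big-endian input word in place, then
* add the round constant. The helper name scalar_prep00_15 is
* illustrative only and not part of the original code.
*/
inline u32bit scalar_prep00_15(u32bit& w, u32bit k)
   {
   // Full 32-bit byteswap, matching the shuffle+shift sequence above
   w = ((w << 24) & 0xFF000000) |
       ((w <<  8) & 0x00FF0000) |
       ((w >>  8) & 0x0000FF00) |
       ((w >> 24) & 0x000000FF);
   return w + k; // the "prepared" value P
   }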

/*
For each multiple of 4, t, we want to calculate this:

W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
W[t+3] = rol(W[t]   ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);

we'll actually calculate this:

W[t+0] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
W[t+1] = rol(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
W[t+2] = rol(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
W[t+3] = rol(  0    ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
W[t+3] ^= rol(W[t+0], 1);

the parameters are:

W0 = &W[t-16];
W1 = &W[t-12];
W2 = &W[t- 8];
W3 = &W[t- 4];

and on output:
prepared = W0 + K
W0 = W[t]..W[t+3]
*/
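
/*
* A hedged scalar sketch of the expansion step that prep vectorizes,
* written directly from the equations above, including the trick of
* computing W[t+3] with a zero placeholder and fixing it up with
* rol(W[t+0], 1). The helper name scalar_prep is illustrative only.
*/
inline void scalar_prep(u32bit W[], size_t t)
   {
   W[t+0] = rotate_left(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
   W[t+1] = rotate_left(W[t-2] ^ W[t-7] ^ W[t-13] ^ W[t-15], 1);
   W[t+2] = rotate_left(W[t-1] ^ W[t-6] ^ W[t-12] ^ W[t-14], 1);
   W[t+3] = rotate_left(u32bit(0) ^ W[t-5] ^ W[t-11] ^ W[t-13], 1);
   W[t+3] ^= rotate_left(W[t+0], 1); // fold in the just-computed W[t+0]
   }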

/* Note that there is a step here where we want to do a rol by 1,
* which normally would look like this:
*
* r1 = psrld r0,$31
* r0 = pslld r0,$1
* r0 = por r0,r1
*
* but instead we do this:
*
* r1 = pcmpltd r0,zero
* r0 = paddd r0,r0
* r0 = psubd r0,r1
*
* because pcmpltd and paddd are available in both MMX units on
* Efficeon, Pentium-M, and Opteron, while shifts are available in
* only one unit.
*/
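
/*
* The identity relied on above, in scalar form: for a 32-bit x,
* rol(x, 1) == (x + x) - ((signed)x < 0 ? -1 : 0), since the signed
* compare yields all-ones (-1) exactly when the top bit is set, and
* subtracting -1 adds that carried-out bit back in as bit 0. The
* helper name rol1_via_cmp is illustrative only.
*/
inline u32bit rol1_via_cmp(u32bit x)
   {
   // All-ones mask if the sign bit of x is set, else zero
   const u32bit borrow = (static_cast<s32bit>(x) < 0) ? 0xFFFFFFFF : 0;
   return (x + x) - borrow; // equals rotate_left(x, 1)
   }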
#define prep(prep, XW0, XW1, XW2, XW3, K)                               \
   do {                                                                 \
      __m128i r0, r1, r2, r3;                                           \
                                                                        \
      /* load W[t-4] 16-byte aligned, and shift */                      \
      r3 = _mm_srli_si128((XW3), 4);                                    \
      r0 = (XW0);                                                       \
      /* get high 64-bits of XW0 into low 64-bits */                    \
      r1 = _mm_shuffle_epi32((XW0), _MM_SHUFFLE(1,0,3,2));              \
      /* load high 64-bits of r1 */                                     \
      r1 = _mm_unpacklo_epi64(r1, (XW1));                               \
      r2 = (XW2);                                                       \
                                                                        \
      r0 = _mm_xor_si128(r1, r0);                                       \
      r2 = _mm_xor_si128(r3, r2);                                       \
      r0 = _mm_xor_si128(r2, r0);                                       \
      /* unrotated W[t]..W[t+2] in r0 ... still need W[t+3] */          \
                                                                        \
      r2 = _mm_slli_si128(r0, 12);                                      \
      r1 = _mm_cmplt_epi32(r0, _mm_setzero_si128());                    \
      r0 = _mm_add_epi32(r0, r0);   /* shift left by 1 */               \
      r0 = _mm_sub_epi32(r0, r1);   /* r0 has W[t]..W[t+2] */           \
                                                                        \
      r3 = _mm_srli_epi32(r2, 30);                                      \
      r2 = _mm_slli_epi32(r2, 2);                                       \
                                                                        \
      r0 = _mm_xor_si128(r0, r3);                                       \
      r0 = _mm_xor_si128(r0, r2);   /* r0 now has W[t+3] */             \
                                                                        \
      (XW0) = r0;                                                       \
      (prep).u128 = _mm_add_epi32(r0, K);                               \
   } while(0)

/*
* SHA-160 F1 Function (rounds 0-19)
*/
inline void F1(u32bit A, u32bit& B, u32bit C, u32bit D, u32bit& E, u32bit msg)
   {
   E += (D ^ (B & (C ^ D))) + msg + rotate_left(A, 5);
   B  = rotate_left(B, 30);
   }

/*
* SHA-160 F2 Function (rounds 20-39)
*/
inline void F2(u32bit A, u32bit& B, u32bit C, u32bit D, u32bit& E, u32bit msg)
   {
   E += (B ^ C ^ D) + msg + rotate_left(A, 5);
   B  = rotate_left(B, 30);
   }

/*
* SHA-160 F3 Function (rounds 40-59)
*/
inline void F3(u32bit A, u32bit& B, u32bit C, u32bit D, u32bit& E, u32bit msg)
   {
   E += ((B & C) | ((B | C) & D)) + msg + rotate_left(A, 5);
   B  = rotate_left(B, 30);
   }

/*
* SHA-160 F4 Function (rounds 60-79)
*/
inline void F4(u32bit A, u32bit& B, u32bit C, u32bit D, u32bit& E, u32bit msg)
   {
   E += (B ^ C ^ D) + msg + rotate_left(A, 5);
   B  = rotate_left(B, 30);
   }
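
/*
* The forms used in F1 and F3 are operation-reduced versions of the
* standard SHA-1 choose and majority functions. A hedged scalar check
* of the identities (check_f_identities is an illustrative name, not
* part of the original code):
*
*   choose:   D ^ (B & (C ^ D))       == (B & C) | (~B & D)
*   majority: (B & C) | ((B | C) & D) == (B & C) | (B & D) | (C & D)
*/
inline bool check_f_identities(u32bit B, u32bit C, u32bit D)
   {
   const bool ch  = (D ^ (B & (C ^ D))) == ((B & C) | (~B & D));
   const bool maj = ((B & C) | ((B | C) & D)) ==
                    ((B & C) | (B & D) | (C & D));
   return ch && maj; // true for all inputs
   }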

}

}

/*
* SHA-160 Compression Function using SSE for message expansion
*/
void SHA_160_SSE2::compress_n(const byte input_bytes[], size_t blocks)
   {
   using namespace SHA1_SSE2_F;

   const __m128i K00_19 = _mm_set1_epi32(0x5A827999);
   const __m128i K20_39 = _mm_set1_epi32(0x6ED9EBA1);
   const __m128i K40_59 = _mm_set1_epi32(0x8F1BBCDC);
   const __m128i K60_79 = _mm_set1_epi32(0xCA62C1D6);

   u32bit A = digest[0],
          B = digest[1],
          C = digest[2],
          D = digest[3],
          E = digest[4];

   const __m128i* input = reinterpret_cast<const __m128i*>(input_bytes);

   for(size_t i = 0; i != blocks; ++i)
      {
      union v4si {
         u32bit u32[4];
         __m128i u128;
         };

      v4si P0, P1, P2, P3;

      __m128i W0 = _mm_loadu_si128(&input[0]);
      prep00_15(P0, W0);

      __m128i W1 = _mm_loadu_si128(&input[1]);
      prep00_15(P1, W1);

      __m128i W2 = _mm_loadu_si128(&input[2]);
      prep00_15(P2, W2);

      __m128i W3 = _mm_loadu_si128(&input[3]);
      prep00_15(P3, W3);

      /*
      Using SSE4; slower on Core2 and Nehalem
      #define GET_P_32(P, i) _mm_extract_epi32(P.u128, i)

      Much slower on all tested platforms
      #define GET_P_32(P,i) _mm_cvtsi128_si32(_mm_srli_si128(P.u128, i*4))
      */

#define GET_P_32(P, i) P.u32[i]
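
      /* Extracting lanes through the v4si union reads each prepared
       * 32-bit word straight from memory; per the note above this
       * outperformed the pextrd and shift-based alternatives on the
       * tested cores. */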

      F1(A, B, C, D, E, GET_P_32(P0, 0));
      F1(E, A, B, C, D, GET_P_32(P0, 1));
      F1(D, E, A, B, C, GET_P_32(P0, 2));
      F1(C, D, E, A, B, GET_P_32(P0, 3));
      prep(P0, W0, W1, W2, W3, K00_19);

      F1(B, C, D, E, A, GET_P_32(P1, 0));
      F1(A, B, C, D, E, GET_P_32(P1, 1));
      F1(E, A, B, C, D, GET_P_32(P1, 2));
      F1(D, E, A, B, C, GET_P_32(P1, 3));
      prep(P1, W1, W2, W3, W0, K20_39);

      F1(C, D, E, A, B, GET_P_32(P2, 0));
      F1(B, C, D, E, A, GET_P_32(P2, 1));
      F1(A, B, C, D, E, GET_P_32(P2, 2));
      F1(E, A, B, C, D, GET_P_32(P2, 3));
      prep(P2, W2, W3, W0, W1, K20_39);

      F1(D, E, A, B, C, GET_P_32(P3, 0));
      F1(C, D, E, A, B, GET_P_32(P3, 1));
      F1(B, C, D, E, A, GET_P_32(P3, 2));
      F1(A, B, C, D, E, GET_P_32(P3, 3));
      prep(P3, W3, W0, W1, W2, K20_39);

      F1(E, A, B, C, D, GET_P_32(P0, 0));
      F1(D, E, A, B, C, GET_P_32(P0, 1));
      F1(C, D, E, A, B, GET_P_32(P0, 2));
      F1(B, C, D, E, A, GET_P_32(P0, 3));
      prep(P0, W0, W1, W2, W3, K20_39);

      F2(A, B, C, D, E, GET_P_32(P1, 0));
      F2(E, A, B, C, D, GET_P_32(P1, 1));
      F2(D, E, A, B, C, GET_P_32(P1, 2));
      F2(C, D, E, A, B, GET_P_32(P1, 3));
      prep(P1, W1, W2, W3, W0, K20_39);

      F2(B, C, D, E, A, GET_P_32(P2, 0));
      F2(A, B, C, D, E, GET_P_32(P2, 1));
      F2(E, A, B, C, D, GET_P_32(P2, 2));
      F2(D, E, A, B, C, GET_P_32(P2, 3));
      prep(P2, W2, W3, W0, W1, K40_59);

      F2(C, D, E, A, B, GET_P_32(P3, 0));
      F2(B, C, D, E, A, GET_P_32(P3, 1));
      F2(A, B, C, D, E, GET_P_32(P3, 2));
      F2(E, A, B, C, D, GET_P_32(P3, 3));
      prep(P3, W3, W0, W1, W2, K40_59);

      F2(D, E, A, B, C, GET_P_32(P0, 0));
      F2(C, D, E, A, B, GET_P_32(P0, 1));
      F2(B, C, D, E, A, GET_P_32(P0, 2));
      F2(A, B, C, D, E, GET_P_32(P0, 3));
      prep(P0, W0, W1, W2, W3, K40_59);

      F2(E, A, B, C, D, GET_P_32(P1, 0));
      F2(D, E, A, B, C, GET_P_32(P1, 1));
      F2(C, D, E, A, B, GET_P_32(P1, 2));
      F2(B, C, D, E, A, GET_P_32(P1, 3));
      prep(P1, W1, W2, W3, W0, K40_59);

      F3(A, B, C, D, E, GET_P_32(P2, 0));
      F3(E, A, B, C, D, GET_P_32(P2, 1));
      F3(D, E, A, B, C, GET_P_32(P2, 2));
      F3(C, D, E, A, B, GET_P_32(P2, 3));
      prep(P2, W2, W3, W0, W1, K40_59);

      F3(B, C, D, E, A, GET_P_32(P3, 0));
      F3(A, B, C, D, E, GET_P_32(P3, 1));
      F3(E, A, B, C, D, GET_P_32(P3, 2));
      F3(D, E, A, B, C, GET_P_32(P3, 3));
      prep(P3, W3, W0, W1, W2, K60_79);

      F3(C, D, E, A, B, GET_P_32(P0, 0));
      F3(B, C, D, E, A, GET_P_32(P0, 1));
      F3(A, B, C, D, E, GET_P_32(P0, 2));
      F3(E, A, B, C, D, GET_P_32(P0, 3));
      prep(P0, W0, W1, W2, W3, K60_79);

      F3(D, E, A, B, C, GET_P_32(P1, 0));
      F3(C, D, E, A, B, GET_P_32(P1, 1));
      F3(B, C, D, E, A, GET_P_32(P1, 2));
      F3(A, B, C, D, E, GET_P_32(P1, 3));
      prep(P1, W1, W2, W3, W0, K60_79);

      F3(E, A, B, C, D, GET_P_32(P2, 0));
      F3(D, E, A, B, C, GET_P_32(P2, 1));
      F3(C, D, E, A, B, GET_P_32(P2, 2));
      F3(B, C, D, E, A, GET_P_32(P2, 3));
      prep(P2, W2, W3, W0, W1, K60_79);

      F4(A, B, C, D, E, GET_P_32(P3, 0));
      F4(E, A, B, C, D, GET_P_32(P3, 1));
      F4(D, E, A, B, C, GET_P_32(P3, 2));
      F4(C, D, E, A, B, GET_P_32(P3, 3));
      prep(P3, W3, W0, W1, W2, K60_79);

      F4(B, C, D, E, A, GET_P_32(P0, 0));
      F4(A, B, C, D, E, GET_P_32(P0, 1));
      F4(E, A, B, C, D, GET_P_32(P0, 2));
      F4(D, E, A, B, C, GET_P_32(P0, 3));

      F4(C, D, E, A, B, GET_P_32(P1, 0));
      F4(B, C, D, E, A, GET_P_32(P1, 1));
      F4(A, B, C, D, E, GET_P_32(P1, 2));
      F4(E, A, B, C, D, GET_P_32(P1, 3));

      F4(D, E, A, B, C, GET_P_32(P2, 0));
      F4(C, D, E, A, B, GET_P_32(P2, 1));
      F4(B, C, D, E, A, GET_P_32(P2, 2));
      F4(A, B, C, D, E, GET_P_32(P2, 3));

      F4(E, A, B, C, D, GET_P_32(P3, 0));
      F4(D, E, A, B, C, GET_P_32(P3, 1));
      F4(C, D, E, A, B, GET_P_32(P3, 2));
      F4(B, C, D, E, A, GET_P_32(P3, 3));

      A = (digest[0] += A);
      B = (digest[1] += B);
      C = (digest[2] += C);
      D = (digest[3] += D);
      E = (digest[4] += E);

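      /* Advance the input pointer by one 64-byte block (four __m128i) */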
      input += (hash_block_size() / 16);
      }

#undef GET_P_32
   }

#undef prep00_15
#undef prep

}
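
/*
* Hedged usage sketch (not part of this file): hashing a message via
* Botan's HashFunction interface, which the registration macro above
* makes dispatch to this implementation when SSE2 is available. This
* assumes the Botan 1.11 registry API; exact headers and lookup
* behavior may differ.
*
*   #include <botan/hash.h>
*
*   std::unique_ptr<Botan::HashFunction> sha1 =
*      Botan::HashFunction::create("SHA-160");
*   sha1->update("abc");
*   Botan::secure_vector<Botan::byte> digest = sha1->final();
*/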