/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * gf_w64.c
 *
 * Routines for 64-bit Galois fields
 */

#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#include "gf_w64.h"

static inline
gf_val_64_t gf_w64_inverse_from_divide (gf_t *gf, gf_val_64_t a)
{
  return gf->divide.w64(gf, 1, a);
}

#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }

static inline
gf_val_64_t gf_w64_divide_from_inverse (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
{
  b = gf->inverse.w64(gf, b);
  return gf->multiply.w64(gf, a, b);
}

static
void
gf_w64_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  uint32_t i;
  gf_val_64_t *s64;
  gf_val_64_t *d64;

  s64 = (gf_val_64_t *) src;
  d64 = (gf_val_64_t *) dest;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  if (xor) {
    for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) {
      d64[i] ^= gf->multiply.w64(gf, val, s64[i]);
    }
  } else {
    for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) {
      d64[i] = gf->multiply.w64(gf, val, s64[i]);
    }
  }
}

#if defined(INTEL_SSE4_PCLMUL)
static
void
gf_w64_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  gf_val_64_t *s64, *d64, *top;
  gf_region_data rd;
  __m128i a, b;
  __m128i result, r1;
  __m128i prim_poly;
  __m128i w;
  __m128i m1, m3, m4;
  gf_internal_t * h = gf->scratch;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
  b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0);
  m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff);
  m3 = _mm_slli_si128(m1, 8);
  m4 = _mm_slli_si128(m3, 4);

  s64 = (gf_val_64_t *) rd.s_start;
  d64 = (gf_val_64_t *) rd.d_start;
  top = (gf_val_64_t *) rd.d_top;

  if (xor) {
    while (d64 != top) {
      /* Multiply the high and low 64-bit words of the source by val,
         reducing each 128-bit product with two folds through prim_poly. */
      a = _mm_load_si128((__m128i *) s64);
      result = _mm_clmulepi64_si128 (a, b, 1);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      r1 = _mm_xor_si128 (result, w);

      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      result = _mm_xor_si128 (result, w);

      result = _mm_unpacklo_epi64(result, r1);

      r1 = _mm_load_si128((__m128i *) d64);
      result = _mm_xor_si128(r1, result);
      _mm_store_si128((__m128i *) d64, result);
      d64 += 2;
      s64 += 2;
    }
  } else {
    while (d64 != top) {
      a = _mm_load_si128((__m128i *) s64);
      result = _mm_clmulepi64_si128 (a, b, 1);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      r1 = _mm_xor_si128 (result, w);

      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
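      /* Both halves are now reduced to 64 bits; the unpacklo that follows
         interleaves them back into one 128-bit word before the store. */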
result = _mm_unpacklo_epi64(result, r1); _mm_store_si128((__m128i *) d64, result); d64 += 2; s64 += 2; } } gf_do_final_region_alignment(&rd); } #endif #if defined(INTEL_SSE4_PCLMUL) static void gf_w64_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { gf_val_64_t *s64, *d64, *top; gf_region_data rd; __m128i a, b; __m128i result, r1; __m128i prim_poly; __m128i w; __m128i m1, m3, m4; gf_internal_t * h = gf->scratch; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); gf_do_initial_region_alignment(&rd); prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0); m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff); m3 = _mm_slli_si128(m1, 8); m4 = _mm_slli_si128(m3, 4); s64 = (gf_val_64_t *) rd.s_start; d64 = (gf_val_64_t *) rd.d_start; top = (gf_val_64_t *) rd.d_top; if (xor) { while (d64 != top) { a = _mm_load_si128((__m128i *) s64); result = _mm_clmulepi64_si128 (a, b, 1); w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); r1 = _mm_xor_si128 (result, w); result = _mm_clmulepi64_si128 (a, b, 0); w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); result = _mm_xor_si128 (result, w); result = _mm_unpacklo_epi64(result, r1); r1 = _mm_load_si128((__m128i *) d64); result = _mm_xor_si128(r1, result); _mm_store_si128((__m128i *) d64, result); d64 += 2; s64 += 2; } } else { while (d64 != top) { a = _mm_load_si128((__m128i *) s64); result = _mm_clmulepi64_si128 (a, b, 1); w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); r1 = _mm_xor_si128 (result, w); result = _mm_clmulepi64_si128 (a, b, 0); w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1); result = _mm_xor_si128 (result, w); w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1); result = _mm_xor_si128 (result, w); result = _mm_unpacklo_epi64(result, r1); _mm_store_si128((__m128i *) d64, result); d64 += 2; s64 += 2; } } gf_do_final_region_alignment(&rd); } #endif static inline gf_val_64_t gf_w64_euclid (gf_t *gf, gf_val_64_t b) { gf_val_64_t e_i, e_im1, e_ip1; gf_val_64_t d_i, d_im1, d_ip1; gf_val_64_t y_i, y_im1, y_ip1; gf_val_64_t c_i; gf_val_64_t one = 1; if (b == 0) return -1; e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly; e_i = b; d_im1 = 64; for (d_i = d_im1-1; ((one << d_i) & e_i) == 0; d_i--) ; y_i = 1; y_im1 = 0; while (e_i != 1) { e_ip1 = e_im1; d_ip1 = d_im1; c_i = 0; while (d_ip1 >= d_i) { c_i ^= (one << (d_ip1 - d_i)); e_ip1 ^= (e_i << (d_ip1 - d_i)); d_ip1--; if (e_ip1 == 0) return 0; while ((e_ip1 & (one << d_ip1)) == 0) d_ip1--; } y_ip1 = y_im1 ^ gf->multiply.w64(gf, c_i, y_i); y_im1 = y_i; y_i = y_ip1; e_im1 = e_i; d_im1 = d_i; e_i = e_ip1; d_i = d_ip1; } return y_i; } /* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only include it for completeness. It does have the feature that it requires no extra memory. 
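   The routine below first builds the full 128-bit carry-less product of a and
   b in the word pair (pl, pr), and then reduces that product modulo the
   primitive polynomial, using the shifted copies (ppl, ppr), until only the
   low 64 bits remain.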
*/ static inline gf_val_64_t gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) { uint64_t pl, pr, ppl, ppr, i, a, bl, br, one, lbit; gf_internal_t *h; h = (gf_internal_t *) gf->scratch; /* Allen: set leading one of primitive polynomial */ a = a64; bl = 0; br = b64; one = 1; lbit = (one << 63); pl = 0; /* Allen: left side of product */ pr = 0; /* Allen: right side of product */ /* Allen: unlike the corresponding functions for smaller word sizes, * this loop carries out the initial carryless multiply by * shifting b itself rather than simply looking at successively * higher shifts of b */ for (i = 0; i < GF_FIELD_WIDTH; i++) { if (a & (one << i)) { pl ^= bl; pr ^= br; } bl <<= 1; if (br & lbit) bl ^= 1; br <<= 1; } /* Allen: the name of the variable "one" is no longer descriptive at this point */ one = lbit >> 1; ppl = (h->prim_poly >> 2) | one; ppr = (h->prim_poly << (GF_FIELD_WIDTH-2)); while (one != 0) { if (pl & one) { pl ^= ppl; pr ^= ppr; } one >>= 1; ppr >>= 1; if (ppl & 1) ppr ^= lbit; ppl >>= 1; } return pr; } /* * ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply. */ static inline gf_val_64_t gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) { gf_val_64_t rv = 0; #if defined(INTEL_SSE4_PCLMUL) __m128i a, b; __m128i result; __m128i prim_poly; __m128i v, w; gf_internal_t * h = gf->scratch; a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0); b = _mm_insert_epi64 (a, b64, 0); prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); /* Do the initial multiply */ result = _mm_clmulepi64_si128 (a, b, 0); /* Mask off the high order 32 bits using subtraction of the polynomial. * NOTE: this part requires that the polynomial have at least 32 leading 0 bits. */ /* Adam: We cant include the leading one in the 64 bit pclmul, so we need to split up the high 8 bytes of the result into two parts before we multiply them with the prim_poly.*/ v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); w = _mm_clmulepi64_si128 (prim_poly, v, 0); result = _mm_xor_si128 (result, w); v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); w = _mm_clmulepi64_si128 (prim_poly, v, 0); result = _mm_xor_si128 (result, w); rv = ((gf_val_64_t)_mm_extract_epi64(result, 0)); #endif return rv; } static inline gf_val_64_t gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64) { gf_val_64_t rv = 0; #if defined(INTEL_SSE4_PCLMUL) __m128i a, b; __m128i result; __m128i prim_poly; __m128i v, w; gf_internal_t * h = gf->scratch; a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0); b = _mm_insert_epi64 (a, b64, 0); prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); /* Do the initial multiply */ result = _mm_clmulepi64_si128 (a, b, 0); v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); w = _mm_clmulepi64_si128 (prim_poly, v, 0); result = _mm_xor_si128 (result, w); v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); w = _mm_clmulepi64_si128 (prim_poly, v, 0); result = _mm_xor_si128 (result, w); v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); w = _mm_clmulepi64_si128 (prim_poly, v, 0); result = _mm_xor_si128 (result, w); v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); w = _mm_clmulepi64_si128 (prim_poly, v, 0); result = _mm_xor_si128 (result, w); rv = ((gf_val_64_t)_mm_extract_epi64(result, 0)); #endif return rv; } void gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) { #if defined(INTEL_SSE4_PCLMUL) gf_internal_t *h; uint8_t *s8, 
*d8, *dtop; gf_region_data rd; __m128i v, b, m, prim_poly, c, fr, w, result; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } h = (gf_internal_t *) gf->scratch; gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); gf_do_initial_region_alignment(&rd); s8 = (uint8_t *) rd.s_start; d8 = (uint8_t *) rd.d_start; dtop = (uint8_t *) rd.d_top; v = _mm_insert_epi64(_mm_setzero_si128(), val, 0); m = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL)); if (xor) { while (d8 != dtop) { b = _mm_load_si128((__m128i *) s8); result = _mm_clmulepi64_si128 (b, v, 0); c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); w = _mm_clmulepi64_si128 (prim_poly, c, 0); result = _mm_xor_si128 (result, w); c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); w = _mm_clmulepi64_si128 (prim_poly, c, 0); fr = _mm_xor_si128 (result, w); fr = _mm_and_si128 (fr, m); result = _mm_clmulepi64_si128 (b, v, 1); c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); w = _mm_clmulepi64_si128 (prim_poly, c, 0); result = _mm_xor_si128 (result, w); c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); w = _mm_clmulepi64_si128 (prim_poly, c, 0); result = _mm_xor_si128 (result, w); result = _mm_slli_si128 (result, 8); fr = _mm_xor_si128 (result, fr); result = _mm_load_si128((__m128i *) d8); fr = _mm_xor_si128 (result, fr); _mm_store_si128((__m128i *) d8, fr); d8 += 16; s8 += 16; } } else { while (d8 < dtop) { b = _mm_load_si128((__m128i *) s8); result = _mm_clmulepi64_si128 (b, v, 0); c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); w = _mm_clmulepi64_si128 (prim_poly, c, 0); result = _mm_xor_si128 (result, w); c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); w = _mm_clmulepi64_si128 (prim_poly, c, 0); fr = _mm_xor_si128 (result, w); fr = _mm_and_si128 (fr, m); result = _mm_clmulepi64_si128 (b, v, 1); c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0); w = _mm_clmulepi64_si128 (prim_poly, c, 0); result = _mm_xor_si128 (result, w); c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1); w = _mm_clmulepi64_si128 (prim_poly, c, 0); result = _mm_xor_si128 (result, w); result = _mm_slli_si128 (result, 8); fr = _mm_xor_si128 (result, fr); _mm_store_si128((__m128i *) d8, fr); d8 += 16; s8 += 16; } } gf_do_final_region_alignment(&rd); #endif } void gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) { gf_internal_t *h; struct gf_split_4_64_lazy_data *ld; int i, j, k; uint64_t pp, v, s, *s64, *d64, *top; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } h = (gf_internal_t *) gf->scratch; pp = h->prim_poly; ld = (struct gf_split_4_64_lazy_data *) h->private; gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); gf_do_initial_region_alignment(&rd); if (ld->last_value != val) { v = val; for (i = 0; i < 16; i++) { ld->tables[i][0] = 0; for (j = 1; j < 16; j <<= 1) { for (k = 0; k < j; k++) { ld->tables[i][k^j] = (v ^ ld->tables[i][k]); } v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); } } } ld->last_value = val; s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; top = (uint64_t *) rd.d_top; while (d64 != top) { v = (xor) ? 
*d64 : 0; s = *s64; i = 0; while (s != 0) { v ^= ld->tables[i][s&0xf]; s >>= 4; i++; } *d64 = v; d64++; s64++; } gf_do_final_region_alignment(&rd); } static inline uint64_t gf_w64_split_8_8_multiply (gf_t *gf, uint64_t a64, uint64_t b64) { uint64_t product, i, j, mask, tb; gf_internal_t *h; struct gf_split_8_8_data *d8; h = (gf_internal_t *) gf->scratch; d8 = (struct gf_split_8_8_data *) h->private; product = 0; mask = 0xff; for (i = 0; a64 != 0; i++) { tb = b64; for (j = 0; tb != 0; j++) { product ^= d8->tables[i+j][a64&mask][tb&mask]; tb >>= 8; } a64 >>= 8; } return product; } void gf_w64_split_8_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) { gf_internal_t *h; struct gf_split_8_64_lazy_data *ld; int i, j, k; uint64_t pp, v, s, *s64, *d64, *top; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } h = (gf_internal_t *) gf->scratch; pp = h->prim_poly; ld = (struct gf_split_8_64_lazy_data *) h->private; gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); gf_do_initial_region_alignment(&rd); if (ld->last_value != val) { v = val; for (i = 0; i < 8; i++) { ld->tables[i][0] = 0; for (j = 1; j < 256; j <<= 1) { for (k = 0; k < j; k++) { ld->tables[i][k^j] = (v ^ ld->tables[i][k]); } v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); } } } ld->last_value = val; s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; top = (uint64_t *) rd.d_top; while (d64 != top) { v = (xor) ? *d64 : 0; s = *s64; i = 0; while (s != 0) { v ^= ld->tables[i][s&0xff]; s >>= 8; i++; } *d64 = v; d64++; s64++; } gf_do_final_region_alignment(&rd); } void gf_w64_split_16_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) { gf_internal_t *h; struct gf_split_16_64_lazy_data *ld; int i, j, k; uint64_t pp, v, s, *s64, *d64, *top; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } h = (gf_internal_t *) gf->scratch; pp = h->prim_poly; ld = (struct gf_split_16_64_lazy_data *) h->private; gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); gf_do_initial_region_alignment(&rd); if (ld->last_value != val) { v = val; for (i = 0; i < 4; i++) { ld->tables[i][0] = 0; for (j = 1; j < (1<<16); j <<= 1) { for (k = 0; k < j; k++) { ld->tables[i][k^j] = (v ^ ld->tables[i][k]); } v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); } } } ld->last_value = val; s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; top = (uint64_t *) rd.d_top; while (d64 != top) { v = (xor) ? 
*d64 : 0; s = *s64; i = 0; while (s != 0) { v ^= ld->tables[i][s&0xffff]; s >>= 16; i++; } *d64 = v; d64++; s64++; } gf_do_final_region_alignment(&rd); } static int gf_w64_shift_init(gf_t *gf) { gf->multiply.w64 = gf_w64_shift_multiply; gf->inverse.w64 = gf_w64_euclid; gf->multiply_region.w64 = gf_w64_multiply_region_from_single; return 1; } static int gf_w64_cfm_init(gf_t *gf) { gf->inverse.w64 = gf_w64_euclid; gf->multiply_region.w64 = gf_w64_multiply_region_from_single; #if defined(INTEL_SSE4_PCLMUL) gf_internal_t *h; h = (gf_internal_t *) gf->scratch; if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ gf->multiply.w64 = gf_w64_clm_multiply_2; gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2; }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ gf->multiply.w64 = gf_w64_clm_multiply_4; gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4; } else { return 0; } return 1; #endif return 0; } static void gf_w64_group_set_shift_tables(uint64_t *shift, uint64_t val, gf_internal_t *h) { uint64_t i; uint64_t j; uint64_t one = 1; int g_s; g_s = h->arg1; shift[0] = 0; for (i = 1; i < ((uint64_t)1 << g_s); i <<= 1) { for (j = 0; j < i; j++) shift[i|j] = shift[j]^val; if (val & (one << 63)) { val <<= 1; val ^= h->prim_poly; } else { val <<= 1; } } } static inline gf_val_64_t gf_w64_group_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) { uint64_t top, bot, mask, tp; int g_s, g_r, lshift, rshift; struct gf_w64_group_data *gd; gf_internal_t *h = (gf_internal_t *) gf->scratch; g_s = h->arg1; g_r = h->arg2; gd = (struct gf_w64_group_data *) h->private; gf_w64_group_set_shift_tables(gd->shift, b, h); mask = (((uint64_t)1 << g_s) - 1); top = 0; bot = gd->shift[a&mask]; a >>= g_s; if (a == 0) return bot; lshift = 0; rshift = 64; do { /* Shifting out is straightfoward */ lshift += g_s; rshift -= g_s; tp = gd->shift[a&mask]; top ^= (tp >> rshift); bot ^= (tp << lshift); a >>= g_s; } while (a != 0); /* Reducing is a bit gross, because I don't zero out the index bits of top. The reason is that we throw top away. Even better, that last (tp >> rshift) is going to be ignored, so it doesn't matter how (tp >> 64) is implemented. 
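     Each pass of the loop below takes g_r bits of top, starting from the high
     end, looks them up in the precomputed table gd->reduce[], and folds the
     result back into (top, bot) until everything has been pushed into the low
     64 bits.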
*/ lshift = ((lshift-1) / g_r) * g_r; rshift = 64 - lshift; mask = ((uint64_t)1 << g_r) - 1; while (lshift >= 0) { tp = gd->reduce[(top >> lshift) & mask]; top ^= (tp >> rshift); bot ^= (tp << lshift); lshift -= g_r; rshift += g_r; } return bot; } static void gf_w64_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { int i, fzb; uint64_t a64, smask, rmask, top, bot, tp; int lshift, rshift, g_s, g_r; gf_region_data rd; uint64_t *s64, *d64, *dtop; struct gf_w64_group_data *gd; gf_internal_t *h = (gf_internal_t *) gf->scratch; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gd = (struct gf_w64_group_data *) h->private; g_s = h->arg1; g_r = h->arg2; gf_w64_group_set_shift_tables(gd->shift, val, h); for (i = 63; !(val & (1ULL << i)); i--) ; i += g_s; /* i is the bit position of the first zero bit in any element of gd->shift[] */ if (i > 64) i = 64; fzb = i; gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); gf_do_initial_region_alignment(&rd); s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; dtop = (uint64_t *) rd.d_top; smask = ((uint64_t)1 << g_s) - 1; rmask = ((uint64_t)1 << g_r) - 1; while (d64 < dtop) { a64 = *s64; top = 0; bot = gd->shift[a64&smask]; a64 >>= g_s; i = fzb; if (a64 != 0) { lshift = 0; rshift = 64; do { lshift += g_s; rshift -= g_s; tp = gd->shift[a64&smask]; top ^= (tp >> rshift); bot ^= (tp << lshift); a64 >>= g_s; } while (a64 != 0); i += lshift; lshift = ((i-64-1) / g_r) * g_r; rshift = 64 - lshift; while (lshift >= 0) { tp = gd->reduce[(top >> lshift) & rmask]; top ^= (tp >> rshift); bot ^= (tp << lshift); lshift -= g_r; rshift += g_r; } } if (xor) bot ^= *d64; *d64 = bot; d64++; s64++; } gf_do_final_region_alignment(&rd); } static inline gf_val_64_t gf_w64_group_s_equals_r_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) { int leftover, rs; uint64_t p, l, ind, a64; int bits_left; int g_s; struct gf_w64_group_data *gd; gf_internal_t *h = (gf_internal_t *) gf->scratch; g_s = h->arg1; gd = (struct gf_w64_group_data *) h->private; gf_w64_group_set_shift_tables(gd->shift, b, h); leftover = 64 % g_s; if (leftover == 0) leftover = g_s; rs = 64 - leftover; a64 = a; ind = a64 >> rs; a64 <<= leftover; p = gd->shift[ind]; bits_left = rs; rs = 64 - g_s; while (bits_left > 0) { bits_left -= g_s; ind = a64 >> rs; a64 <<= g_s; l = p >> rs; p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s)); } return p; } static void gf_w64_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { int leftover, rs; uint64_t p, l, ind, a64; int bits_left; int g_s; gf_region_data rd; uint64_t *s64, *d64, *top; struct gf_w64_group_data *gd; gf_internal_t *h = (gf_internal_t *) gf->scratch; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gd = (struct gf_w64_group_data *) h->private; g_s = h->arg1; gf_w64_group_set_shift_tables(gd->shift, val, h); gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4); gf_do_initial_region_alignment(&rd); s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; top = (uint64_t *) rd.d_top; leftover = 64 % g_s; if (leftover == 0) leftover = g_s; while (d64 < top) { rs = 64 - leftover; a64 = *s64; ind = a64 >> rs; a64 <<= leftover; p = gd->shift[ind]; bits_left = rs; rs = 64 - g_s; while (bits_left > 0) { bits_left -= g_s; ind = a64 >> rs; a64 <<= g_s; l = p >> rs; p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << 
g_s)); } if (xor) p ^= *d64; *d64 = p; d64++; s64++; } gf_do_final_region_alignment(&rd); } static int gf_w64_group_init(gf_t *gf) { uint64_t i, j, p, index; struct gf_w64_group_data *gd; gf_internal_t *h = (gf_internal_t *) gf->scratch; uint64_t g_r, g_s; g_s = h->arg1; g_r = h->arg2; gd = (struct gf_w64_group_data *) h->private; gd->shift = (uint64_t *) (&(gd->memory)); gd->reduce = gd->shift + (1 << g_s); gd->reduce[0] = 0; for (i = 0; i < ((uint64_t)1 << g_r); i++) { p = 0; index = 0; for (j = 0; j < g_r; j++) { if (i & (1 << j)) { p ^= (h->prim_poly << j); index ^= (1 << j); if (j > 0) index ^= (h->prim_poly >> (64-j)); } } gd->reduce[index] = p; } if (g_s == g_r) { gf->multiply.w64 = gf_w64_group_s_equals_r_multiply; gf->multiply_region.w64 = gf_w64_group_s_equals_r_multiply_region; } else { gf->multiply.w64 = gf_w64_group_multiply; gf->multiply_region.w64 = gf_w64_group_multiply_region; } gf->divide.w64 = NULL; gf->inverse.w64 = gf_w64_euclid; return 1; } static gf_val_64_t gf_w64_extract_word(gf_t *gf, void *start, int bytes, int index) { uint64_t *r64, rv; r64 = (uint64_t *) start; rv = r64[index]; return rv; } static gf_val_64_t gf_w64_composite_extract_word(gf_t *gf, void *start, int bytes, int index) { int sub_size; gf_internal_t *h; uint8_t *r8, *top; uint64_t a, b, *r64; gf_region_data rd; h = (gf_internal_t *) gf->scratch; gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32); r64 = (uint64_t *) start; if (r64 + index < (uint64_t *) rd.d_start) return r64[index]; if (r64 + index >= (uint64_t *) rd.d_top) return r64[index]; index -= (((uint64_t *) rd.d_start) - r64); r8 = (uint8_t *) rd.d_start; top = (uint8_t *) rd.d_top; sub_size = (top-r8)/2; a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index); b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index); return (a | ((uint64_t)b << 32)); } static gf_val_64_t gf_w64_split_extract_word(gf_t *gf, void *start, int bytes, int index) { int i; uint64_t *r64, rv; uint8_t *r8; gf_region_data rd; gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 128); r64 = (uint64_t *) start; if (r64 + index < (uint64_t *) rd.d_start) return r64[index]; if (r64 + index >= (uint64_t *) rd.d_top) return r64[index]; index -= (((uint64_t *) rd.d_start) - r64); r8 = (uint8_t *) rd.d_start; r8 += ((index & 0xfffffff0)*8); r8 += (index & 0xf); r8 += 112; rv =0; for (i = 0; i < 8; i++) { rv <<= 8; rv |= *r8; r8 -= 16; } return rv; } static inline gf_val_64_t gf_w64_bytwo_b_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b) { uint64_t prod, pp, bmask; gf_internal_t *h; h = (gf_internal_t *) gf->scratch; pp = h->prim_poly; prod = 0; bmask = 0x8000000000000000ULL; while (1) { if (a & 1) prod ^= b; a >>= 1; if (a == 0) return prod; if (b & bmask) { b = ((b << 1) ^ pp); } else { b <<= 1; } } } static inline gf_val_64_t gf_w64_bytwo_p_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b) { uint64_t prod, pp, pmask, amask; gf_internal_t *h; h = (gf_internal_t *) gf->scratch; pp = h->prim_poly; prod = 0; /* changed from declare then shift to just declare.*/ pmask = 0x8000000000000000ULL; amask = 0x8000000000000000ULL; while (amask != 0) { if (prod & pmask) { prod = ((prod << 1) ^ pp); } else { prod <<= 1; } if (a & amask) prod ^= b; amask >>= 1; } return prod; } static void gf_w64_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { uint64_t *s64, *d64, ta, prod, amask, pmask, pp; gf_region_data rd; gf_internal_t *h; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if 
(val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); gf_do_initial_region_alignment(&rd); h = (gf_internal_t *) gf->scratch; s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; pmask = 0x80000000; pmask <<= 32; pp = h->prim_poly; if (xor) { while (s64 < (uint64_t *) rd.s_top) { prod = 0; amask = pmask; ta = *s64; while (amask != 0) { prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1); if (val & amask) prod ^= ta; amask >>= 1; } *d64 ^= prod; d64++; s64++; } } else { while (s64 < (uint64_t *) rd.s_top) { prod = 0; amask = pmask; ta = *s64; while (amask != 0) { prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1); if (val & amask) prod ^= ta; amask >>= 1; } *d64 = prod; d64++; s64++; } } gf_do_final_region_alignment(&rd); } static void gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { uint64_t *s64, *d64, ta, tb, prod, bmask, pp; gf_region_data rd; gf_internal_t *h; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); gf_do_initial_region_alignment(&rd); h = (gf_internal_t *) gf->scratch; s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; bmask = 0x80000000; bmask <<= 32; pp = h->prim_poly; if (xor) { while (s64 < (uint64_t *) rd.s_top) { prod = 0; tb = val; ta = *s64; while (1) { if (tb & 1) prod ^= ta; tb >>= 1; if (tb == 0) break; ta = (ta & bmask) ? ((ta << 1) ^ pp) : (ta << 1); } *d64 ^= prod; d64++; s64++; } } else { while (s64 < (uint64_t *) rd.s_top) { prod = 0; tb = val; ta = *s64; while (1) { if (tb & 1) prod ^= ta; tb >>= 1; if (tb == 0) break; ta = (ta & bmask) ? ((ta << 1) ^ pp) : (ta << 1); } *d64 = prod; d64++; s64++; } } gf_do_final_region_alignment(&rd); } #define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\ t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \ t2 = _mm_and_si128(va, m2); \ t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \ va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); } #define BYTWO_P_ONESTEP {\ SSE_AB2(pp, m1 ,m2, prod, t1, t2); \ t1 = _mm_and_si128(v, one); \ t1 = _mm_sub_epi64(t1, one); \ t1 = _mm_and_si128(t1, ta); \ prod = _mm_xor_si128(prod, t1); \ v = _mm_srli_epi64(v, 1); } void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { #ifdef INTEL_SSE2 int i; uint8_t *s8, *d8; uint64_t vrev, one64; uint64_t amask; __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v; gf_region_data rd; gf_internal_t *h; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); gf_do_initial_region_alignment(&rd); h = (gf_internal_t *) gf->scratch; one64 = 1; vrev = 0; for (i = 0; i < 64; i++) { vrev <<= 1; if (!(val & (one64 << i))) vrev |= 1; } s8 = (uint8_t *) rd.s_start; d8 = (uint8_t *) rd.d_start; amask = -1; amask ^= 1; pp = _mm_set1_epi64x(h->prim_poly); m1 = _mm_set1_epi64x(amask); m2 = _mm_set1_epi64x(one64 << 63); one = _mm_set1_epi64x(1); while (d8 < (uint8_t *) rd.d_top) { prod = _mm_setzero_si128(); v = _mm_set1_epi64x(vrev); ta = _mm_load_si128((__m128i *) s8); tp = (!xor) ? 
_mm_setzero_si128() : _mm_load_si128((__m128i *) d8); BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp)); d8 += 16; s8 += 16; } gf_do_final_region_alignment(&rd); #endif } #ifdef INTEL_SSE2 static void gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd) { uint64_t one64, amask; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; gf_internal_t *h; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; h = (gf_internal_t *) rd->gf->scratch; one64 = 1; amask = -1; amask ^= 1; pp = _mm_set1_epi64x(h->prim_poly); m1 = _mm_set1_epi64x(amask); m2 = _mm_set1_epi64x(one64 << 63); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); SSE_AB2(pp, m1, m2, va, t1, t2); vb = _mm_load_si128 ((__m128i *)(d8)); vb = _mm_xor_si128(vb, va); _mm_store_si128((__m128i *)d8, vb); d8 += 16; s8 += 16; } } #endif #ifdef INTEL_SSE2 static void gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd) { uint64_t one64, amask; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va; gf_internal_t *h; s8 = (uint8_t *) rd->s_start; d8 = (uint8_t *) rd->d_start; h = (gf_internal_t *) rd->gf->scratch; one64 = 1; amask = -1; amask ^= 1; pp = _mm_set1_epi64x(h->prim_poly); m1 = _mm_set1_epi64x(amask); m2 = _mm_set1_epi64x(one64 << 63); while (d8 < (uint8_t *) rd->d_top) { va = _mm_load_si128 ((__m128i *)(s8)); SSE_AB2(pp, m1, m2, va, t1, t2); _mm_store_si128((__m128i *)d8, va); d8 += 16; s8 += 16; } } #endif #ifdef INTEL_SSE2 static void gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { uint64_t itb, amask, one64; uint8_t *d8, *s8; __m128i pp, m1, m2, t1, t2, va, vb; gf_region_data rd; gf_internal_t *h; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); gf_do_initial_region_alignment(&rd); if (val == 2) { if (xor) { gf_w64_bytwo_b_sse_region_2_xor(&rd); } else { gf_w64_bytwo_b_sse_region_2_noxor(&rd); } gf_do_final_region_alignment(&rd); return; } s8 = (uint8_t *) rd.s_start; d8 = (uint8_t *) rd.d_start; h = (gf_internal_t *) gf->scratch; one64 = 1; amask = -1; amask ^= 1; pp = _mm_set1_epi64x(h->prim_poly); m1 = _mm_set1_epi64x(amask); m2 = _mm_set1_epi64x(one64 << 63); while (d8 < (uint8_t *) rd.d_top) { va = _mm_load_si128 ((__m128i *)(s8)); vb = (!xor) ? 
_mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8)); itb = val; while (1) { if (itb & 1) vb = _mm_xor_si128(vb, va); itb >>= 1; if (itb == 0) break; SSE_AB2(pp, m1, m2, va, t1, t2); } _mm_store_si128((__m128i *)d8, vb); d8 += 16; s8 += 16; } gf_do_final_region_alignment(&rd); } #endif static int gf_w64_bytwo_init(gf_t *gf) { gf_internal_t *h; h = (gf_internal_t *) gf->scratch; if (h->mult_type == GF_MULT_BYTWO_p) { gf->multiply.w64 = gf_w64_bytwo_p_multiply; #ifdef INTEL_SSE2 if (h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; else gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region; #else gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; if(h->region_type & GF_REGION_SIMD) return 0; #endif } else { gf->multiply.w64 = gf_w64_bytwo_b_multiply; #ifdef INTEL_SSE2 if (h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; else gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region; #else gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; if(h->region_type & GF_REGION_SIMD) return 0; #endif } gf->inverse.w64 = gf_w64_euclid; return 1; } static gf_val_64_t gf_w64_composite_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b) { gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; uint32_t b0 = b & 0x00000000ffffffff; uint32_t b1 = (b & 0xffffffff00000000) >> 32; uint32_t a0 = a & 0x00000000ffffffff; uint32_t a1 = (a & 0xffffffff00000000) >> 32; uint32_t a1b1; a1b1 = base_gf->multiply.w32(base_gf, a1, b1); return ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); } /* * Composite field division trick (explained in 2007 tech report) * * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1 * * let c = b^-1 * * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0) * * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1 * * let d = b1c1 and d+1 = b0c0 * * solve s*b1c1+b1c0+b0c1 = 0 * * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1 * * c0 = (d+1)b0^-1 * c1 = d*b1^-1 * * a / b = a * c */ static gf_val_64_t gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a) { gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; uint32_t a0 = a & 0x00000000ffffffff; uint32_t a1 = (a & 0xffffffff00000000) >> 32; uint32_t c0, c1, d, tmp; uint64_t c; uint32_t a0inv, a1inv; if (a0 == 0) { a1inv = base_gf->inverse.w32(base_gf, a1); c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly); c1 = a1inv; } else if (a1 == 0) { c0 = base_gf->inverse.w32(base_gf, a0); c1 = 0; } else { a1inv = base_gf->inverse.w32(base_gf, a1); a0inv = base_gf->inverse.w32(base_gf, a0); d = base_gf->multiply.w32(base_gf, a1, a0inv); tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly); tmp = base_gf->inverse.w32(base_gf, tmp); d = base_gf->multiply.w32(base_gf, d, tmp); c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv); c1 = base_gf->multiply.w32(base_gf, d, a1inv); } c = c0 | ((uint64_t)c1 << 32); return c; } static void gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; uint32_t b0 = val & 0x00000000ffffffff; uint32_t b1 = (val & 0xffffffff00000000) >> 32; uint64_t *s64, *d64; uint64_t *top; uint64_t a0, a1, a1b1; gf_region_data rd; if (val == 
0) { gf_multby_zero(dest, bytes, xor); return; } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8); s64 = rd.s_start; d64 = rd.d_start; top = rd.d_top; if (xor) { while (d64 < top) { a0 = *s64 & 0x00000000ffffffff; a1 = (*s64 & 0xffffffff00000000) >> 32; a1b1 = base_gf->multiply.w32(base_gf, a1, b1); *d64 ^= ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); s64++; d64++; } } else { while (d64 < top) { a0 = *s64 & 0x00000000ffffffff; a1 = (*s64 & 0xffffffff00000000) >> 32; a1b1 = base_gf->multiply.w32(base_gf, a1, b1); *d64 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32)); s64++; d64++; } } } static void gf_w64_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor) { gf_internal_t *h = (gf_internal_t *) gf->scratch; gf_t *base_gf = h->base_gf; gf_val_32_t val0 = val & 0x00000000ffffffff; gf_val_32_t val1 = (val & 0xffffffff00000000) >> 32; uint8_t *slow, *shigh; uint8_t *dlow, *dhigh, *top; int sub_reg_size; gf_region_data rd; if (!xor) { memset(dest, 0, bytes); } gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32); gf_do_initial_region_alignment(&rd); slow = (uint8_t *) rd.s_start; dlow = (uint8_t *) rd.d_start; top = (uint8_t*) rd.d_top; sub_reg_size = (top - dlow)/2; shigh = slow + sub_reg_size; dhigh = dlow + sub_reg_size; base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor); base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1); base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor); base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1); base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1); gf_do_final_region_alignment(&rd); } static int gf_w64_composite_init(gf_t *gf) { gf_internal_t *h = (gf_internal_t *) gf->scratch; if (h->region_type & GF_REGION_ALTMAP) { gf->multiply_region.w64 = gf_w64_composite_multiply_region_alt; } else { gf->multiply_region.w64 = gf_w64_composite_multiply_region; } gf->multiply.w64 = gf_w64_composite_multiply; gf->divide.w64 = NULL; gf->inverse.w64 = gf_w64_composite_inverse; return 1; } #ifdef INTEL_SSSE3 static void gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) { gf_internal_t *h; int i, j, k; uint64_t pp, v, *s64, *d64, *top; __m128i si, tables[16][8], p[8], v0, mask1; struct gf_split_4_64_lazy_data *ld; uint8_t btable[16]; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } h = (gf_internal_t *) gf->scratch; pp = h->prim_poly; gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128); gf_do_initial_region_alignment(&rd); s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; top = (uint64_t *) rd.d_top; ld = (struct gf_split_4_64_lazy_data *) h->private; v = val; for (i = 0; i < 16; i++) { ld->tables[i][0] = 0; for (j = 1; j < 16; j <<= 1) { for (k = 0; k < j; k++) { ld->tables[i][k^j] = (v ^ ld->tables[i][k]); } v = (v & GF_FIRST_BIT) ? 
((v << 1) ^ pp) : (v << 1); } for (j = 0; j < 8; j++) { for (k = 0; k < 16; k++) { btable[k] = (uint8_t) ld->tables[i][k]; ld->tables[i][k] >>= 8; } tables[i][j] = _mm_loadu_si128((__m128i *) btable); } } mask1 = _mm_set1_epi8(0xf); while (d64 != top) { if (xor) { for (i = 0; i < 8; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2)); } else { for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128(); } i = 0; for (k = 0; k < 8; k++) { v0 = _mm_load_si128((__m128i *) s64); /* MM_PRINT8("v", v0); */ s64 += 2; si = _mm_and_si128(v0, mask1); for (j = 0; j < 8; j++) { p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); } i++; v0 = _mm_srli_epi32(v0, 4); si = _mm_and_si128(v0, mask1); for (j = 0; j < 8; j++) { p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); } i++; } for (i = 0; i < 8; i++) { /* MM_PRINT8("v", p[i]); */ _mm_store_si128((__m128i *) d64, p[i]); d64 += 2; } } gf_do_final_region_alignment(&rd); } #endif #ifdef INTEL_SSE4 static void gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor) { gf_internal_t *h; int i, j, k; uint64_t pp, v, *s64, *d64, *top; __m128i si, tables[16][8], p[8], st[8], mask1, mask8, mask16, t1; struct gf_split_4_64_lazy_data *ld; uint8_t btable[16]; gf_region_data rd; if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } h = (gf_internal_t *) gf->scratch; pp = h->prim_poly; gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128); gf_do_initial_region_alignment(&rd); s64 = (uint64_t *) rd.s_start; d64 = (uint64_t *) rd.d_start; top = (uint64_t *) rd.d_top; ld = (struct gf_split_4_64_lazy_data *) h->private; v = val; for (i = 0; i < 16; i++) { ld->tables[i][0] = 0; for (j = 1; j < 16; j <<= 1) { for (k = 0; k < j; k++) { ld->tables[i][k^j] = (v ^ ld->tables[i][k]); } v = (v & GF_FIRST_BIT) ? 
((v << 1) ^ pp) : (v << 1); } for (j = 0; j < 8; j++) { for (k = 0; k < 16; k++) { btable[k] = (uint8_t) ld->tables[i][k]; ld->tables[i][k] >>= 8; } tables[i][j] = _mm_loadu_si128((__m128i *) btable); } } mask1 = _mm_set1_epi8(0xf); mask8 = _mm_set1_epi16(0xff); mask16 = _mm_set1_epi32(0xffff); while (d64 != top) { for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128(); for (k = 0; k < 8; k++) { st[k] = _mm_load_si128((__m128i *) s64); s64 += 2; } for (k = 0; k < 4; k ++) { st[k] = _mm_shuffle_epi32(st[k], _MM_SHUFFLE(3,1,2,0)); st[k+4] = _mm_shuffle_epi32(st[k+4], _MM_SHUFFLE(2,0,3,1)); t1 = _mm_blend_epi16(st[k], st[k+4], 0xf0); st[k] = _mm_srli_si128(st[k], 8); st[k+4] = _mm_slli_si128(st[k+4], 8); st[k+4] = _mm_blend_epi16(st[k], st[k+4], 0xf0); st[k] = t1; } /* printf("After pack pass 1\n"); for (k = 0; k < 8; k++) { MM_PRINT8("v", st[k]); } printf("\n"); */ t1 = _mm_packus_epi32(_mm_and_si128(st[0], mask16), _mm_and_si128(st[2], mask16)); st[2] = _mm_packus_epi32(_mm_srli_epi32(st[0], 16), _mm_srli_epi32(st[2], 16)); st[0] = t1; t1 = _mm_packus_epi32(_mm_and_si128(st[1], mask16), _mm_and_si128(st[3], mask16)); st[3] = _mm_packus_epi32(_mm_srli_epi32(st[1], 16), _mm_srli_epi32(st[3], 16)); st[1] = t1; t1 = _mm_packus_epi32(_mm_and_si128(st[4], mask16), _mm_and_si128(st[6], mask16)); st[6] = _mm_packus_epi32(_mm_srli_epi32(st[4], 16), _mm_srli_epi32(st[6], 16)); st[4] = t1; t1 = _mm_packus_epi32(_mm_and_si128(st[5], mask16), _mm_and_si128(st[7], mask16)); st[7] = _mm_packus_epi32(_mm_srli_epi32(st[5], 16), _mm_srli_epi32(st[7], 16)); st[5] = t1; /* printf("After pack pass 2\n"); for (k = 0; k < 8; k++) { MM_PRINT8("v", st[k]); } printf("\n"); */ t1 = _mm_packus_epi16(_mm_and_si128(st[0], mask8), _mm_and_si128(st[1], mask8)); st[1] = _mm_packus_epi16(_mm_srli_epi16(st[0], 8), _mm_srli_epi16(st[1], 8)); st[0] = t1; t1 = _mm_packus_epi16(_mm_and_si128(st[2], mask8), _mm_and_si128(st[3], mask8)); st[3] = _mm_packus_epi16(_mm_srli_epi16(st[2], 8), _mm_srli_epi16(st[3], 8)); st[2] = t1; t1 = _mm_packus_epi16(_mm_and_si128(st[4], mask8), _mm_and_si128(st[5], mask8)); st[5] = _mm_packus_epi16(_mm_srli_epi16(st[4], 8), _mm_srli_epi16(st[5], 8)); st[4] = t1; t1 = _mm_packus_epi16(_mm_and_si128(st[6], mask8), _mm_and_si128(st[7], mask8)); st[7] = _mm_packus_epi16(_mm_srli_epi16(st[6], 8), _mm_srli_epi16(st[7], 8)); st[6] = t1; /* printf("After final pack pass 2\n"); for (k = 0; k < 8; k++) { MM_PRINT8("v", st[k]); } */ i = 0; for (k = 0; k < 8; k++) { si = _mm_and_si128(st[k], mask1); for (j = 0; j < 8; j++) { p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); } i++; st[k] = _mm_srli_epi32(st[k], 4); si = _mm_and_si128(st[k], mask1); for (j = 0; j < 8; j++) { p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si)); } i++; } t1 = _mm_unpacklo_epi8(p[0], p[1]); p[1] = _mm_unpackhi_epi8(p[0], p[1]); p[0] = t1; t1 = _mm_unpacklo_epi8(p[2], p[3]); p[3] = _mm_unpackhi_epi8(p[2], p[3]); p[2] = t1; t1 = _mm_unpacklo_epi8(p[4], p[5]); p[5] = _mm_unpackhi_epi8(p[4], p[5]); p[4] = t1; t1 = _mm_unpacklo_epi8(p[6], p[7]); p[7] = _mm_unpackhi_epi8(p[6], p[7]); p[6] = t1; /* printf("After unpack pass 1:\n"); for (i = 0; i < 8; i++) { MM_PRINT8("v", p[i]); } */ t1 = _mm_unpacklo_epi16(p[0], p[2]); p[2] = _mm_unpackhi_epi16(p[0], p[2]); p[0] = t1; t1 = _mm_unpacklo_epi16(p[1], p[3]); p[3] = _mm_unpackhi_epi16(p[1], p[3]); p[1] = t1; t1 = _mm_unpacklo_epi16(p[4], p[6]); p[6] = _mm_unpackhi_epi16(p[4], p[6]); p[4] = t1; t1 = _mm_unpacklo_epi16(p[5], p[7]); p[7] = _mm_unpackhi_epi16(p[5], p[7]); p[5] 
= t1; /* printf("After unpack pass 2:\n"); for (i = 0; i < 8; i++) { MM_PRINT8("v", p[i]); } */ t1 = _mm_unpacklo_epi32(p[0], p[4]); p[4] = _mm_unpackhi_epi32(p[0], p[4]); p[0] = t1; t1 = _mm_unpacklo_epi32(p[1], p[5]); p[5] = _mm_unpackhi_epi32(p[1], p[5]); p[1] = t1; t1 = _mm_unpacklo_epi32(p[2], p[6]); p[6] = _mm_unpackhi_epi32(p[2], p[6]); p[2] = t1; t1 = _mm_unpacklo_epi32(p[3], p[7]); p[7] = _mm_unpackhi_epi32(p[3], p[7]); p[3] = t1; if (xor) { for (i = 0; i < 8; i++) { t1 = _mm_load_si128((__m128i *) d64); _mm_store_si128((__m128i *) d64, _mm_xor_si128(p[i], t1)); d64 += 2; } } else { for (i = 0; i < 8; i++) { _mm_store_si128((__m128i *) d64, p[i]); d64 += 2; } } } gf_do_final_region_alignment(&rd); } #endif #define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1); static int gf_w64_split_init(gf_t *gf) { gf_internal_t *h; struct gf_split_4_64_lazy_data *d4; struct gf_split_8_64_lazy_data *d8; struct gf_split_8_8_data *d88; struct gf_split_16_64_lazy_data *d16; uint64_t p, basep; int exp, i, j; h = (gf_internal_t *) gf->scratch; /* Defaults */ gf->multiply_region.w64 = gf_w64_multiply_region_from_single; gf->multiply.w64 = gf_w64_bytwo_p_multiply; #if defined(INTEL_SSE4_PCLMUL) if ((!(h->region_type & GF_REGION_NOSIMD) && (h->arg1 == 64 || h->arg2 == 64)) || h->mult_type == GF_MULT_DEFAULT){ if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ gf->multiply.w64 = gf_w64_clm_multiply_2; gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2; }else if((0xfffe000000000000ULL & h->prim_poly) == 0){ gf->multiply.w64 = gf_w64_clm_multiply_4; gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4; }else{ return 0; } } #endif gf->inverse.w64 = gf_w64_euclid; /* Allen: set region pointers for default mult type. Single pointers are * taken care of above (explicitly for sse, implicitly for no sse). 
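   * With SSE4 or an aarch64 NEON build, GF_MULT_DEFAULT gets the 4,64
   * split-table region routine; otherwise it falls back to the 8,64 lazy
   * split-table routine.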
*/ #if defined(INTEL_SSE4) || defined(ARCH_AARCH64) if (h->mult_type == GF_MULT_DEFAULT) { d4 = (struct gf_split_4_64_lazy_data *) h->private; d4->last_value = 0; #if defined(INTEL_SSE4) gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; #elif defined(ARCH_AARCH64) gf_w64_neon_split_init(gf); #endif } #else if (h->mult_type == GF_MULT_DEFAULT) { d8 = (struct gf_split_8_64_lazy_data *) h->private; d8->last_value = 0; gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region; } #endif if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) { d4 = (struct gf_split_4_64_lazy_data *) h->private; d4->last_value = 0; if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSIMD)) return 0; if(h->region_type & GF_REGION_ALTMAP) { #ifdef INTEL_SSSE3 gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_altmap_multiply_region; #elif defined(ARCH_AARCH64) gf_w64_neon_split_init(gf); #else return 0; #endif } else //no altmap { #if defined(INTEL_SSE4) || defined(ARCH_AARCH64) if(h->region_type & GF_REGION_NOSIMD) gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; else #if defined(INTEL_SSE4) gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; #elif defined(ARCH_AARCH64) gf_w64_neon_split_init(gf); #endif #else gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; if(h->region_type & GF_REGION_SIMD) return 0; #endif } } if ((h->arg1 == 8 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 8)) { d8 = (struct gf_split_8_64_lazy_data *) h->private; d8->last_value = 0; gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region; } if ((h->arg1 == 16 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 16)) { d16 = (struct gf_split_16_64_lazy_data *) h->private; d16->last_value = 0; gf->multiply_region.w64 = gf_w64_split_16_64_lazy_multiply_region; } if ((h->arg1 == 8 && h->arg2 == 8)) { d88 = (struct gf_split_8_8_data *) h->private; gf->multiply.w64 = gf_w64_split_8_8_multiply; /* The performance of this guy sucks, so don't bother with a region op */ basep = 1; for (exp = 0; exp < 15; exp++) { for (j = 0; j < 256; j++) d88->tables[exp][0][j] = 0; for (i = 0; i < 256; i++) d88->tables[exp][i][0] = 0; d88->tables[exp][1][1] = basep; for (i = 2; i < 256; i++) { if (i&1) { p = d88->tables[exp][i^1][1]; d88->tables[exp][i][1] = p ^ basep; } else { p = d88->tables[exp][i>>1][1]; d88->tables[exp][i][1] = GF_MULTBY_TWO(p); } } for (i = 1; i < 256; i++) { p = d88->tables[exp][i][1]; for (j = 1; j < 256; j++) { if (j&1) { d88->tables[exp][i][j] = d88->tables[exp][i][j^1] ^ p; } else { d88->tables[exp][i][j] = GF_MULTBY_TWO(d88->tables[exp][i][j>>1]); } } } for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep); } } return 1; } int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) { switch(mult_type) { case GF_MULT_SHIFT: return sizeof(gf_internal_t); break; case GF_MULT_CARRY_FREE: return sizeof(gf_internal_t); break; case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: return sizeof(gf_internal_t); break; case GF_MULT_DEFAULT: /* Allen: set the *local* arg1 and arg2, just for scratch size purposes, * then fall through to split table scratch size code. 
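   * Note that there is deliberately no break here: GF_MULT_DEFAULT reuses the
   * GF_MULT_SPLIT_TABLE sizing below.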
*/ #if defined(INTEL_SSE4) || defined(ARCH_AARCH64) arg1 = 64; arg2 = 4; #else arg1 = 64; arg2 = 8; #endif case GF_MULT_SPLIT_TABLE: if (arg1 == 8 && arg2 == 8) { return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64; } if ((arg1 == 16 && arg2 == 64) || (arg2 == 16 && arg1 == 64)) { return sizeof(gf_internal_t) + sizeof(struct gf_split_16_64_lazy_data) + 64; } if ((arg1 == 8 && arg2 == 64) || (arg2 == 8 && arg1 == 64)) { return sizeof(gf_internal_t) + sizeof(struct gf_split_8_64_lazy_data) + 64; } if ((arg1 == 64 && arg2 == 4) || (arg1 == 4 && arg2 == 64)) { return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64; } return 0; case GF_MULT_GROUP: return sizeof(gf_internal_t) + sizeof(struct gf_w64_group_data) + sizeof(uint64_t) * (1 << arg1) + sizeof(uint64_t) * (1 << arg2) + 64; break; case GF_MULT_COMPOSITE: if (arg1 == 2) return sizeof(gf_internal_t) + 64; return 0; break; default: return 0; } } int gf_w64_init(gf_t *gf) { gf_internal_t *h; h = (gf_internal_t *) gf->scratch; /* Allen: set default primitive polynomial / irreducible polynomial if needed */ /* Omitting the leftmost 1 as in w=32 */ if (h->prim_poly == 0) { if (h->mult_type == GF_MULT_COMPOSITE) { h->prim_poly = gf_composite_get_default_poly(h->base_gf); if (h->prim_poly == 0) return 0; /* This shouldn't happen */ } else { h->prim_poly = 0x1b; } } gf->multiply.w64 = NULL; gf->divide.w64 = NULL; gf->inverse.w64 = NULL; gf->multiply_region.w64 = NULL; switch(h->mult_type) { case GF_MULT_CARRY_FREE: if (gf_w64_cfm_init(gf) == 0) return 0; break; case GF_MULT_SHIFT: if (gf_w64_shift_init(gf) == 0) return 0; break; case GF_MULT_COMPOSITE: if (gf_w64_composite_init(gf) == 0) return 0; break; case GF_MULT_DEFAULT: case GF_MULT_SPLIT_TABLE: if (gf_w64_split_init(gf) == 0) return 0; break; case GF_MULT_GROUP: if (gf_w64_group_init(gf) == 0) return 0; break; case GF_MULT_BYTWO_p: case GF_MULT_BYTWO_b: if (gf_w64_bytwo_init(gf) == 0) return 0; break; default: return 0; } if (h->divide_type == GF_DIVIDE_EUCLID) { gf->divide.w64 = gf_w64_divide_from_inverse; gf->inverse.w64 = gf_w64_euclid; } if (gf->inverse.w64 != NULL && gf->divide.w64 == NULL) { gf->divide.w64 = gf_w64_divide_from_inverse; } if (gf->inverse.w64 == NULL && gf->divide.w64 != NULL) { gf->inverse.w64 = gf_w64_inverse_from_divide; } if (h->region_type == GF_REGION_CAUCHY) return 0; if (h->region_type & GF_REGION_ALTMAP) { if (h->mult_type == GF_MULT_COMPOSITE) { gf->extract_word.w64 = gf_w64_composite_extract_word; } else if (h->mult_type == GF_MULT_SPLIT_TABLE) { gf->extract_word.w64 = gf_w64_split_extract_word; } } else { gf->extract_word.w64 = gf_w64_extract_word; } return 1; }
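
/*
 * Illustrative usage sketch (not part of the original source): gf_w64_init()
 * is normally reached through the library's front-end initializers rather
 * than called directly.  Assuming the standard gf_init_easy() entry point
 * declared in gf_complete.h, a 64-bit field would typically be used like
 * this:
 *
 *   gf_t gf;
 *   uint64_t a = 5, b = 7, c;
 *
 *   if (gf_init_easy(&gf, 64)) {
 *     c = gf.multiply.w64(&gf, a, b);                       // c = a * b in GF(2^64)
 *     gf.multiply_region.w64(&gf, src, dst, b, bytes, 1);   // dst ^= b * src over a byte region
 *   }
 */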