/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * Routines for 128-bit Galois fields
 */

#define GF_FIELD_WIDTH (128)

  if (a[1] & 1ULL << 63) a[0] ^= 1; \

#define a_get_b(a, i, b, j) {\
  a[i] = b[j]; \
  a[i + 1] = b[j + 1]; }

#define set_zero(a, i) {\
  a[i] = 0; \
  a[i + 1] = 0; }

struct gf_w128_split_4_128_data {
  uint64_t last_value[2];
  uint64_t tables[2][32][16];
};

struct gf_w128_split_8_128_data {
  uint64_t last_value[2];
  uint64_t tables[2][16][256];
};

typedef struct gf_group_tables_s {
  gf_val_64_t *m_table;
  gf_val_64_t *r_table;
} gf_group_tables_t;

#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }
gf_w128_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,

  /* We only do this to check on alignment. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

  if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  s128 = (gf_val_128_t) src;
  d128 = (gf_val_128_t) dest;

  for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
    gf->multiply.w128(gf, &s128[i], val, c128);

  for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
    gf->multiply.w128(gf, &s128[i], val, &d128[i]);
#if defined(INTEL_SSE4_PCLMUL)

gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,

  __m128i result0,result1;

  gf_internal_t * h = gf->scratch;
  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);

  /* We only do this to check on alignment. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

  if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  s128 = (gf_val_128_t) src;
  d128 = (gf_val_128_t) dest;

  for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
    a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
    b = _mm_insert_epi64 (a, val[1], 0);
    a = _mm_insert_epi64 (a, s128[i], 1);
    b = _mm_insert_epi64 (b, val[0], 1);

    c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
    f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
    e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
    d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
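
    /*
     * The four 64x64 carry-free products assemble into the 256-bit product
     * c ^ ((e ^ f) << 64) ^ (d << 128).  The code below collects its upper
     * 128 bits in result0 and its lower 128 bits in result1.
     */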
    /* now reusing a and b as temporary variables*/
    result0 = _mm_setzero_si128();

    result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
    a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
    result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));

    a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
    result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
    result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
    /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce. */
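    /*
     * Reduction: fold result0 (the bits above position 127) back into
     * result1 with two carry-free multiplications by the primitive
     * polynomial, first for the top 64 bits and then for the remaining 64.
     */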
    a = _mm_srli_si128 (result0, 8);
    b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
    result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
    result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));

    a = _mm_insert_epi64 (result0, 0, 1);
    b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
    result1 = _mm_xor_si128 (result1, b);
    d128[i] ^= (uint64_t)_mm_extract_epi64(result1,1);
    d128[i+1] ^= (uint64_t)_mm_extract_epi64(result1,0);

  for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
    a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
    b = _mm_insert_epi64 (a, val[1], 0);
    a = _mm_insert_epi64 (a, s128[i], 1);
    b = _mm_insert_epi64 (b, val[0], 1);

    c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
    f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
    e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
    d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/

    /* now reusing a and b as temporary variables*/
    result0 = _mm_setzero_si128();

    result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
    a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
    result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));

    a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
    result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
    result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
    /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/

    a = _mm_srli_si128 (result0, 8);
    b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
    result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
    result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));

    a = _mm_insert_epi64 (result0, 0, 1);
    b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
    result1 = _mm_xor_si128 (result1, b);
    d128[i] = (uint64_t)_mm_extract_epi64(result1,1);
    d128[i+1] = (uint64_t)_mm_extract_epi64(result1,0);

 * --return values allocated beforehand

#define GF_W128_IS_ZERO(val) (val[0] == 0 && val[1] == 0)
gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)

  /* ordered highest bit to lowest l[0] l[1] r[0] r[1] */
  uint64_t pl[2], pr[2], ppl[2], ppr[2], i, a[2], bl[2], br[2], one, lbit;

  h = (gf_internal_t *) gf->scratch;

  if (GF_W128_IS_ZERO(a128) || GF_W128_IS_ZERO(b128)) {

  a_get_b(a, 0, a128, 0);
  a_get_b(br, 0, b128, 0);

  /* Allen: a*b for right half of a */
  for (i = 0; i < GF_FIELD_WIDTH/2; i++) {
    if (a[1] & (one << i)) {

    if (br[0] & lbit) bl[1] ^= 1;

    if (br[1] & lbit) br[0] ^= 1;

  /* Allen: a*b for left half of a */
  for (i = 0; i < GF_FIELD_WIDTH/2; i++) {
    if (a[0] & (one << i)) {

    if (bl[1] & lbit) bl[0] ^= 1;

    if (br[0] & lbit) bl[1] ^= 1;

  /* Allen: do first half of reduction (based on left quarter of initial product) */

  ppl[0] = one; /* Allen: introduce leading one of primitive polynomial */
  ppl[1] = h->prim_poly >> 2;
  ppr[0] = h->prim_poly << (GF_FIELD_WIDTH/2-2);

    if (ppr[0] & 1) ppr[1] ^= lbit;

    if (ppl[1] & 1) ppr[0] ^= lbit;

    if (ppl[0] & 1) ppl[1] ^= lbit;

  /* Allen: final half of reduction */

    if (ppr[0] & 1) ppr[1] ^= lbit;

    if (ppl[1] & 1) ppr[0] ^= lbit;
  /* Allen: if we really wanted to optimize this, we could just use c128 instead of pr all along */
#if defined(INTEL_SSE4_PCLMUL)

gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)

  __m128i result0,result1;

  gf_internal_t * h = gf->scratch;

  a = _mm_insert_epi64 (_mm_setzero_si128(), a128[1], 0);
  b = _mm_insert_epi64 (a, b128[1], 0);
  a = _mm_insert_epi64 (a, a128[0], 1);
  b = _mm_insert_epi64 (b, b128[0], 1);

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);

  /* we need to test algorithm 2 later*/
  c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
  f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
  e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
  d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/

  /* now reusing a and b as temporary variables*/
  result0 = _mm_setzero_si128();

  result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
  a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
  result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));

  a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
  result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
  result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
  /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/

  a = _mm_srli_si128 (result0, 8);
  b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
  result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
  result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));

  a = _mm_insert_epi64 (result0, 0, 1);
  b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
  result1 = _mm_xor_si128 (result1, b);

  c128[0] = (uint64_t)_mm_extract_epi64(result1,1);
  c128[1] = (uint64_t)_mm_extract_epi64(result1,0);
gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)

  uint64_t amask[2], pmask, pp, prod[2]; /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
  uint64_t topbit; /* this is used as a boolean value */

  h = (gf_internal_t *) gf->scratch;

  pmask = 0x8000000000000000ULL;
  amask[0] = 0x8000000000000000ULL;
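
  /*
   * Horner-style "by-two" loop: scan the bits of a from most significant to
   * least significant.  Each step doubles prod (a 128-bit left shift, with
   * the primitive polynomial XORed in when the top bit falls off) and XORs
   * b into prod whenever the current bit of a is set.
   */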
  while (amask[1] != 0 || amask[0] != 0) {
    topbit = (prod[0] & pmask);

    if (prod[1] & pmask) prod[0] ^= 1;

    if (topbit) prod[1] ^= pp;
    if ((a128[0] & amask[0]) || (a128[1] & amask[1])) {

    if (amask[0] & 1) amask[1] ^= pmask;

#if defined(INTEL_SSE4)

gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)

  __m128i a, b, pp, prod, amask, u_middle_one;
  /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
  uint32_t topbit, middlebit, pmask; /* topbit and middlebit are used as boolean values */
  h = (gf_internal_t *) gf->scratch;
  pp = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
  prod = _mm_setzero_si128();
  a = _mm_insert_epi64(prod, a128[1], 0x0);
  a = _mm_insert_epi64(a, a128[0], 0x1);
  b = _mm_insert_epi64(prod, b128[1], 0x0);
  b = _mm_insert_epi64(b, b128[0], 0x1);

  amask = _mm_insert_epi32(prod, 0x80000000, 0x3);
  u_middle_one = _mm_insert_epi32(prod, 1, 0x2);

  for (i = 0; i < 64; i++) {
    topbit = (_mm_extract_epi32(prod, 0x3) & pmask);
    middlebit = (_mm_extract_epi32(prod, 0x1) & pmask);
    prod = _mm_slli_epi64(prod, 1); /* this instruction loses the middle bit */

      prod = _mm_xor_si128(prod, u_middle_one);

      prod = _mm_xor_si128(prod, pp);

    if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 1))) {
      prod = _mm_xor_si128(prod, b);

    amask = _mm_srli_epi64(amask, 1); /*so does this one, but we can just replace after loop*/

  amask = _mm_insert_epi32(amask, (gf_val_32_t)1 << 31, 0x1);
  for (i = 64; i < 128; i++) {
    topbit = (_mm_extract_epi32(prod, 0x3) & pmask);
    middlebit = (_mm_extract_epi32(prod, 0x1) & pmask);
    prod = _mm_slli_epi64(prod, 1);
    if (middlebit) prod = _mm_xor_si128(prod, u_middle_one);
    if (topbit) prod = _mm_xor_si128(prod, pp);
    if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 0))) {
      prod = _mm_xor_si128(prod, b);

    amask = _mm_srli_epi64(amask, 1);

  c128[0] = (uint64_t)_mm_extract_epi64(prod, 1);
  c128[1] = (uint64_t)_mm_extract_epi64(prod, 0);
/* Ben: This slow function implements sse instructions for bytwo_b because why not */
#if defined(INTEL_SSE4)

gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)

  __m128i a, b, lmask, hmask, pp, c, middle_one;

  uint64_t topbit, middlebit;

  h = (gf_internal_t *) gf->scratch;

  c = _mm_setzero_si128();
  lmask = _mm_insert_epi64(c, 1ULL << 63, 0);
  hmask = _mm_insert_epi64(c, 1ULL << 63, 1);
  b = _mm_insert_epi64(c, a128[0], 1);
  b = _mm_insert_epi64(b, a128[1], 0);
  a = _mm_insert_epi64(c, b128[0], 1);
  a = _mm_insert_epi64(a, b128[1], 0);
  pp = _mm_insert_epi64(c, h->prim_poly, 0);
  middle_one = _mm_insert_epi64(c, 1, 0x1);

    if (_mm_extract_epi32(a, 0x0) & 1) {
      c = _mm_xor_si128(c, b);

    middlebit = (_mm_extract_epi32(a, 0x2) & 1);
    a = _mm_srli_epi64(a, 1);
    if (middlebit) a = _mm_xor_si128(a, lmask);
    if ((_mm_extract_epi64(a, 0x1) == 0ULL) && (_mm_extract_epi64(a, 0x0) == 0ULL)){
      c128[0] = _mm_extract_epi64(c, 0x1);
      c128[1] = _mm_extract_epi64(c, 0x0);

    topbit = (_mm_extract_epi64(_mm_and_si128(b, hmask), 1));
    middlebit = (_mm_extract_epi64(_mm_and_si128(b, lmask), 0));
    b = _mm_slli_epi64(b, 1);
    if (middlebit) b = _mm_xor_si128(b, middle_one);
    if (topbit) b = _mm_xor_si128(b, pp);
gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)

  uint64_t a[2], b[2], c[2];

  h = (gf_internal_t *) gf->scratch;

  bmask = (1ULL << 63);
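
  /*
   * Scalar "by-two-b" loop: scan one operand bit by bit from the
   * least-significant end; whenever its low bit is set, XOR the other
   * operand into the accumulator c.  Then halve the scanned operand and
   * double the other modulo the primitive polynomial, stopping once the
   * scanned operand reaches zero.
   */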
    if (a[0] & 1) a[1] ^= bmask;

    if (a[1] == 0 && a[0] == 0) {

    if (b[1] & bmask) b[0] ^= 1;

    if (pp) b[1] ^= h->prim_poly;
gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)

  uint64_t *s64, *d64, *top;

  struct gf_w128_split_4_128_data *ld;

  /* We only do this to check on alignment. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

  if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  ld = (struct gf_w128_split_4_128_data *) h->private;

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
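    /*
     * Rebuild the lookup tables whenever val changes: tables[0][i][j] and
     * tables[1][i][j] hold the two 64-bit words of (j << (4*i)) * val over
     * GF(2^128), so a 128-bit source word is multiplied by accumulating one
     * table entry per 4-bit nibble.
     */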
    for (i = 0; i < 32; i++) {
      ld->tables[0][i][0] = 0;
      ld->tables[1][i][0] = 0;
      for (j = 1; j < 16; j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);

      pp = (v[0] & (1ULL << 63));

      if (v[1] & (1ULL << 63)) v[0] ^= 1;

      if (pp) v[1] ^= h->prim_poly;

    ld->last_value[0] = val[0];
    ld->last_value[1] = val[1];

  for (i = 0; i < 32; i++) {
    for (j = 0; j < 16; j++) {
      printf("%2d %2d %016llx %016llx\n", i, j, ld->tables[0][i][j], ld->tables[1][i][j]);

    v[0] = (xor) ? d64[0] : 0;
    v[1] = (xor) ? d64[1] : 0;

      v[0] ^= ld->tables[0][i][s&0xf];
      v[1] ^= ld->tables[1][i][s&0xf];

      v[0] ^= ld->tables[0][i][s&0xf];
      v[1] ^= ld->tables[1][i][s&0xf];
#if defined(INTEL_SSSE3) && defined(INTEL_SSE4)

gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)

  uint64_t pp, v[2], s, *s64, *d64, *top;
  __m128i p, tables[32][16];
  struct gf_w128_split_4_128_data *ld;

  if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;

  /* We only do this to check on alignment. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 16);
  /* Doing this instead of gf_do_initial_region_alignment() because that doesn't handle 128-bit vals */
  gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  ld = (struct gf_w128_split_4_128_data *) h->private;

  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {

    for (i = 0; i < 32; i++) {
      ld->tables[0][i][0] = 0;
      ld->tables[1][i][0] = 0;
      for (j = 1; j < 16; j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);

      pp = (v[0] & (1ULL << 63));

      if (v[1] & (1ULL << 63)) v[0] ^= 1;

      if (pp) v[1] ^= h->prim_poly;

  ld->last_value[0] = val[0];
  ld->last_value[1] = val[1];

  for (i = 0; i < 32; i++) {
    for (j = 0; j < 16; j++) {
      v[0] = ld->tables[0][i][j];
      v[1] = ld->tables[1][i][j];
      tables[i][j] = _mm_loadu_si128((__m128i *) v);

      printf("%2d %2d: ", i, j);
      MM_PRINT8("", tables[i][j]); */

    p = _mm_load_si128 ((__m128i *) d64);

    p = _mm_setzero_si128();

    for (i = 0; i < 16; i++) {

      p = _mm_xor_si128(p, tables[16+i][j]);

    for (i = 0; i < 16; i++) {

      p = _mm_xor_si128(p, tables[i][j]);

    _mm_store_si128((__m128i *) d64, p);
  /* Doing this instead of gf_do_final_region_alignment() because that doesn't handle 128-bit vals */
  gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
#if defined(INTEL_SSSE3) && defined(INTEL_SSE4)

gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)

  uint64_t pp, v[2], *s64, *d64, *top;
  __m128i si, tables[32][16], p[16], v0, mask1;
  struct gf_w128_split_4_128_data *ld;

  if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;

  /* We only do this to check on alignment. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 256);
  /* Doing this instead of gf_do_initial_region_alignment() because that doesn't handle 128-bit vals */
  gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  ld = (struct gf_w128_split_4_128_data *) h->private;

  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {

    for (i = 0; i < 32; i++) {
      ld->tables[0][i][0] = 0;
      ld->tables[1][i][0] = 0;
      for (j = 1; j < 16; j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);

      pp = (v[0] & (1ULL << 63));

      if (v[1] & (1ULL << 63)) v[0] ^= 1;

      if (pp) v[1] ^= h->prim_poly;

  ld->last_value[0] = val[0];
  ld->last_value[1] = val[1];

  for (i = 0; i < 32; i++) {
    for (j = 0; j < 16; j++) {
      for (k = 0; k < 16; k++) {
        btable[k] = (uint8_t) ld->tables[1-(j/8)][i][k];
        ld->tables[1-(j/8)][i][k] >>= 8;

      tables[i][j] = _mm_loadu_si128((__m128i *) btable);

      printf("%2d %2d: ", i, j);
      MM_PRINT8("", tables[i][j]);

  mask1 = _mm_set1_epi8(0xf);
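
  /*
   * SSSE3/ALTMAP path: each of the 32 nibble tables is byte-sliced into 16
   * PSHUFB tables (one per output byte), so a single _mm_shuffle_epi8 yields
   * 16 product bytes at once.  The accumulators p[0..15] each gather one
   * byte position of the products across the byte-sliced (alternate) layout.
   */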
    for (i = 0; i < 16; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2));

    for (i = 0; i < 16; i++) p[i] = _mm_setzero_si128();

    for (k = 0; k < 16; k++) {
      v0 = _mm_load_si128((__m128i *) s64);

      si = _mm_and_si128(v0, mask1);

      for (j = 0; j < 16; j++) {
        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));

      v0 = _mm_srli_epi32(v0, 4);
      si = _mm_and_si128(v0, mask1);
      for (j = 0; j < 16; j++) {
        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));

    for (i = 0; i < 16; i++) {
      _mm_store_si128((__m128i *) d64, p[i]);
  /* Doing this instead of gf_do_final_region_alignment() because that doesn't handle 128-bit vals */
  gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
gf_w128_split_8_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)

  uint64_t *s64, *d64, *top;

  struct gf_w128_split_8_128_data *ld;

  /* Check on alignment. Ignore it otherwise. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

  if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  ld = (struct gf_w128_split_8_128_data *) h->private;

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {

    for (i = 0; i < 16; i++) {
      ld->tables[0][i][0] = 0;
      ld->tables[1][i][0] = 0;
      for (j = 1; j < (1 << 8); j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);

      pp = (v[0] & (1ULL << 63));

      if (v[1] & (1ULL << 63)) v[0] ^= 1;

      if (pp) v[1] ^= h->prim_poly;

  ld->last_value[0] = val[0];
  ld->last_value[1] = val[1];

    v[0] = (xor) ? d64[0] : 0;
    v[1] = (xor) ? d64[1] : 0;

      v[0] ^= ld->tables[0][i][s&0xff];
      v[1] ^= ld->tables[1][i][s&0xff];

      v[0] ^= ld->tables[0][i][s&0xff];
      v[1] ^= ld->tables[1][i][s&0xff];
gf_w128_bytwo_b_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)

  uint64_t a[2], c[2], b[2], *s64, *d64, *top;

  /* We only do this to check on alignment. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

  if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;
  bmask = (1ULL << 63);

      if (a[0] & 1) a[1] ^= bmask;

      if (b[1] & bmask) b[0] ^= 1;

      if (pp) b[1] ^= h->prim_poly;

      if (a[1] == 0) break;

      if (b[1] & bmask) b[0] ^= 1;

      if (pp) b[1] ^= h->prim_poly;
void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128)

  uint64_t prim_poly, lbit;
  gf_internal_t *scratch;
  gf_group_tables_t *gt;

  scratch = (gf_internal_t *) gf->scratch;
  gt = scratch->private;

  prim_poly = scratch->prim_poly;
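
  /*
   * m_table holds the products 0*b, 1*b, ..., (2^g_m - 1)*b as pairs of
   * 64-bit words (entry i occupies m_table[2*i] and m_table[2*i+1]).
   * Powers-of-two entries are produced by doubling the previous one, with
   * reduction by the primitive polynomial; the rest are XOR combinations.
   */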
  set_zero(gt->m_table, 0);
  a_get_b(gt->m_table, 2, b128, 0);

  for (i = 2; i < (1 << g_m); i <<= 1) {
    a_get_b(a128, 0, gt->m_table, 2 * (i >> 1));

    a_get_b(gt->m_table, 2 * i, a128, 0);
    if (gt->m_table[2 * (i >> 1)] & lbit) gt->m_table[(2 * i) + 1] ^= prim_poly;
    for (j = 0; j < i; j++) {
      gt->m_table[(2 * i) + (2 * j)] = gt->m_table[(2 * i)] ^ gt->m_table[(2 * j)];
      gt->m_table[(2 * i) + (2 * j) + 1] = gt->m_table[(2 * i) + 1] ^ gt->m_table[(2 * j) + 1];
gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)

  /* index_r, index_m, total_m (if g_r > g_m) */

  uint64_t p_i[2], a[2];
  gf_internal_t *scratch;
  gf_group_tables_t *gt;

  scratch = (gf_internal_t *) gf->scratch;
  gt = scratch->private;

  mask_m = (1 << g_m) - 1;
  mask_r = (1 << g_r) - 1;
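
  /*
   * Group multiplication: consume a in g_m-bit chunks from the top.  Each
   * step shifts the 128-bit partial product p_i left by g_m bits, collects
   * the bits that fall off in i_r, XORs in the m_table entry for the current
   * chunk, and folds the overflow back in through r_table.
   */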
  if (b128[0] != gt->m_table[2] || b128[1] != gt->m_table[3]) {
    gf_w128_group_m_init(gf, b128);

  for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
    i_m = (a[0] >> (i * g_m)) & mask_m;
    i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;

    p_i[0] ^= (p_i[1] >> (64-g_m));

    p_i[0] ^= gt->m_table[2 * i_m];
    p_i[1] ^= gt->m_table[(2 * i_m) + 1];

    p_i[1] ^= gt->r_table[i_r];

  for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
    i_m = (a[1] >> (i * g_m)) & mask_m;
    i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;

    p_i[0] ^= (p_i[1] >> (64-g_m));

    p_i[0] ^= gt->m_table[2 * i_m];
    p_i[1] ^= gt->m_table[(2 * i_m) + 1];

    p_i[1] ^= gt->r_table[i_r];
gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)

  uint64_t p_i[2], a[2];
  gf_internal_t *scratch;
  gf_group_tables_t *gt;

  uint64_t *a128, *c128, *top;

  /* We only do this to check on alignment. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

  if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  scratch = (gf_internal_t *) gf->scratch;
  gt = scratch->private;
  g_m = scratch->arg1;
  g_r = scratch->arg2;

  mask_m = (1 << g_m) - 1;
  mask_r = (1 << g_r) - 1;

  if (val[0] != gt->m_table[2] || val[1] != gt->m_table[3]) {
    gf_w128_group_m_init(gf, val);

  a128 = (uint64_t *) src;
  c128 = (uint64_t *) dest;
  top = (uint64_t *) rd.d_top;

  while (c128 < top) {

    for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
      i_m = (a[0] >> (i * g_m)) & mask_m;
      i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;

      p_i[0] ^= (p_i[1] >> (64-g_m));

      p_i[0] ^= gt->m_table[2 * i_m];
      p_i[1] ^= gt->m_table[(2 * i_m) + 1];

      p_i[1] ^= gt->r_table[i_r];

    for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
      i_m = (a[1] >> (i * g_m)) & mask_m;
      i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;

      p_i[0] ^= (p_i[1] >> (64-g_m));

      p_i[0] ^= gt->m_table[2 * i_m];
      p_i[1] ^= gt->m_table[(2 * i_m) + 1];

      p_i[1] ^= gt->r_table[i_r];
gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)

  uint64_t e_i[2], e_im1[2], e_ip1[2];
  uint64_t d_i, d_im1, d_ip1;
  uint64_t y_i[2], y_im1[2], y_ip1[2];
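
  /*
   * Extended Euclidean algorithm over GF(2)[x]: e_* are the successive
   * remainders (starting from the field polynomial and a), y_* the
   * corresponding Bezout coefficients, and d_* the degrees of the
   * remainders.  When the remainder reaches 1, y_i is the inverse of a128.
   */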
  /* This needs to return some sort of error (in b128?) */
  if (a128[0] == 0 && a128[1] == 0) return;

  b = (uint64_t *) b128;

  e_im1[1] = ((gf_internal_t *) (gf->scratch))->prim_poly;

  //Allen: I think d_i starts at 63 here, and checks each bit of a, starting at MSB, looking for the first nonzero bit
  //so d_i should be 0 if this half of a is all 0s; otherwise it should be the position from the right of the first-from-left nonzero bit of this half of a.
  //BUT if d_i is 0 at end we won't know yet if the rightmost bit of this half is 1 or not

  for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[0]) == 0 && d_i > 0; d_i--) ;

  //Allen: this is testing just the first half of the stop condition above, so if it holds we know we did not find a nonzero bit yet

  if (!((one << d_i) & e_i[0])) {

    //Allen: this is doing the same thing on the other half of a. In other words, we're still searching for a nonzero bit of a.
    // but not bothering to test if d_i hits zero, which is fine because we've already tested for a=0.

    for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[1]) == 0; d_i--) ;

  //Allen: if a 1 was found in more-significant half of a, make d_i the ACTUAL index of the first nonzero bit in the entire a.
  while (!(e_i[0] == 0 && e_i[1] == 1)) {

    e_ip1[0] = e_im1[0];
    e_ip1[1] = e_im1[1];

    while (d_ip1 >= d_i) {
      if ((d_ip1 - d_i) >= 64) {
        c_i[0] ^= (one << ((d_ip1 - d_i) - 64));
        e_ip1[0] ^= (e_i[1] << ((d_ip1 - d_i) - 64));

        c_i[1] ^= (one << (d_ip1 - d_i));
        e_ip1[0] ^= (e_i[0] << (d_ip1 - d_i));
        if (d_ip1 - d_i > 0) e_ip1[0] ^= (e_i[1] >> (64 - (d_ip1 - d_i)));
        e_ip1[1] ^= (e_i[1] << (d_ip1 - d_i));

      if (e_ip1[0] == 0 && e_ip1[1] == 0) { b[0] = 0; b[1] = 0; return; }
      while (d_ip1 >= 64 && (e_ip1[0] & (one << (d_ip1 - 64))) == 0) d_ip1--;
      while (d_ip1 < 64 && (e_ip1[1] & (one << d_ip1)) == 0) d_ip1--;

    gf->multiply.w128(gf, c_i, y_i, y_ip1);
    y_ip1[0] ^= y_im1[0];
    y_ip1[1] ^= y_im1[1];
gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)

  gf->inverse.w128(gf, b128, d);
  gf->multiply.w128(gf, a128, d, c128);

gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)

  gf->divide.w128(gf, one128, a128, b128);

gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv)

  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;

  uint64_t c0, c1, d, tmp;
  uint64_t a0inv, a1inv;

    a1inv = base_gf->inverse.w64(base_gf, a1);
    c0 = base_gf->multiply.w64(base_gf, a1inv, h->prim_poly);

  } else if (a1 == 0) {
    c0 = base_gf->inverse.w64(base_gf, a0);

    a1inv = base_gf->inverse.w64(base_gf, a1);
    a0inv = base_gf->inverse.w64(base_gf, a0);

    d = base_gf->multiply.w64(base_gf, a1, a0inv);

    tmp = (base_gf->multiply.w64(base_gf, a1, a0inv) ^ base_gf->multiply.w64(base_gf, a0, a1inv) ^ h->prim_poly);
    tmp = base_gf->inverse.w64(base_gf, tmp);

    d = base_gf->multiply.w64(base_gf, d, tmp);

    c0 = base_gf->multiply.w64(base_gf, (d^1), a0inv);
    c1 = base_gf->multiply.w64(base_gf, d, a1inv);

gf_w128_composite_multiply(gf_t *gf, gf_val_128_t a, gf_val_128_t b, gf_val_128_t rv)

  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
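
  /*
   * Composite field GF((2^64)^2): operands are a1*X + a0 with 64-bit
   * coordinates, and the product below corresponds to reduction modulo
   * X^2 + s*X + 1 with s = h->prim_poly, giving
   * c0 = a0*b0 ^ a1*b1 and c1 = a1*b0 ^ a0*b1 ^ s*a1*b1.
   */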
  a1b1 = base_gf->multiply.w64(base_gf, a1, b1);

  rv[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
  rv[0] = base_gf->multiply.w64(base_gf, a1, b0) ^
          base_gf->multiply.w64(base_gf, a0, b1) ^
          base_gf->multiply.w64(base_gf, a1b1, h->prim_poly);
gf_w128_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)

  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint64_t b0 = val[1];
  uint64_t b1 = val[0];
  uint64_t *s64, *d64;

  uint64_t a0, a1, a1b1;

  if (val[0] == 0 && val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

      a1b1 = base_gf->multiply.w64(base_gf, a1, b1);

      d64[1] ^= (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
      d64[0] ^= (base_gf->multiply.w64(base_gf, a1, b0) ^
                 base_gf->multiply.w64(base_gf, a0, b1) ^
                 base_gf->multiply.w64(base_gf, a1b1, h->prim_poly));

      a1b1 = base_gf->multiply.w64(base_gf, a1, b1);

      d64[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
      d64[0] = (base_gf->multiply.w64(base_gf, a1, b0) ^
                base_gf->multiply.w64(base_gf, a0, b1) ^
                base_gf->multiply.w64(base_gf, a1b1, h->prim_poly));
gf_w128_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)

  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  gf_val_64_t val0 = val[1];
  gf_val_64_t val1 = val[0];
  uint8_t *slow, *shigh;
  uint8_t *dlow, *dhigh, *top;

  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 64);
  gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);

  slow = (uint8_t *) rd.s_start;
  dlow = (uint8_t *) rd.d_start;
  top = (uint8_t*) rd.d_top;
  sub_reg_size = (top - dlow)/2;
  shigh = slow + sub_reg_size;
  dhigh = dlow + sub_reg_size;
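
  /*
   * In the ALTMAP layout the low 64-bit coordinates of all words occupy the
   * first half of the region and the high coordinates the second half, so
   * the composite product is formed with five base-field region multiplies
   * (the last one contributing the s*a1*b1 term).
   */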
  base_gf->multiply_region.w64(base_gf, slow, dlow, val0, sub_reg_size, xor);
  base_gf->multiply_region.w64(base_gf, shigh, dlow, val1, sub_reg_size, 1);
  base_gf->multiply_region.w64(base_gf, slow, dhigh, val1, sub_reg_size, xor);
  base_gf->multiply_region.w64(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
  base_gf->multiply_region.w64(base_gf, shigh, dhigh, base_gf->multiply.w64(base_gf, h->prim_poly, val1), sub_reg_size, 1);

  gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
int gf_w128_composite_init(gf_t *gf)

  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  if (h->region_type & GF_REGION_ALTMAP) {
    SET_FUNCTION(gf,multiply_region,w128,gf_w128_composite_multiply_region_alt)

    SET_FUNCTION(gf,multiply_region,w128,gf_w128_composite_multiply_region)

  SET_FUNCTION(gf,multiply,w128,gf_w128_composite_multiply)
  SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse)
  SET_FUNCTION(gf,inverse,w128,gf_w128_composite_inverse)

int gf_w128_cfm_init(gf_t *gf)

#if defined(INTEL_SSE4_PCLMUL)
  if (gf_cpu_supports_intel_pclmul) {
    SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
    SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
    SET_FUNCTION(gf,multiply_region,w128,gf_w128_clm_multiply_region_from_single)

int gf_w128_shift_init(gf_t *gf)

  SET_FUNCTION(gf,multiply,w128,gf_w128_shift_multiply)
  SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
  SET_FUNCTION(gf,multiply_region,w128,gf_w128_multiply_region_from_single)

int gf_w128_bytwo_init(gf_t *gf)

  h = (gf_internal_t *) gf->scratch;

  if (h->mult_type == GF_MULT_BYTWO_p) {
    SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply)
    /*SET_FUNCTION(gf,multiply,w128,gf_w128_sse_bytwo_p_multiply)*/
    /* John: the sse function is slower.*/

    SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_b_multiply)
    /*SET_FUNCTION(gf,multiply,w128,gf_w128_sse_bytwo_b_multiply)
      Ben: This sse function is also slower. */

  SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
  SET_FUNCTION(gf,multiply_region,w128,gf_w128_bytwo_b_multiply_region)
/*
 * Because the prim poly is only 8 bits and we are limiting g_r to 16, I do not need the high 64
 * bits in all of these numbers.
 */
void gf_w128_group_r_init(gf_t *gf)

  gf_internal_t *scratch;
  gf_group_tables_t *gt;
  scratch = (gf_internal_t *) gf->scratch;
  gt = scratch->private;
  g_r = scratch->arg2;
  pp = scratch->prim_poly;
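
  /*
   * r_table[i] is the XOR of (pp << j) over every set bit j of i, so the
   * bits shifted past x^127 during group multiplication can be reduced with
   * a single table lookup of up to g_r bits at a time.
   */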
  for (i = 1; i < (1 << g_r); i++) {

    for (j = 0; j < g_r; j++) {

        gt->r_table[i] ^= (pp << j);

#if 0 // defined(INTEL_SSE4)

void gf_w128_group_r_sse_init(gf_t *gf)

  gf_internal_t *scratch;
  gf_group_tables_t *gt;
  scratch = (gf_internal_t *) gf->scratch;
  gt = scratch->private;
  __m128i zero = _mm_setzero_si128();
  __m128i *table = (__m128i *)(gt->r_table);
  g_r = scratch->arg2;
  pp = scratch->prim_poly;

  for (i = 1; i < (1 << g_r); i++) {

    for (j = 0; j < g_r; j++) {

        table[i] = _mm_xor_si128(table[i], _mm_insert_epi64(zero, pp << j, 0));
int gf_w128_split_init(gf_t *gf)

  struct gf_w128_split_4_128_data *sd4;
  struct gf_w128_split_8_128_data *sd8;

  h = (gf_internal_t *) gf->scratch;

  SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply)
#if defined(INTEL_SSE4_PCLMUL)
  if (gf_cpu_supports_intel_pclmul && !(h->region_type & GF_REGION_NOSIMD)){
    SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)

  SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)

  if ((h->arg1 != 4 && h->arg2 != 4) || h->mult_type == GF_MULT_DEFAULT) {
    sd8 = (struct gf_w128_split_8_128_data *) h->private;
    sd8->last_value[0] = 0;
    sd8->last_value[1] = 0;
    SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_8_128_multiply_region)

    sd4 = (struct gf_w128_split_4_128_data *) h->private;
    sd4->last_value[0] = 0;
    sd4->last_value[1] = 0;
    if((h->region_type & GF_REGION_ALTMAP))

      if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
        SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_altmap_multiply_region)

      if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
        SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_multiply_region)

        SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
int gf_w128_group_init(gf_t *gf)

  gf_internal_t *scratch;
  gf_group_tables_t *gt;

  scratch = (gf_internal_t *) gf->scratch;
  gt = scratch->private;
  g_r = scratch->arg2;
  size_r = (1 << g_r);

  gt->r_table = (gf_val_128_t)((uint8_t *)scratch->private + (2 * sizeof(uint64_t *)));
  gt->m_table = gt->r_table + size_r;

  SET_FUNCTION(gf,multiply,w128,gf_w128_group_multiply)
  SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
  SET_FUNCTION(gf,multiply_region,w128,gf_w128_group_multiply_region)

  gf_w128_group_r_init(gf);
void gf_w128_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)

  s = (gf_val_128_t) start;

static void gf_w128_split_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)

  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 256);
  r64 = (uint64_t *) start;
  if ((r64 + index*2 < (uint64_t *) rd.d_start) ||
      (r64 + index*2 >= (uint64_t *) rd.d_top)) {
    memcpy(rv, r64+(index*2), 16);

  index -= (((uint64_t *) rd.d_start) - r64)/2;
  r64 = (uint64_t *) rd.d_start;

  r8 = (uint8_t *) r64;

  for (i = 0; i < 8; i++) {
    rv[1] |= (tmp << (i*8));

  for (i = 0; i < 8; i++) {
    rv[0] |= (tmp << (i*8));
void gf_w128_composite_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)

  h = (gf_internal_t *) gf->scratch;
  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64);
  r64 = (uint64_t *) start;
  if ((r64 + index*2 < (uint64_t *) rd.d_start) ||
      (r64 + index*2 >= (uint64_t *) rd.d_top)) {
    memcpy(rv, r64+(index*2), 16);

  index -= (((uint64_t *) rd.d_start) - r64)/2;
  r8 = (uint8_t *) rd.d_start;
  top = (uint8_t *) rd.d_top;
  sub_size = (top-r8)/2;

  rv[1] = h->base_gf->extract_word.w64(h->base_gf, r8, sub_size, index);
  rv[0] = h->base_gf->extract_word.w64(h->base_gf, r8+sub_size, sub_size, index);
int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)

  if (divide_type==GF_DIVIDE_MATRIX) return 0;

    case GF_MULT_CARRY_FREE:
      return sizeof(gf_internal_t);

      return sizeof(gf_internal_t);

    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:
      return sizeof(gf_internal_t);

    case GF_MULT_DEFAULT:
    case GF_MULT_SPLIT_TABLE:
      if ((arg1 == 4 && arg2 == 128) || (arg1 == 128 && arg2 == 4)) {
        return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_4_128_data) + 64;
      } else if ((arg1 == 8 && arg2 == 128) || (arg1 == 128 && arg2 == 8) || mult_type == GF_MULT_DEFAULT) {
        return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_8_128_data) + 64;

      /* JSP We've already error checked the arguments. */
      size_m = (1 << arg1) * 2 * sizeof(uint64_t);
      size_r = (1 << arg2) * 2 * sizeof(uint64_t);
      /*
       * Two pointers precede the table data in the structure
       * because the tables are dynamically sized.
       */
      return sizeof(gf_internal_t) + size_m + size_r + 4 * sizeof(uint64_t *);

    case GF_MULT_COMPOSITE:
      return sizeof(gf_internal_t) + 4;
int gf_w128_init(gf_t *gf)

  h = (gf_internal_t *) gf->scratch;

  /* Allen: set default primitive polynomial / irreducible polynomial if needed */

  if (h->prim_poly == 0) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
      if (h->prim_poly == 0) return 0; /* This shouldn't happen */

      h->prim_poly = 0x87; /* Omitting the leftmost 1 as in w=32 */
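      /* 0x87 is x^7 + x^2 + x + 1; with the implicit leading term the field
         polynomial is x^128 + x^7 + x^2 + x + 1. */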
  SET_FUNCTION(gf,multiply,w128,NULL)
  SET_FUNCTION(gf,divide,w128,NULL)
  SET_FUNCTION(gf,inverse,w128,NULL)
  SET_FUNCTION(gf,multiply_region,w128,NULL)

  switch(h->mult_type) {
    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b: if (gf_w128_bytwo_init(gf) == 0) return 0; break;
    case GF_MULT_CARRY_FREE: if (gf_w128_cfm_init(gf) == 0) return 0; break;
    case GF_MULT_SHIFT: if (gf_w128_shift_init(gf) == 0) return 0; break;
    case GF_MULT_GROUP: if (gf_w128_group_init(gf) == 0) return 0; break;
    case GF_MULT_DEFAULT:
    case GF_MULT_SPLIT_TABLE: if (gf_w128_split_init(gf) == 0) return 0; break;
    case GF_MULT_COMPOSITE: if (gf_w128_composite_init(gf) == 0) return 0; break;

  /* Ben: Used to be h->region_type == GF_REGION_ALTMAP, but failed since there
     are multiple flags in h->region_type */
  if (h->mult_type == GF_MULT_SPLIT_TABLE && (h->region_type & GF_REGION_ALTMAP)) {
    SET_FUNCTION(gf,extract_word,w128,gf_w128_split_extract_word)
  } else if (h->mult_type == GF_MULT_COMPOSITE && h->region_type == GF_REGION_ALTMAP) {
    SET_FUNCTION(gf,extract_word,w128,gf_w128_composite_extract_word)

    SET_FUNCTION(gf,extract_word,w128,gf_w128_extract_word)

  if (h->divide_type == GF_DIVIDE_EUCLID) {
    SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse)

  if (gf->inverse.w128 != NULL && gf->divide.w128 == NULL) {
    SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse)

  if (gf->inverse.w128 == NULL && gf->divide.w128 != NULL) {
    SET_FUNCTION(gf,inverse,w128,gf_w128_inverse_from_divide)