2 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
3 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
4 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
8 * Routines for 64-bit Galois fields
19 gf_val_64_t gf_w64_inverse_from_divide (gf_t *gf, gf_val_64_t a)
21 return gf->divide.w64(gf, 1, a);
24 #define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }
28 gf_val_64_t gf_w64_divide_from_inverse (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
30 b = gf->inverse.w64(gf, b);
31 return gf->multiply.w64(gf, a, b);
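/* Region-operation convention used throughout this file: every
   multiply_region.w64 routine computes dest[i] = val * src[i] over the region
   when xor is 0, and dest[i] ^= val * src[i] when xor is nonzero; the
   degenerate multipliers val == 0 and val == 1 are short-circuited through
   gf_multby_zero() and gf_multby_one(). */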
36 gf_w64_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
43 s64 = (gf_val_64_t *) src;
44 d64 = (gf_val_64_t *) dest;
46 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
47 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
50 for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) {
51 d64[i] ^= gf->multiply.w64(gf, val, s64[i]);
54 for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) {
55 d64[i] = gf->multiply.w64(gf, val, s64[i]);
60 #if defined(INTEL_SSE4_PCLMUL)
63 gf_w64_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
66 gf_val_64_t *s64, *d64, *top;
74 gf_internal_t * h = gf->scratch;
76 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
77 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
79 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
80 gf_do_initial_region_alignment(&rd);
82 prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
83 b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0);
84 m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff);
85 m3 = _mm_slli_si128(m1, 8);
86 m4 = _mm_slli_si128(m3, 4);
88 s64 = (gf_val_64_t *) rd.s_start;
89 d64 = (gf_val_64_t *) rd.d_start;
90 top = (gf_val_64_t *) rd.d_top;
94 a = _mm_load_si128((__m128i *) s64);
95 result = _mm_clmulepi64_si128 (a, b, 1);
97 w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
98 result = _mm_xor_si128 (result, w);
99 w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
100 r1 = _mm_xor_si128 (result, w);
102 result = _mm_clmulepi64_si128 (a, b, 0);
104 w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
105 result = _mm_xor_si128 (result, w);
107 w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
108 result = _mm_xor_si128 (result, w);
110 result = _mm_unpacklo_epi64(result, r1);
112 r1 = _mm_load_si128((__m128i *) d64);
113 result = _mm_xor_si128(r1, result);
114 _mm_store_si128((__m128i *) d64, result);
121 a = _mm_load_si128((__m128i *) s64);
122 result = _mm_clmulepi64_si128 (a, b, 1);
124 w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
125 result = _mm_xor_si128 (result, w);
126 w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
127 r1 = _mm_xor_si128 (result, w);
129 result = _mm_clmulepi64_si128 (a, b, 0);
131 w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
132 result = _mm_xor_si128 (result, w);
133 w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
134 result = _mm_xor_si128 (result, w);
136 result = _mm_unpacklo_epi64(result, r1);
138 _mm_store_si128((__m128i *) d64, result);
143 gf_do_final_region_alignment(&rd);
147 #if defined(INTEL_SSE4_PCLMUL)
150 gf_w64_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
153 gf_val_64_t *s64, *d64, *top;
161 gf_internal_t * h = gf->scratch;
163 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
164 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
166 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
167 gf_do_initial_region_alignment(&rd);
169 prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
170 b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0);
171 m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff);
172 m3 = _mm_slli_si128(m1, 8);
173 m4 = _mm_slli_si128(m3, 4);
175 s64 = (gf_val_64_t *) rd.s_start;
176 d64 = (gf_val_64_t *) rd.d_start;
177 top = (gf_val_64_t *) rd.d_top;
181 a = _mm_load_si128((__m128i *) s64);
182 result = _mm_clmulepi64_si128 (a, b, 1);
184 w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
185 result = _mm_xor_si128 (result, w);
186 w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
187 r1 = _mm_xor_si128 (result, w);
189 result = _mm_clmulepi64_si128 (a, b, 0);
191 w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
192 result = _mm_xor_si128 (result, w);
194 w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
195 result = _mm_xor_si128 (result, w);
197 result = _mm_unpacklo_epi64(result, r1);
199 r1 = _mm_load_si128((__m128i *) d64);
200 result = _mm_xor_si128(r1, result);
201 _mm_store_si128((__m128i *) d64, result);
207 a = _mm_load_si128((__m128i *) s64);
208 result = _mm_clmulepi64_si128 (a, b, 1);
210 w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
211 result = _mm_xor_si128 (result, w);
212 w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
213 r1 = _mm_xor_si128 (result, w);
215 result = _mm_clmulepi64_si128 (a, b, 0);
217 w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
218 result = _mm_xor_si128 (result, w);
219 w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
220 result = _mm_xor_si128 (result, w);
222 result = _mm_unpacklo_epi64(result, r1);
224 _mm_store_si128((__m128i *) d64, result);
229 gf_do_final_region_alignment(&rd);
235 gf_val_64_t gf_w64_euclid (gf_t *gf, gf_val_64_t b)
237 gf_val_64_t e_i, e_im1, e_ip1;
238 gf_val_64_t d_i, d_im1, d_ip1;
239 gf_val_64_t y_i, y_im1, y_ip1;
243 if (b == 0) return -1;
244 e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
247 for (d_i = d_im1-1; ((one << d_i) & e_i) == 0; d_i--) ;
257 while (d_ip1 >= d_i) {
258 c_i ^= (one << (d_ip1 - d_i));
259 e_ip1 ^= (e_i << (d_ip1 - d_i));
261 if (e_ip1 == 0) return 0;
262 while ((e_ip1 & (one << d_ip1)) == 0) d_ip1--;
265 y_ip1 = y_im1 ^ gf->multiply.w64(gf, c_i, y_i);
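/* Illustrative sketch (not part of the library): the extended Euclidean
   scheme used by gf_w64_euclid, written out for GF(2^8) with 16-bit
   temporaries so the degree bookkeeping is easy to follow.  poly includes
   the leading bit (e.g. 0x11d = x^8+x^4+x^3+x^2+1). */
static uint8_t gf8_euclid_inverse_sketch(uint16_t poly, uint8_t b)
{
  uint16_t e_im1, e_i, e_ip1, c_i;
  uint8_t  y_im1, y_i, y_ip1;
  int d_im1, d_i, d_ip1, k;
  uint32_t prod, cc;

  if (b == 0) return 0;                      /* zero has no inverse */

  e_im1 = poly;  e_i = b;
  y_im1 = 0;     y_i = 1;
  d_im1 = 8;
  for (d_i = 7; !((e_i >> d_i) & 1); d_i--) ;

  while (e_i != 1) {
    e_ip1 = e_im1;
    d_ip1 = d_im1;
    c_i = 0;

    while (d_ip1 >= d_i) {                   /* polynomial long division */
      c_i  ^= (uint16_t)(1 << (d_ip1 - d_i));
      e_ip1 ^= (uint16_t)(e_i << (d_ip1 - d_i));
      while (d_ip1 >= 0 && !((e_ip1 >> d_ip1) & 1)) d_ip1--;
    }
    if (e_ip1 == 0) return 0;                /* not coprime; cannot happen for an irreducible poly */

    /* y_ip1 = y_im1 + c_i * y_i, with the product formed carrylessly and
       then reduced mod poly. */
    prod = 0;
    for (k = 0, cc = c_i; cc; k++, cc >>= 1)
      if (cc & 1) prod ^= (uint32_t)y_i << k;
    for (k = 15; k >= 8; k--)
      if ((prod >> k) & 1) prod ^= (uint32_t)poly << (k - 8);
    y_ip1 = (uint8_t)(y_im1 ^ prod);

    y_im1 = y_i;   y_i = y_ip1;
    e_im1 = e_i;   e_i = e_ip1;
    d_im1 = d_i;   d_i = d_ip1;
  }
  return y_i;                                /* y_i * b == 1 (mod poly) */
}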
278 /* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only
279 include it for completeness. It does have the feature that it requires no extra memory. */
286 gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
288 uint64_t pl, pr, ppl, ppr, i, a, bl, br, one, lbit;
291 h = (gf_internal_t *) gf->scratch;
293 /* Allen: set leading one of primitive polynomial */
301 pl = 0; /* Allen: left side of product */
302 pr = 0; /* Allen: right side of product */
304 /* Allen: unlike the corresponding functions for smaller word sizes,
305 * this loop carries out the initial carryless multiply by
306 * shifting b itself rather than simply looking at successively
307 * higher shifts of b */
309 for (i = 0; i < GF_FIELD_WIDTH; i++) {
310 if (a & (one << i)) {
316 if (br & lbit) bl ^= 1;
320 /* Allen: the name of the variable "one" is no longer descriptive at this point */
323 ppl = (h->prim_poly >> 2) | one;
324 ppr = (h->prim_poly << (GF_FIELD_WIDTH-2));
332 if (ppl & 1) ppr ^= lbit;
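/* Illustrative sketch (not part of the library): the same shift/XOR scheme at
   w=8, where the 2w-bit product fits in a single integer.  poly includes the
   leading bit (e.g. 0x11d); the 64-bit routine above has to juggle the
   (pl, pr) pair because its 128-bit product does not fit in one word. */
static uint8_t gf8_shift_multiply_sketch(uint16_t poly, uint8_t a, uint8_t b)
{
  uint16_t prod = 0;
  int i;

  for (i = 0; i < 8; i++)                 /* carryless multiply */
    if (a & (1 << i)) prod ^= (uint16_t)b << i;

  for (i = 15; i >= 8; i--)               /* reduce mod poly */
    if (prod & (1 << i)) prod ^= poly << (i - 8);

  return (uint8_t)prod;
}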
339 * ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply.
342 #if defined(INTEL_SSE4_PCLMUL)
347 gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
355 gf_internal_t * h = gf->scratch;
357 a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0);
358 b = _mm_insert_epi64 (a, b64, 0);
359 prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
360 /* Do the initial multiply */
362 result = _mm_clmulepi64_si128 (a, b, 0);
364 /* Mask off the high order 32 bits using subtraction of the polynomial.
365 * NOTE: this part requires that the polynomial have at least 32 leading 0 bits.
368 /* Adam: We can't include the leading one in the 64-bit pclmul,
369 so we need to split up the high 8 bytes of the result into two
370 parts before we multiply them with the prim_poly. */
372 v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
373 w = _mm_clmulepi64_si128 (prim_poly, v, 0);
374 result = _mm_xor_si128 (result, w);
375 v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
376 w = _mm_clmulepi64_si128 (prim_poly, v, 0);
377 result = _mm_xor_si128 (result, w);
379 rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
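/* Illustrative sketch (not part of the library): the reduction idea behind the
   two pclmul/xor pairs above, shown carrylessly at w=8.  Because
   x^8 == poly_low (mod the full polynomial), the high byte of the 16-bit
   carryless product is folded back down one half at a time; the leftover high
   bits are never read, only the low byte.  Needing the low part of the
   polynomial to be "small enough" here is the w=8 analog of the requirement
   above that prim_poly have at least 32 leading 0 bits. */
static uint16_t gf8_clmul_sketch(uint8_t a, uint8_t b)   /* 8x8 -> 16, carryless */
{
  uint16_t r = 0;
  int i;
  for (i = 0; i < 8; i++)
    if (a & (1 << i)) r ^= (uint16_t)b << i;
  return r;
}

static uint8_t gf8_clm_style_multiply_sketch(uint8_t poly_low, uint8_t a, uint8_t b)
{
  /* poly_low is the primitive polynomial with the x^8 term dropped
     (e.g. 0x1d); it is assumed to have degree <= 4. */
  uint16_t result = gf8_clmul_sketch(a, b);
  uint8_t  high;

  high    = (uint8_t)(result >> 8) & 0xf0;        /* top nibble of the high byte */
  result ^= gf8_clmul_sketch(high, poly_low);     /* fold: x^8 == poly_low */

  high    = (uint8_t)(result >> 8) & 0x0f;        /* low nibble of the updated high byte */
  result ^= gf8_clmul_sketch(high, poly_low);

  return (uint8_t)result;                         /* high bits are discarded */
}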
384 #if defined(INTEL_SSE4_PCLMUL)
389 gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
397 gf_internal_t * h = gf->scratch;
399 a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0);
400 b = _mm_insert_epi64 (a, b64, 0);
401 prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
403 /* Do the initial multiply */
405 result = _mm_clmulepi64_si128 (a, b, 0);
407 v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
408 w = _mm_clmulepi64_si128 (prim_poly, v, 0);
409 result = _mm_xor_si128 (result, w);
410 v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
411 w = _mm_clmulepi64_si128 (prim_poly, v, 0);
412 result = _mm_xor_si128 (result, w);
414 v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
415 w = _mm_clmulepi64_si128 (prim_poly, v, 0);
416 result = _mm_xor_si128 (result, w);
417 v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
418 w = _mm_clmulepi64_si128 (prim_poly, v, 0);
419 result = _mm_xor_si128 (result, w);
421 rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
427 #if defined(INTEL_SSE4_PCLMUL)
429 gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
432 uint8_t *s8, *d8, *dtop;
434 __m128i v, b, m, prim_poly, c, fr, w, result;
436 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
437 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
439 h = (gf_internal_t *) gf->scratch;
441 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
442 gf_do_initial_region_alignment(&rd);
444 s8 = (uint8_t *) rd.s_start;
445 d8 = (uint8_t *) rd.d_start;
446 dtop = (uint8_t *) rd.d_top;
448 v = _mm_insert_epi64(_mm_setzero_si128(), val, 0);
449 m = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
450 prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
454 b = _mm_load_si128((__m128i *) s8);
455 result = _mm_clmulepi64_si128 (b, v, 0);
456 c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
457 w = _mm_clmulepi64_si128 (prim_poly, c, 0);
458 result = _mm_xor_si128 (result, w);
459 c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
460 w = _mm_clmulepi64_si128 (prim_poly, c, 0);
461 fr = _mm_xor_si128 (result, w);
462 fr = _mm_and_si128 (fr, m);
464 result = _mm_clmulepi64_si128 (b, v, 1);
465 c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
466 w = _mm_clmulepi64_si128 (prim_poly, c, 0);
467 result = _mm_xor_si128 (result, w);
468 c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
469 w = _mm_clmulepi64_si128 (prim_poly, c, 0);
470 result = _mm_xor_si128 (result, w);
471 result = _mm_slli_si128 (result, 8);
472 fr = _mm_xor_si128 (result, fr);
473 result = _mm_load_si128((__m128i *) d8);
474 fr = _mm_xor_si128 (result, fr);
476 _mm_store_si128((__m128i *) d8, fr);
482 b = _mm_load_si128((__m128i *) s8);
483 result = _mm_clmulepi64_si128 (b, v, 0);
484 c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
485 w = _mm_clmulepi64_si128 (prim_poly, c, 0);
486 result = _mm_xor_si128 (result, w);
487 c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
488 w = _mm_clmulepi64_si128 (prim_poly, c, 0);
489 fr = _mm_xor_si128 (result, w);
490 fr = _mm_and_si128 (fr, m);
492 result = _mm_clmulepi64_si128 (b, v, 1);
493 c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
494 w = _mm_clmulepi64_si128 (prim_poly, c, 0);
495 result = _mm_xor_si128 (result, w);
496 c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
497 w = _mm_clmulepi64_si128 (prim_poly, c, 0);
498 result = _mm_xor_si128 (result, w);
499 result = _mm_slli_si128 (result, 8);
500 fr = _mm_xor_si128 (result, fr);
502 _mm_store_si128((__m128i *) d8, fr);
507 gf_do_final_region_alignment(&rd);
512 gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
515 struct gf_split_4_64_lazy_data *ld;
517 uint64_t pp, v, s, *s64, *d64, *top;
520 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
521 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
523 h = (gf_internal_t *) gf->scratch;
526 ld = (struct gf_split_4_64_lazy_data *) h->private;
528 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
529 gf_do_initial_region_alignment(&rd);
531 if (ld->last_value != val) {
533 for (i = 0; i < 16; i++) {
534 ld->tables[i][0] = 0;
535 for (j = 1; j < 16; j <<= 1) {
536 for (k = 0; k < j; k++) {
537 ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
539 v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
543 ld->last_value = val;
545 s64 = (uint64_t *) rd.s_start;
546 d64 = (uint64_t *) rd.d_start;
547 top = (uint64_t *) rd.d_top;
550 v = (xor) ? *d64 : 0;
554 v ^= ld->tables[i][s&0xf];
562 gf_do_final_region_alignment(&rd);
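/* Illustrative sketch (not part of the library): the "split" lazy-table idea
   above, shown for GF(2^8) with 4-bit chunks.  For a fixed val, one 16-entry
   table is built per nibble position of the source word (the same doubling
   pattern as the loop above); a region multiply then needs only table lookups
   and XORs per word.  poly_low is the polynomial with the leading bit dropped
   (e.g. 0x1d). */
static void gf8_split_4_8_region_sketch(uint8_t val, uint8_t poly_low,
                                        const uint8_t *src, uint8_t *dest, int n)
{
  uint8_t tab[2][16];      /* two nibble positions in an 8-bit word */
  uint8_t v;
  int i, j, k;

  v = val;                 /* v walks through val * x^(4i + bit) */
  for (i = 0; i < 2; i++) {
    tab[i][0] = 0;
    for (j = 1; j < 16; j <<= 1) {
      for (k = 0; k < j; k++) tab[i][k ^ j] = v ^ tab[i][k];
      v = (v & 0x80) ? (uint8_t)((v << 1) ^ poly_low) : (uint8_t)(v << 1);
    }
  }

  for (i = 0; i < n; i++)
    dest[i] = tab[0][src[i] & 0xf] ^ tab[1][src[i] >> 4];
}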
568 gf_w64_split_8_8_multiply (gf_t *gf, uint64_t a64, uint64_t b64)
570 uint64_t product, i, j, mask, tb;
572 struct gf_split_8_8_data *d8;
574 h = (gf_internal_t *) gf->scratch;
575 d8 = (struct gf_split_8_8_data *) h->private;
579 for (i = 0; a64 != 0; i++) {
581 for (j = 0; tb != 0; j++) {
582 product ^= d8->tables[i+j][a64&mask][tb&mask];
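/* The index i+j works because byte i of a64 and byte j of b contribute
   (a_i * x^(8i)) * (b_j * x^(8j)) = (a_i * b_j) * x^(8(i+j)) to the product;
   tables[e][u][v] holds the fully reduced field product u * v * x^(8e), so a
   single lookup per byte pair suffices (the tables are built in
   gf_w64_split_init below). */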
591 gf_w64_split_8_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
594 struct gf_split_8_64_lazy_data *ld;
596 uint64_t pp, v, s, *s64, *d64, *top;
599 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
600 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
602 h = (gf_internal_t *) gf->scratch;
605 ld = (struct gf_split_8_64_lazy_data *) h->private;
607 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
608 gf_do_initial_region_alignment(&rd);
610 if (ld->last_value != val) {
612 for (i = 0; i < 8; i++) {
613 ld->tables[i][0] = 0;
614 for (j = 1; j < 256; j <<= 1) {
615 for (k = 0; k < j; k++) {
616 ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
618 v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
622 ld->last_value = val;
624 s64 = (uint64_t *) rd.s_start;
625 d64 = (uint64_t *) rd.d_start;
626 top = (uint64_t *) rd.d_top;
629 v = (xor) ? *d64 : 0;
633 v ^= ld->tables[i][s&0xff];
641 gf_do_final_region_alignment(&rd);
645 gf_w64_split_16_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
648 struct gf_split_16_64_lazy_data *ld;
650 uint64_t pp, v, s, *s64, *d64, *top;
653 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
654 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
656 h = (gf_internal_t *) gf->scratch;
659 ld = (struct gf_split_16_64_lazy_data *) h->private;
661 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
662 gf_do_initial_region_alignment(&rd);
664 if (ld->last_value != val) {
666 for (i = 0; i < 4; i++) {
667 ld->tables[i][0] = 0;
668 for (j = 1; j < (1<<16); j <<= 1) {
669 for (k = 0; k < j; k++) {
670 ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
672 v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
676 ld->last_value = val;
678 s64 = (uint64_t *) rd.s_start;
679 d64 = (uint64_t *) rd.d_start;
680 top = (uint64_t *) rd.d_top;
683 v = (xor) ? *d64 : 0;
687 v ^= ld->tables[i][s&0xffff];
695 gf_do_final_region_alignment(&rd);
699 int gf_w64_shift_init(gf_t *gf)
701 SET_FUNCTION(gf,multiply,w64,gf_w64_shift_multiply)
702 SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
703 SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single)
708 int gf_w64_cfm_init(gf_t *gf)
710 SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
711 SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single)
713 #if defined(INTEL_SSE4_PCLMUL)
714 if (gf_cpu_supports_intel_pclmul) {
717 h = (gf_internal_t *) gf->scratch;
719 if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
720 SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
721 SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
722 }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
723 SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
724 SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
737 gf_w64_group_set_shift_tables(uint64_t *shift, uint64_t val, gf_internal_t *h)
747 for (i = 1; i < ((uint64_t)1 << g_s); i <<= 1) {
748 for (j = 0; j < i; j++) shift[i|j] = shift[j]^val;
749 if (val & (one << 63)) {
761 gf_w64_group_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
763 uint64_t top, bot, mask, tp;
764 int g_s, g_r, lshift, rshift;
765 struct gf_w64_group_data *gd;
767 gf_internal_t *h = (gf_internal_t *) gf->scratch;
770 gd = (struct gf_w64_group_data *) h->private;
771 gf_w64_group_set_shift_tables(gd->shift, b, h);
773 mask = (((uint64_t)1 << g_s) - 1);
775 bot = gd->shift[a&mask];
778 if (a == 0) return bot;
782 do { /* Shifting out is straightforward */
785 tp = gd->shift[a&mask];
786 top ^= (tp >> rshift);
787 bot ^= (tp << lshift);
791 /* Reducing is a bit gross, because I don't zero out the index bits of top.
792 The reason is that we throw top away. Even better, that last (tp >> rshift)
793 is going to be ignored, so it doesn't matter how (tp >> 64) is implemented. */
795 lshift = ((lshift-1) / g_r) * g_r;
796 rshift = 64 - lshift;
797 mask = ((uint64_t)1 << g_r) - 1;
798 while (lshift >= 0) {
799 tp = gd->reduce[(top >> lshift) & mask];
800 top ^= (tp >> rshift);
801 bot ^= (tp << lshift);
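/* Illustrative sketch (not part of the library): the shift-table idea behind
   GROUP multiplication, shown as a plain carryless multiply of two 16-bit
   values with g_s = 4.  tab[x] = x * b (carryless) for every 4-bit x, built
   with a simplified version of the doubling pattern in
   gf_w64_group_set_shift_tables; the product is then assembled four bits of a
   at a time.  The real routine additionally runs the reduce-table pass above
   to bring the 128-bit product back into the field. */
static uint32_t group4_clmul16_sketch(uint16_t a, uint16_t b)
{
  uint32_t tab[16];
  uint32_t prod = 0;
  uint32_t v = b;
  int i, j;

  tab[0] = 0;
  for (i = 1; i < 16; i <<= 1) {        /* tab[x] = carryless x*b */
    for (j = 0; j < i; j++) tab[i | j] = tab[j] ^ v;
    v <<= 1;
  }

  for (i = 12; i >= 0; i -= 4)          /* consume a in 4-bit groups, high first */
    prod = (prod << 4) ^ tab[(a >> i) & 0xf];

  return prod;
}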
810 void gf_w64_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
813 uint64_t a64, smask, rmask, top, bot, tp;
814 int lshift, rshift, g_s, g_r;
816 uint64_t *s64, *d64, *dtop;
817 struct gf_w64_group_data *gd;
818 gf_internal_t *h = (gf_internal_t *) gf->scratch;
820 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
821 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
823 gd = (struct gf_w64_group_data *) h->private;
826 gf_w64_group_set_shift_tables(gd->shift, val, h);
828 for (i = 63; !(val & (1ULL << i)); i--) ;
831 /* i is the bit position of the first zero bit in any element of
838 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
840 gf_do_initial_region_alignment(&rd);
842 s64 = (uint64_t *) rd.s_start;
843 d64 = (uint64_t *) rd.d_start;
844 dtop = (uint64_t *) rd.d_top;
846 smask = ((uint64_t)1 << g_s) - 1;
847 rmask = ((uint64_t)1 << g_r) - 1;
853 bot = gd->shift[a64&smask];
864 tp = gd->shift[a64&smask];
865 top ^= (tp >> rshift);
866 bot ^= (tp << lshift);
871 lshift = ((i-64-1) / g_r) * g_r;
872 rshift = 64 - lshift;
873 while (lshift >= 0) {
874 tp = gd->reduce[(top >> lshift) & rmask];
875 top ^= (tp >> rshift);
876 bot ^= (tp << lshift);
882 if (xor) bot ^= *d64;
887 gf_do_final_region_alignment(&rd);
893 gf_w64_group_s_equals_r_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
896 uint64_t p, l, ind, a64;
900 struct gf_w64_group_data *gd;
901 gf_internal_t *h = (gf_internal_t *) gf->scratch;
904 gd = (struct gf_w64_group_data *) h->private;
905 gf_w64_group_set_shift_tables(gd->shift, b, h);
908 if (leftover == 0) leftover = g_s;
919 while (bits_left > 0) {
924 p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
930 void gf_w64_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
933 uint64_t p, l, ind, a64;
937 uint64_t *s64, *d64, *top;
938 struct gf_w64_group_data *gd;
939 gf_internal_t *h = (gf_internal_t *) gf->scratch;
941 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
942 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
944 gd = (struct gf_w64_group_data *) h->private;
946 gf_w64_group_set_shift_tables(gd->shift, val, h);
948 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
949 gf_do_initial_region_alignment(&rd);
951 s64 = (uint64_t *) rd.s_start;
952 d64 = (uint64_t *) rd.d_start;
953 top = (uint64_t *) rd.d_top;
956 if (leftover == 0) leftover = g_s;
968 while (bits_left > 0) {
973 p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
980 gf_do_final_region_alignment(&rd);
985 int gf_w64_group_init(gf_t *gf)
987 uint64_t i, j, p, index;
988 struct gf_w64_group_data *gd;
989 gf_internal_t *h = (gf_internal_t *) gf->scratch;
995 gd = (struct gf_w64_group_data *) h->private;
996 gd->shift = (uint64_t *) (&(gd->memory));
997 gd->reduce = gd->shift + (1 << g_s);
1000 for (i = 0; i < ((uint64_t)1 << g_r); i++) {
1003 for (j = 0; j < g_r; j++) {
1005 p ^= (h->prim_poly << j);
1007 if (j > 0) index ^= (h->prim_poly >> (64-j));
1010 gd->reduce[index] = p;
1014 SET_FUNCTION(gf,multiply,w64,gf_w64_group_s_equals_r_multiply)
1015 SET_FUNCTION(gf,multiply_region,w64,gf_w64_group_s_equals_r_multiply_region)
1017 SET_FUNCTION(gf,multiply,w64,gf_w64_group_multiply)
1018 SET_FUNCTION(gf,multiply_region,w64,gf_w64_group_multiply_region)
1020 SET_FUNCTION(gf,divide,w64,NULL)
1021 SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
1027 gf_val_64_t gf_w64_extract_word(gf_t *gf, void *start, int bytes, int index)
1031 r64 = (uint64_t *) start;
1037 gf_val_64_t gf_w64_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
1042 uint64_t a, b, *r64;
1045 h = (gf_internal_t *) gf->scratch;
1046 gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
1047 r64 = (uint64_t *) start;
1048 if (r64 + index < (uint64_t *) rd.d_start) return r64[index];
1049 if (r64 + index >= (uint64_t *) rd.d_top) return r64[index];
1050 index -= (((uint64_t *) rd.d_start) - r64);
1051 r8 = (uint8_t *) rd.d_start;
1052 top = (uint8_t *) rd.d_top;
1053 sub_size = (top-r8)/2;
1055 a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index);
1056 b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index);
1057 return (a | ((uint64_t)b << 32));
1061 gf_val_64_t gf_w64_split_extract_word(gf_t *gf, void *start, int bytes, int index)
1068 gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 128);
1069 r64 = (uint64_t *) start;
1070 if (r64 + index < (uint64_t *) rd.d_start) return r64[index];
1071 if (r64 + index >= (uint64_t *) rd.d_top) return r64[index];
1072 index -= (((uint64_t *) rd.d_start) - r64);
1073 r8 = (uint8_t *) rd.d_start;
1074 r8 += ((index & 0xfffffff0)*8);
1075 r8 += (index & 0xf);
1078 for (i = 0; i < 8; i++) {
1089 gf_w64_bytwo_b_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
1091 uint64_t prod, pp, bmask;
1094 h = (gf_internal_t *) gf->scratch;
1098 bmask = 0x8000000000000000ULL;
1101 if (a & 1) prod ^= b;
1103 if (a == 0) return prod;
1105 b = ((b << 1) ^ pp);
1115 gf_w64_bytwo_p_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
1117 uint64_t prod, pp, pmask, amask;
1120 h = (gf_internal_t *) gf->scratch;
1125 /* changed from declare then shift to just declare.*/
1127 pmask = 0x8000000000000000ULL;
1128 amask = 0x8000000000000000ULL;
1130 while (amask != 0) {
1132 prod = ((prod << 1) ^ pp);
1136 if (a & amask) prod ^= b;
1144 gf_w64_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
1146 uint64_t *s64, *d64, ta, prod, amask, pmask, pp;
1150 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1151 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1153 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
1154 gf_do_initial_region_alignment(&rd);
1156 h = (gf_internal_t *) gf->scratch;
1158 s64 = (uint64_t *) rd.s_start;
1159 d64 = (uint64_t *) rd.d_start;
1165 while (s64 < (uint64_t *) rd.s_top) {
1169 while (amask != 0) {
1170 prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1);
1171 if (val & amask) prod ^= ta;
1179 while (s64 < (uint64_t *) rd.s_top) {
1183 while (amask != 0) {
1184 prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1);
1185 if (val & amask) prod ^= ta;
1193 gf_do_final_region_alignment(&rd);
1198 gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
1200 uint64_t *s64, *d64, ta, tb, prod, bmask, pp;
1204 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1205 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1207 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
1208 gf_do_initial_region_alignment(&rd);
1210 h = (gf_internal_t *) gf->scratch;
1212 s64 = (uint64_t *) rd.s_start;
1213 d64 = (uint64_t *) rd.d_start;
1219 while (s64 < (uint64_t *) rd.s_top) {
1224 if (tb & 1) prod ^= ta;
1227 ta = (ta & bmask) ? ((ta << 1) ^ pp) : (ta << 1);
1234 while (s64 < (uint64_t *) rd.s_top) {
1239 if (tb & 1) prod ^= ta;
1242 ta = (ta & bmask) ? ((ta << 1) ^ pp) : (ta << 1);
1249 gf_do_final_region_alignment(&rd);
1252 #define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\
1253 t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
1254 t2 = _mm_and_si128(va, m2); \
1255 t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
1256 va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
1258 #define BYTWO_P_ONESTEP {\
1259 SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
1260 t1 = _mm_and_si128(v, one); \
1261 t1 = _mm_sub_epi64(t1, one); \
1262 t1 = _mm_and_si128(t1, ta); \
1263 prod = _mm_xor_si128(prod, t1); \
1264 v = _mm_srli_epi64(v, 1); }
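/* Illustrative sketch (not part of the library): what SSE_AB2 computes per
   64-bit lane, written as a scalar.  Isolating the top bit, shifting it left
   (dropping it) and subtracting it shifted down to position 0 yields either 0
   or an all-ones mask, so the primitive polynomial is XORed in exactly when
   the left shift overflows.  BYTWO_P_ONESTEP additionally turns the low bit
   of v into a 0/all-ones mask with the same subtract trick and conditionally
   XORs ta into prod. */
static uint64_t gf64_multiply_by_two_sketch(uint64_t prim_poly, uint64_t x)
{
  uint64_t overflow = (uint64_t)0 - (x >> 63);   /* 0 or all ones */
  return (x << 1) ^ (overflow & prim_poly);
}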
1268 void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
1272 uint64_t vrev, one64;
1274 __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;
1278 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1279 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1281 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
1282 gf_do_initial_region_alignment(&rd);
1284 h = (gf_internal_t *) gf->scratch;
1287 for (i = 0; i < 64; i++) {
1289 if (!(val & (one64 << i))) vrev |= 1;
1292 s8 = (uint8_t *) rd.s_start;
1293 d8 = (uint8_t *) rd.d_start;
1297 pp = _mm_set1_epi64x(h->prim_poly);
1298 m1 = _mm_set1_epi64x(amask);
1299 m2 = _mm_set1_epi64x(one64 << 63);
1300 one = _mm_set1_epi64x(1);
1302 while (d8 < (uint8_t *) rd.d_top) {
1303 prod = _mm_setzero_si128();
1304 v = _mm_set1_epi64x(vrev);
1305 ta = _mm_load_si128((__m128i *) s8);
1306 tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
1307 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1308 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1309 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1310 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1311 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1312 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1313 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1314 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1315 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1316 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1317 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1318 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1319 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1320 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1321 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1322 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1323 _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
1327 gf_do_final_region_alignment(&rd);
1334 gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd)
1336 uint64_t one64, amask;
1338 __m128i pp, m1, m2, t1, t2, va, vb;
1341 s8 = (uint8_t *) rd->s_start;
1342 d8 = (uint8_t *) rd->d_start;
1344 h = (gf_internal_t *) rd->gf->scratch;
1348 pp = _mm_set1_epi64x(h->prim_poly);
1349 m1 = _mm_set1_epi64x(amask);
1350 m2 = _mm_set1_epi64x(one64 << 63);
1352 while (d8 < (uint8_t *) rd->d_top) {
1353 va = _mm_load_si128 ((__m128i *)(s8));
1354 SSE_AB2(pp, m1, m2, va, t1, t2);
1355 vb = _mm_load_si128 ((__m128i *)(d8));
1356 vb = _mm_xor_si128(vb, va);
1357 _mm_store_si128((__m128i *)d8, vb);
1367 gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd)
1369 uint64_t one64, amask;
1371 __m128i pp, m1, m2, t1, t2, va;
1374 s8 = (uint8_t *) rd->s_start;
1375 d8 = (uint8_t *) rd->d_start;
1377 h = (gf_internal_t *) rd->gf->scratch;
1381 pp = _mm_set1_epi64x(h->prim_poly);
1382 m1 = _mm_set1_epi64x(amask);
1383 m2 = _mm_set1_epi64x(one64 << 63);
1385 while (d8 < (uint8_t *) rd->d_top) {
1386 va = _mm_load_si128 ((__m128i *)(s8));
1387 SSE_AB2(pp, m1, m2, va, t1, t2);
1388 _mm_store_si128((__m128i *)d8, va);
1398 gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
1400 uint64_t itb, amask, one64;
1402 __m128i pp, m1, m2, t1, t2, va, vb;
1406 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1407 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1409 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
1410 gf_do_initial_region_alignment(&rd);
1414 gf_w64_bytwo_b_sse_region_2_xor(&rd);
1416 gf_w64_bytwo_b_sse_region_2_noxor(&rd);
1418 gf_do_final_region_alignment(&rd);
1422 s8 = (uint8_t *) rd.s_start;
1423 d8 = (uint8_t *) rd.d_start;
1424 h = (gf_internal_t *) gf->scratch;
1429 pp = _mm_set1_epi64x(h->prim_poly);
1430 m1 = _mm_set1_epi64x(amask);
1431 m2 = _mm_set1_epi64x(one64 << 63);
1433 while (d8 < (uint8_t *) rd.d_top) {
1434 va = _mm_load_si128 ((__m128i *)(s8));
1435 vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8));
1438 if (itb & 1) vb = _mm_xor_si128(vb, va);
1440 if (itb == 0) break;
1441 SSE_AB2(pp, m1, m2, va, t1, t2);
1443 _mm_store_si128((__m128i *)d8, vb);
1448 gf_do_final_region_alignment(&rd);
1454 int gf_w64_bytwo_init(gf_t *gf)
1458 h = (gf_internal_t *) gf->scratch;
1460 if (h->mult_type == GF_MULT_BYTWO_p) {
1461 SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply)
1463 if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
1464 SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_sse_multiply_region)
1467 SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_p_nosse_multiply_region)
1468 if(h->region_type & GF_REGION_SIMD)
1474 SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_b_multiply)
1476 if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
1477 SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_sse_multiply_region)
1480 SET_FUNCTION(gf,multiply_region,w64,gf_w64_bytwo_b_nosse_multiply_region)
1481 if(h->region_type & GF_REGION_SIMD)
1487 SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
1494 gf_w64_composite_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
1496 gf_internal_t *h = (gf_internal_t *) gf->scratch;
1497 gf_t *base_gf = h->base_gf;
1498 uint32_t b0 = b & 0x00000000ffffffff;
1499 uint32_t b1 = (b & 0xffffffff00000000) >> 32;
1500 uint32_t a0 = a & 0x00000000ffffffff;
1501 uint32_t a1 = (a & 0xffffffff00000000) >> 32;
1504 a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
1506 return ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
1507 ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32));
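/* Illustrative sketch (not part of the library): the composite product above
   is (a1 x + a0)(b1 x + b0) reduced modulo x^2 + s x + 1 over the base field,
   where s is h->prim_poly and therefore x^2 == s x + 1:
       low  word = a0*b0 ^ a1*b1
       high word = a1*b0 ^ a0*b1 ^ s*(a1*b1)
   A generic version over any 32-bit base field, with the base multiply passed
   in, looks like this. */
typedef uint32_t (*base_mult_fn)(uint32_t, uint32_t);

static uint64_t composite_multiply_sketch(base_mult_fn mult, uint32_t s,
                                          uint64_t a, uint64_t b)
{
  uint32_t a0 = (uint32_t)a, a1 = (uint32_t)(a >> 32);
  uint32_t b0 = (uint32_t)b, b1 = (uint32_t)(b >> 32);
  uint32_t a1b1 = mult(a1, b1);
  uint32_t lo = mult(a0, b0) ^ a1b1;
  uint32_t hi = mult(a1, b0) ^ mult(a0, b1) ^ mult(a1b1, s);
  return (uint64_t)lo | ((uint64_t)hi << 32);
}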
1511 * Composite field division trick (explained in 2007 tech report)
1513 * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1
1517 * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0)
1519 * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1
1521 * let d = b1c1 and d+1 = b0c0
1523 * solve s*b1c1+b1c0+b0c1 = 0
1525 * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
1535 gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a)
1537 gf_internal_t *h = (gf_internal_t *) gf->scratch;
1538 gf_t *base_gf = h->base_gf;
1539 uint32_t a0 = a & 0x00000000ffffffff;
1540 uint32_t a1 = (a & 0xffffffff00000000) >> 32;
1541 uint32_t c0, c1, d, tmp;
1543 uint32_t a0inv, a1inv;
1546 a1inv = base_gf->inverse.w32(base_gf, a1);
1547 c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
1549 } else if (a1 == 0) {
1550 c0 = base_gf->inverse.w32(base_gf, a0);
1553 a1inv = base_gf->inverse.w32(base_gf, a1);
1554 a0inv = base_gf->inverse.w32(base_gf, a0);
1556 d = base_gf->multiply.w32(base_gf, a1, a0inv);
1558 tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly);
1559 tmp = base_gf->inverse.w32(base_gf, tmp);
1561 d = base_gf->multiply.w32(base_gf, d, tmp);
1563 c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv);
1564 c1 = base_gf->multiply.w32(base_gf, d, a1inv);
1567 c = c0 | ((uint64_t)c1 << 32);
1574 gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
1576 gf_internal_t *h = (gf_internal_t *) gf->scratch;
1577 gf_t *base_gf = h->base_gf;
1578 uint32_t b0 = val & 0x00000000ffffffff;
1579 uint32_t b1 = (val & 0xffffffff00000000) >> 32;
1580 uint64_t *s64, *d64;
1582 uint64_t a0, a1, a1b1;
1585 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1586 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
1594 a0 = *s64 & 0x00000000ffffffff;
1595 a1 = (*s64 & 0xffffffff00000000) >> 32;
1596 a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
1598 *d64 ^= ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
1599 ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32));
1605 a0 = *s64 & 0x00000000ffffffff;
1606 a1 = (*s64 & 0xffffffff00000000) >> 32;
1607 a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
1609 *d64 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
1610 ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32));
1619 gf_w64_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
1621 gf_internal_t *h = (gf_internal_t *) gf->scratch;
1622 gf_t *base_gf = h->base_gf;
1623 gf_val_32_t val0 = val & 0x00000000ffffffff;
1624 gf_val_32_t val1 = (val & 0xffffffff00000000) >> 32;
1625 uint8_t *slow, *shigh;
1626 uint8_t *dlow, *dhigh, *top;
1631 memset(dest, 0, bytes);
1634 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
1635 gf_do_initial_region_alignment(&rd);
1637 slow = (uint8_t *) rd.s_start;
1638 dlow = (uint8_t *) rd.d_start;
1639 top = (uint8_t*) rd.d_top;
1640 sub_reg_size = (top - dlow)/2;
1641 shigh = slow + sub_reg_size;
1642 dhigh = dlow + sub_reg_size;
1644 base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor);
1645 base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1);
1646 base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor);
1647 base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
1648 base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);
1650 gf_do_final_region_alignment(&rd);
1656 int gf_w64_composite_init(gf_t *gf)
1658 gf_internal_t *h = (gf_internal_t *) gf->scratch;
1660 if (h->region_type & GF_REGION_ALTMAP) {
1661 SET_FUNCTION(gf,multiply_region,w64,gf_w64_composite_multiply_region_alt)
1663 SET_FUNCTION(gf,multiply_region,w64,gf_w64_composite_multiply_region)
1666 SET_FUNCTION(gf,multiply,w64,gf_w64_composite_multiply)
1667 SET_FUNCTION(gf,divide,w64,NULL)
1668 SET_FUNCTION(gf,inverse,w64,gf_w64_composite_inverse)
1676 gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
1680 uint64_t pp, v, *s64, *d64, *top;
1681 __m128i si, tables[16][8], p[8], v0, mask1;
1682 struct gf_split_4_64_lazy_data *ld;
1686 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1687 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1689 h = (gf_internal_t *) gf->scratch;
1692 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
1693 gf_do_initial_region_alignment(&rd);
1695 s64 = (uint64_t *) rd.s_start;
1696 d64 = (uint64_t *) rd.d_start;
1697 top = (uint64_t *) rd.d_top;
1699 ld = (struct gf_split_4_64_lazy_data *) h->private;
1702 for (i = 0; i < 16; i++) {
1703 ld->tables[i][0] = 0;
1704 for (j = 1; j < 16; j <<= 1) {
1705 for (k = 0; k < j; k++) {
1706 ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
1708 v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
1710 for (j = 0; j < 8; j++) {
1711 for (k = 0; k < 16; k++) {
1712 btable[k] = (uint8_t) ld->tables[i][k];
1713 ld->tables[i][k] >>= 8;
1715 tables[i][j] = _mm_loadu_si128((__m128i *) btable);
1719 mask1 = _mm_set1_epi8(0xf);
1721 while (d64 != top) {
1724 for (i = 0; i < 8; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2));
1726 for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128();
1729 for (k = 0; k < 8; k++) {
1730 v0 = _mm_load_si128((__m128i *) s64);
1731 /* MM_PRINT8("v", v0); */
1734 si = _mm_and_si128(v0, mask1);
1736 for (j = 0; j < 8; j++) {
1737 p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
1740 v0 = _mm_srli_epi32(v0, 4);
1741 si = _mm_and_si128(v0, mask1);
1742 for (j = 0; j < 8; j++) {
1743 p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
1747 for (i = 0; i < 8; i++) {
1748 /* MM_PRINT8("v", p[i]); */
1749 _mm_store_si128((__m128i *) d64, p[i]);
1753 gf_do_final_region_alignment(&rd);
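/* Illustrative sketch (not part of the library): the ALTMAP layout used above.
   A 128-byte chunk holds sixteen 64-bit words, but row (register) j stores
   byte j of every word, so one 4-bit table lookup via _mm_shuffle_epi8 applies
   to 16 words at once.  The non-ALTMAP variant below performs this transpose
   (the pack passes) and its inverse (the unpack passes) on the fly. */
static void altmap_slice_sketch(const uint64_t src[16], uint8_t sliced[8][16])
{
  int w, byte;
  for (w = 0; w < 16; w++)
    for (byte = 0; byte < 8; byte++)
      sliced[byte][w] = (uint8_t)(src[w] >> (8 * byte));
}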
1760 gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
1764 uint64_t pp, v, *s64, *d64, *top;
1765 __m128i si, tables[16][8], p[8], st[8], mask1, mask8, mask16, t1;
1766 struct gf_split_4_64_lazy_data *ld;
1770 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1771 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1773 h = (gf_internal_t *) gf->scratch;
1776 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
1777 gf_do_initial_region_alignment(&rd);
1779 s64 = (uint64_t *) rd.s_start;
1780 d64 = (uint64_t *) rd.d_start;
1781 top = (uint64_t *) rd.d_top;
1783 ld = (struct gf_split_4_64_lazy_data *) h->private;
1786 for (i = 0; i < 16; i++) {
1787 ld->tables[i][0] = 0;
1788 for (j = 1; j < 16; j <<= 1) {
1789 for (k = 0; k < j; k++) {
1790 ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
1792 v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
1794 for (j = 0; j < 8; j++) {
1795 for (k = 0; k < 16; k++) {
1796 btable[k] = (uint8_t) ld->tables[i][k];
1797 ld->tables[i][k] >>= 8;
1799 tables[i][j] = _mm_loadu_si128((__m128i *) btable);
1803 mask1 = _mm_set1_epi8(0xf);
1804 mask8 = _mm_set1_epi16(0xff);
1805 mask16 = _mm_set1_epi32(0xffff);
1807 while (d64 != top) {
1809 for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128();
1811 for (k = 0; k < 8; k++) {
1812 st[k] = _mm_load_si128((__m128i *) s64);
1816 for (k = 0; k < 4; k ++) {
1817 st[k] = _mm_shuffle_epi32(st[k], _MM_SHUFFLE(3,1,2,0));
1818 st[k+4] = _mm_shuffle_epi32(st[k+4], _MM_SHUFFLE(2,0,3,1));
1819 t1 = _mm_blend_epi16(st[k], st[k+4], 0xf0);
1820 st[k] = _mm_srli_si128(st[k], 8);
1821 st[k+4] = _mm_slli_si128(st[k+4], 8);
1822 st[k+4] = _mm_blend_epi16(st[k], st[k+4], 0xf0);
1827 printf("After pack pass 1\n");
1828 for (k = 0; k < 8; k++) {
1829 MM_PRINT8("v", st[k]);
1834 t1 = _mm_packus_epi32(_mm_and_si128(st[0], mask16), _mm_and_si128(st[2], mask16));
1835 st[2] = _mm_packus_epi32(_mm_srli_epi32(st[0], 16), _mm_srli_epi32(st[2], 16));
1837 t1 = _mm_packus_epi32(_mm_and_si128(st[1], mask16), _mm_and_si128(st[3], mask16));
1838 st[3] = _mm_packus_epi32(_mm_srli_epi32(st[1], 16), _mm_srli_epi32(st[3], 16));
1840 t1 = _mm_packus_epi32(_mm_and_si128(st[4], mask16), _mm_and_si128(st[6], mask16));
1841 st[6] = _mm_packus_epi32(_mm_srli_epi32(st[4], 16), _mm_srli_epi32(st[6], 16));
1843 t1 = _mm_packus_epi32(_mm_and_si128(st[5], mask16), _mm_and_si128(st[7], mask16));
1844 st[7] = _mm_packus_epi32(_mm_srli_epi32(st[5], 16), _mm_srli_epi32(st[7], 16));
1848 printf("After pack pass 2\n");
1849 for (k = 0; k < 8; k++) {
1850 MM_PRINT8("v", st[k]);
1854 t1 = _mm_packus_epi16(_mm_and_si128(st[0], mask8), _mm_and_si128(st[1], mask8));
1855 st[1] = _mm_packus_epi16(_mm_srli_epi16(st[0], 8), _mm_srli_epi16(st[1], 8));
1857 t1 = _mm_packus_epi16(_mm_and_si128(st[2], mask8), _mm_and_si128(st[3], mask8));
1858 st[3] = _mm_packus_epi16(_mm_srli_epi16(st[2], 8), _mm_srli_epi16(st[3], 8));
1860 t1 = _mm_packus_epi16(_mm_and_si128(st[4], mask8), _mm_and_si128(st[5], mask8));
1861 st[5] = _mm_packus_epi16(_mm_srli_epi16(st[4], 8), _mm_srli_epi16(st[5], 8));
1863 t1 = _mm_packus_epi16(_mm_and_si128(st[6], mask8), _mm_and_si128(st[7], mask8));
1864 st[7] = _mm_packus_epi16(_mm_srli_epi16(st[6], 8), _mm_srli_epi16(st[7], 8));
1868 printf("After final pack pass 2\n");
1869 for (k = 0; k < 8; k++) {
1870 MM_PRINT8("v", st[k]);
1874 for (k = 0; k < 8; k++) {
1875 si = _mm_and_si128(st[k], mask1);
1877 for (j = 0; j < 8; j++) {
1878 p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
1881 st[k] = _mm_srli_epi32(st[k], 4);
1882 si = _mm_and_si128(st[k], mask1);
1883 for (j = 0; j < 8; j++) {
1884 p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
1889 t1 = _mm_unpacklo_epi8(p[0], p[1]);
1890 p[1] = _mm_unpackhi_epi8(p[0], p[1]);
1892 t1 = _mm_unpacklo_epi8(p[2], p[3]);
1893 p[3] = _mm_unpackhi_epi8(p[2], p[3]);
1895 t1 = _mm_unpacklo_epi8(p[4], p[5]);
1896 p[5] = _mm_unpackhi_epi8(p[4], p[5]);
1898 t1 = _mm_unpacklo_epi8(p[6], p[7]);
1899 p[7] = _mm_unpackhi_epi8(p[6], p[7]);
1903 printf("After unpack pass 1:\n");
1904 for (i = 0; i < 8; i++) {
1905 MM_PRINT8("v", p[i]);
1909 t1 = _mm_unpacklo_epi16(p[0], p[2]);
1910 p[2] = _mm_unpackhi_epi16(p[0], p[2]);
1912 t1 = _mm_unpacklo_epi16(p[1], p[3]);
1913 p[3] = _mm_unpackhi_epi16(p[1], p[3]);
1915 t1 = _mm_unpacklo_epi16(p[4], p[6]);
1916 p[6] = _mm_unpackhi_epi16(p[4], p[6]);
1918 t1 = _mm_unpacklo_epi16(p[5], p[7]);
1919 p[7] = _mm_unpackhi_epi16(p[5], p[7]);
1923 printf("After unpack pass 2:\n");
1924 for (i = 0; i < 8; i++) {
1925 MM_PRINT8("v", p[i]);
1929 t1 = _mm_unpacklo_epi32(p[0], p[4]);
1930 p[4] = _mm_unpackhi_epi32(p[0], p[4]);
1932 t1 = _mm_unpacklo_epi32(p[1], p[5]);
1933 p[5] = _mm_unpackhi_epi32(p[1], p[5]);
1935 t1 = _mm_unpacklo_epi32(p[2], p[6]);
1936 p[6] = _mm_unpackhi_epi32(p[2], p[6]);
1938 t1 = _mm_unpacklo_epi32(p[3], p[7]);
1939 p[7] = _mm_unpackhi_epi32(p[3], p[7]);
1943 for (i = 0; i < 8; i++) {
1944 t1 = _mm_load_si128((__m128i *) d64);
1945 _mm_store_si128((__m128i *) d64, _mm_xor_si128(p[i], t1));
1949 for (i = 0; i < 8; i++) {
1950 _mm_store_si128((__m128i *) d64, p[i]);
1957 gf_do_final_region_alignment(&rd);
1961 #define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1);
1964 int gf_w64_split_init(gf_t *gf)
1967 struct gf_split_4_64_lazy_data *d4;
1968 struct gf_split_8_64_lazy_data *d8;
1969 struct gf_split_8_8_data *d88;
1970 struct gf_split_16_64_lazy_data *d16;
1974 h = (gf_internal_t *) gf->scratch;
1978 SET_FUNCTION(gf,multiply_region,w64,gf_w64_multiply_region_from_single)
1980 SET_FUNCTION(gf,multiply,w64,gf_w64_bytwo_p_multiply)
1982 #if defined(INTEL_SSE4_PCLMUL)
1983 if (gf_cpu_supports_intel_pclmul) {
1984 if ((!(h->region_type & GF_REGION_NOSIMD) &&
1985 (h->arg1 == 64 || h->arg2 == 64)) ||
1986 h->mult_type == GF_MULT_DEFAULT){
1988 if ((0xfffffffe00000000ULL & h->prim_poly) == 0){
1989 SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_2)
1990 SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_2)
1991 }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
1992 SET_FUNCTION(gf,multiply,w64,gf_w64_clm_multiply_4)
1993 SET_FUNCTION(gf,multiply_region,w64,gf_w64_clm_multiply_region_from_single_4)
2001 SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
2003 /* Allen: set region pointers for default mult type. Single pointers are
2004 * taken care of above (explicitly for sse, implicitly for no sse). */
2006 if (h->mult_type == GF_MULT_DEFAULT) {
2007 #if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
2008 if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
2009 d4 = (struct gf_split_4_64_lazy_data *) h->private;
2011 #if defined(INTEL_SSE4)
2012 if (gf_cpu_supports_intel_sse4)
2013 SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
2014 #elif defined(ARCH_AARCH64)
2015 if (gf_cpu_supports_arm_neon)
2016 gf_w64_neon_split_init(gf);
2020 d8 = (struct gf_split_8_64_lazy_data *) h->private;
2022 SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region)
2023 #if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
2028 if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) {
2029 d4 = (struct gf_split_4_64_lazy_data *) h->private;
2032 if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSIMD)) return 0;
2033 if(h->region_type & GF_REGION_ALTMAP)
2036 if (gf_cpu_supports_intel_ssse3) {
2037 SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_altmap_multiply_region)
2039 #elif defined(ARCH_AARCH64)
2040 if (gf_cpu_supports_arm_neon) {
2041 gf_w64_neon_split_init(gf);
2048 #if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
2049 if(gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
2050 if (h->region_type & GF_REGION_NOSIMD) {
2051 SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
2053 #if defined(INTEL_SSE4)
2054 SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_sse_multiply_region)
2055 #elif defined(ARCH_AARCH64)
2056 gf_w64_neon_split_init(gf);
2060 SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region)
2061 if(h->region_type & GF_REGION_SIMD)
2063 #if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
2068 if ((h->arg1 == 8 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 8)) {
2069 d8 = (struct gf_split_8_64_lazy_data *) h->private;
2071 SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_8_64_lazy_multiply_region)
2073 if ((h->arg1 == 16 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 16)) {
2074 d16 = (struct gf_split_16_64_lazy_data *) h->private;
2075 d16->last_value = 0;
2076 SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_16_64_lazy_multiply_region)
2078 if ((h->arg1 == 8 && h->arg2 == 8)) {
2079 d88 = (struct gf_split_8_8_data *) h->private;
2080 SET_FUNCTION(gf,multiply,w64,gf_w64_split_8_8_multiply)
2082 /* The performance of this guy sucks, so don't bother with a region op */
2085 for (exp = 0; exp < 15; exp++) {
2086 for (j = 0; j < 256; j++) d88->tables[exp][0][j] = 0;
2087 for (i = 0; i < 256; i++) d88->tables[exp][i][0] = 0;
2088 d88->tables[exp][1][1] = basep;
2089 for (i = 2; i < 256; i++) {
2091 p = d88->tables[exp][i^1][1];
2092 d88->tables[exp][i][1] = p ^ basep;
2094 p = d88->tables[exp][i>>1][1];
2095 d88->tables[exp][i][1] = GF_MULTBY_TWO(p);
2098 for (i = 1; i < 256; i++) {
2099 p = d88->tables[exp][i][1];
2100 for (j = 1; j < 256; j++) {
2102 d88->tables[exp][i][j] = d88->tables[exp][i][j^1] ^ p;
2104 d88->tables[exp][i][j] = GF_MULTBY_TWO(d88->tables[exp][i][j>>1]);
2108 for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep);
2114 int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
2119 return sizeof(gf_internal_t);
2121 case GF_MULT_CARRY_FREE:
2122 return sizeof(gf_internal_t);
2124 case GF_MULT_BYTWO_p:
2125 case GF_MULT_BYTWO_b:
2126 return sizeof(gf_internal_t);
2129 case GF_MULT_DEFAULT:
2131 /* Allen: set the *local* arg1 and arg2, just for scratch size purposes,
2132 * then fall through to split table scratch size code. */
2134 #if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
2135 if (gf_cpu_supports_intel_sse4 || gf_cpu_supports_arm_neon) {
2142 #if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
2146 case GF_MULT_SPLIT_TABLE:
2147 if (arg1 == 8 && arg2 == 8) {
2148 return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64;
2150 if ((arg1 == 16 && arg2 == 64) || (arg2 == 16 && arg1 == 64)) {
2151 return sizeof(gf_internal_t) + sizeof(struct gf_split_16_64_lazy_data) + 64;
2153 if ((arg1 == 8 && arg2 == 64) || (arg2 == 8 && arg1 == 64)) {
2154 return sizeof(gf_internal_t) + sizeof(struct gf_split_8_64_lazy_data) + 64;
2157 if ((arg1 == 64 && arg2 == 4) || (arg1 == 4 && arg2 == 64)) {
2158 return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64;
2162 return sizeof(gf_internal_t) + sizeof(struct gf_w64_group_data) +
2163 sizeof(uint64_t) * (1 << arg1) +
2164 sizeof(uint64_t) * (1 << arg2) + 64;
2166 case GF_MULT_COMPOSITE:
2167 if (arg1 == 2) return sizeof(gf_internal_t) + 64;
2175 int gf_w64_init(gf_t *gf)
2179 h = (gf_internal_t *) gf->scratch;
2181 /* Allen: set default primitive polynomial / irreducible polynomial if needed */
2183 /* Omitting the leftmost 1 as in w=32 */
2185 if (h->prim_poly == 0) {
2186 if (h->mult_type == GF_MULT_COMPOSITE) {
2187 h->prim_poly = gf_composite_get_default_poly(h->base_gf);
2188 if (h->prim_poly == 0) return 0; /* This shouldn't happen */
2190 h->prim_poly = 0x1b;
2194 SET_FUNCTION(gf,multiply,w64,NULL)
2195 SET_FUNCTION(gf,divide,w64,NULL)
2196 SET_FUNCTION(gf,inverse,w64,NULL)
2197 SET_FUNCTION(gf,multiply_region,w64,NULL)
2199 switch(h->mult_type) {
2200 case GF_MULT_CARRY_FREE: if (gf_w64_cfm_init(gf) == 0) return 0; break;
2201 case GF_MULT_SHIFT: if (gf_w64_shift_init(gf) == 0) return 0; break;
2202 case GF_MULT_COMPOSITE: if (gf_w64_composite_init(gf) == 0) return 0; break;
2203 case GF_MULT_DEFAULT:
2204 case GF_MULT_SPLIT_TABLE: if (gf_w64_split_init(gf) == 0) return 0; break;
2205 case GF_MULT_GROUP: if (gf_w64_group_init(gf) == 0) return 0; break;
2206 case GF_MULT_BYTWO_p:
2207 case GF_MULT_BYTWO_b: if (gf_w64_bytwo_init(gf) == 0) return 0; break;
2210 if (h->divide_type == GF_DIVIDE_EUCLID) {
2211 SET_FUNCTION(gf,divide,w64,gf_w64_divide_from_inverse)
2212 SET_FUNCTION(gf,inverse,w64,gf_w64_euclid)
2215 if (gf->inverse.w64 != NULL && gf->divide.w64 == NULL) {
2216 SET_FUNCTION(gf,divide,w64,gf_w64_divide_from_inverse)
2218 if (gf->inverse.w64 == NULL && gf->divide.w64 != NULL) {
2219 SET_FUNCTION(gf,inverse,w64,gf_w64_inverse_from_divide)
2222 if (h->region_type == GF_REGION_CAUCHY) return 0;
2224 if (h->region_type & GF_REGION_ALTMAP) {
2225 if (h->mult_type == GF_MULT_COMPOSITE) {
2226 SET_FUNCTION(gf,extract_word,w64,gf_w64_composite_extract_word)
2227 } else if (h->mult_type == GF_MULT_SPLIT_TABLE) {
2228 SET_FUNCTION(gf,extract_word,w64,gf_w64_split_extract_word)
2231 SET_FUNCTION(gf,extract_word,w64,gf_w64_extract_word)