2 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
3 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
4 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
8 * Routines for 32-bit Galois fields
18 #define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }
20 #define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "  " : " ", blah[15-ii]); printf("\n"); }
22 #define AB2(ip, am1, am2, b, t1, t2) {\
23 t1 = (b << 1) & am1; \
24 t2 = b & am2; \
25 t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
26 b = (t1 ^ (t2 & ip));}
28 #define SSE_AB2(pp, m1, m2, va, t1, t2) {\
29 t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
30 t2 = _mm_and_si128(va, m2); \
31 t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
32 va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
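/* AB2 multiplies each 32-bit word packed into b (two per 64-bit word) by x,
   i.e. by 2, in GF(2^32) without branching: am2 isolates the top bit of every
   word, and the shift/subtract trick turns that bit into an all-ones mask for
   exactly the words that overflow, so the primitive polynomial ip is XORed
   back only where it is needed; am1 keeps bits shifted out of one word from
   leaking into its neighbor.  SSE_AB2 is the same step on a 128-bit register.
   For a single word the step is, in effect:

     if (x & 0x80000000) x = (x << 1) ^ prim_poly; else x = x << 1;
*/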
36 uint32_t gf_w32_inverse_from_divide (gf_t *gf, uint32_t a)
38 return gf->divide.w32(gf, 1, a);
43 uint32_t gf_w32_divide_from_inverse (gf_t *gf, uint32_t a, uint32_t b)
45 b = gf->inverse.w32(gf, b);
46 return gf->multiply.w32(gf, a, b);
51 gf_w32_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
58 s32 = (uint32_t *) src;
59 d32 = (uint32_t *) dest;
62 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
63 d32[i] ^= gf->multiply.w32(gf, val, s32[i]);
66 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
67 d32[i] = gf->multiply.w32(gf, val, s32[i]);
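/* The three PCLMUL region routines below (_2, _3, _4) are identical except for
   how many times the reduction step is applied.  After result = a*b, each step
   folds the bits above position 31 back into the field:

     high    = result >> 32                 -- the part that sticks out past x^31
     result ^= clmul(high, prim_poly)       -- prim_poly here includes the x^32 term

   Each fold lowers the degree of the overflow by roughly z, the number of zero
   coefficients just below the x^32 term, so gf_w32_cfm_init picks the 2-, 3- or
   4-fold variant from how small the rest of the polynomial is (the
   0xfffe0000 / 0xffc00000 / 0xfe000000 tests). */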
72 #if defined(INTEL_SSE4_PCLMUL)
76 gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
87 gf_internal_t * h = gf->scratch;
89 prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
91 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
92 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
94 a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
95 s32 = (uint32_t *) src;
96 d32 = (uint32_t *) dest;
99 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
100 b = _mm_insert_epi32 (a, s32[i], 0);
101 result = _mm_clmulepi64_si128 (a, b, 0);
102 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
103 result = _mm_xor_si128 (result, w);
104 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
105 result = _mm_xor_si128 (result, w);
106 d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
109 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
110 b = _mm_insert_epi32 (a, s32[i], 0);
111 result = _mm_clmulepi64_si128 (a, b, 0);
112 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
113 result = _mm_xor_si128 (result, w);
114 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
115 result = _mm_xor_si128 (result, w);
116 d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
122 #if defined(INTEL_SSE4_PCLMUL)
126 gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
137 gf_internal_t * h = gf->scratch;
139 prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
141 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
142 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
144 a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
146 s32 = (uint32_t *) src;
147 d32 = (uint32_t *) dest;
150 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
151 b = _mm_insert_epi32 (a, s32[i], 0);
152 result = _mm_clmulepi64_si128 (a, b, 0);
153 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
154 result = _mm_xor_si128 (result, w);
155 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
156 result = _mm_xor_si128 (result, w);
157 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
158 result = _mm_xor_si128 (result, w);
159 d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
162 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
163 b = _mm_insert_epi32 (a, s32[i], 0);
164 result = _mm_clmulepi64_si128 (a, b, 0);
165 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
166 result = _mm_xor_si128 (result, w);
167 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
168 result = _mm_xor_si128 (result, w);
169 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
170 result = _mm_xor_si128 (result, w);
171 d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
177 #if defined(INTEL_SSE4_PCLMUL)
180 gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
190 gf_internal_t * h = gf->scratch;
192 prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
194 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
195 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
197 a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
199 s32 = (uint32_t *) src;
200 d32 = (uint32_t *) dest;
203 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
204 b = _mm_insert_epi32 (a, s32[i], 0);
205 result = _mm_clmulepi64_si128 (a, b, 0);
206 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
207 result = _mm_xor_si128 (result, w);
208 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
209 result = _mm_xor_si128 (result, w);
210 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
211 result = _mm_xor_si128 (result, w);
212 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
213 result = _mm_xor_si128 (result, w);
214 d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
217 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
218 b = _mm_insert_epi32 (a, s32[i], 0);
219 result = _mm_clmulepi64_si128 (a, b, 0);
220 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
221 result = _mm_xor_si128 (result, w);
222 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
223 result = _mm_xor_si128 (result, w);
224 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
225 result = _mm_xor_si128 (result, w);
226 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
227 result = _mm_xor_si128 (result, w);
228 d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
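/* gf_w32_euclid below is the extended Euclidean algorithm over GF(2)[x], used
   as the default inverse.  e_i holds the current remainder (starting from the
   primitive polynomial and b), d_i its degree, and y_i the Bezout coefficient.
   The inner while loop performs one polynomial division with aligned XORs,
   y_ip1 = y_im1 ^ c_i * y_i carries the coefficients along, and once the
   remainder reaches 1 the accumulated y is b^-1. */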
236 uint32_t gf_w32_euclid (gf_t *gf, uint32_t b)
238 uint32_t e_i, e_im1, e_ip1;
239 uint32_t d_i, d_im1, d_ip1;
240 uint32_t y_i, y_im1, y_ip1;
243 if (b == 0) return -1;
244 e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
247 for (d_i = d_im1-1; ((1 << d_i) & e_i) == 0; d_i--) ;
257 while (d_ip1 >= d_i) {
258 c_i ^= (1 << (d_ip1 - d_i));
259 e_ip1 ^= (e_i << (d_ip1 - d_i));
261 if (e_ip1 == 0) return 0;
262 while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
265 y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
279 gf_val_32_t gf_w32_extract_word(gf_t *gf, void *start, int bytes, int index)
283 r32 = (uint32_t *) start;
289 gf_val_32_t gf_w32_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
297 h = (gf_internal_t *) gf->scratch;
298 gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
299 r32 = (uint32_t *) start;
300 if (r32 + index < (uint32_t *) rd.d_start) return r32[index];
301 if (r32 + index >= (uint32_t *) rd.d_top) return r32[index];
302 index -= (((uint32_t *) rd.d_start) - r32);
303 r8 = (uint8_t *) rd.d_start;
304 top = (uint8_t *) rd.d_top;
305 sub_size = (top-r8)/2;
307 a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index);
308 b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index);
309 return (a | (b << 16));
313 gf_val_32_t gf_w32_split_extract_word(gf_t *gf, void *start, int bytes, int index)
320 gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64);
321 r32 = (uint32_t *) start;
322 if (r32 + index < (uint32_t *) rd.d_start) return r32[index];
323 if (r32 + index >= (uint32_t *) rd.d_top) return r32[index];
324 index -= (((uint32_t *) rd.d_start) - r32);
325 r8 = (uint8_t *) rd.d_start;
326 r8 += ((index & 0xfffffff0)*4);
330 for (i = 0; i < 4; i++) {
341 uint32_t gf_w32_matrix (gf_t *gf, uint32_t b)
343 return gf_bitmatrix_inverse(b, 32, ((gf_internal_t *) (gf->scratch))->prim_poly);
346 /* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only
347 include it for completeness. It does have the feature that it requires no extra memory. */
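/* The CFM-GK routines below avoid repeated folding by using what is in effect
   a Barrett-style reduction.  gf_w32_cfmgk_init precomputes
   q_plus = x^64 div p(x) and g_star = p(x) minus its x^32 term, and the 64-bit
   product c = c1*x^32 + c0 is then reduced with two dependent carry-less
   multiplies:

     t        = (c1 * q_plus) >> 32
     c mod p  = low 32 bits of (c0 ^ (t * g_star))
*/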
351 #if defined(INTEL_SSE4_PCLMUL)
356 gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
364 gf_internal_t * h = gf->scratch;
365 uint64_t g_star, q_plus;
367 q_plus = *(uint64_t *) h->private;
368 g_star = *((uint64_t *) h->private + 1);
370 a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
371 b = _mm_insert_epi32 (a, b32, 0);
372 g = _mm_insert_epi64 (a, g_star, 0);
373 q = _mm_insert_epi64 (a, q_plus, 0);
375 result = _mm_clmulepi64_si128 (a, b, 0);
376 w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0);
377 w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0);
378 result = _mm_xor_si128 (result, w);
380 /* Extracts 32 bit value from result. */
381 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
386 #if defined(INTEL_SSE4_PCLMUL)
390 gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
401 gf_internal_t * h = gf->scratch;
402 uint64_t g_star, q_plus;
404 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
405 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
407 q_plus = *(uint64_t *) h->private;
408 g_star = *((uint64_t *) h->private + 1);
410 a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
411 g = _mm_insert_epi64 (a, g_star, 0);
412 q = _mm_insert_epi64 (a, q_plus, 0);
413 s32 = (uint32_t *) src;
414 d32 = (uint32_t *) dest;
417 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
418 b = _mm_insert_epi32 (a, s32[i], 0);
419 result = _mm_clmulepi64_si128 (a, b, 0);
420 w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0);
421 w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0);
422 result = _mm_xor_si128 (result, w);
423 d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
426 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
427 b = _mm_insert_epi32 (a, s32[i], 0);
428 result = _mm_clmulepi64_si128 (a, b, 0);
429 w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0);
430 w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0);
431 result = _mm_xor_si128 (result, w);
432 d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
439 #if defined(INTEL_SSE4_PCLMUL)
444 gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
452 gf_internal_t * h = gf->scratch;
455 a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
456 b = _mm_insert_epi32 (a, b32, 0);
458 prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
460 /* Do the initial multiply */
462 result = _mm_clmulepi64_si128 (a, b, 0);
464 /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only
465 have to do the reduction at most twice, because (w-2)/z == 2, where
466 z is the number of zeros after the leading 1 of the primitive polynomial.
468 _mm_clmulepi64_si128 is the carryless multiply operation. Here
469 _mm_srli_si128 shifts the result to the right by 4 bytes. This allows
470 us to multiply the prim_poly by the leading bits of the result. We
471 then xor the result of that operation back with the result.*/
473 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
474 result = _mm_xor_si128 (result, w);
475 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
476 result = _mm_xor_si128 (result, w);
478 /* Extracts 32 bit value from result. */
479 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
484 #if defined(INTEL_SSE4_PCLMUL)
489 gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
497 gf_internal_t * h = gf->scratch;
500 a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
501 b = _mm_insert_epi32 (a, b32, 0);
503 prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
505 /* Do the initial multiply */
507 result = _mm_clmulepi64_si128 (a, b, 0);
509 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
510 result = _mm_xor_si128 (result, w);
511 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
512 result = _mm_xor_si128 (result, w);
513 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
514 result = _mm_xor_si128 (result, w);
516 /* Extracts 32 bit value from result. */
518 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
523 #if defined(INTEL_SSE4_PCLMUL)
528 gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
536 gf_internal_t * h = gf->scratch;
539 a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
540 b = _mm_insert_epi32 (a, b32, 0);
542 prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
544 /* Do the initial multiply */
546 result = _mm_clmulepi64_si128 (a, b, 0);
548 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
549 result = _mm_xor_si128 (result, w);
550 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
551 result = _mm_xor_si128 (result, w);
552 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
553 result = _mm_xor_si128 (result, w);
554 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
555 result = _mm_xor_si128 (result, w);
557 /* Extracts 32 bit value from result. */
559 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
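/* gf_w32_shift_multiply below is the reference implementation: a schoolbook
   carry-less multiply into a 64-bit accumulator, after which bits 62..32 are
   cleared by XORing shifted copies of the primitive polynomial.  The same idea
   in GF(2^3) with p(x) = x^3+x+1 (0xb): 6 * 5 gives the carry-less product
   0x1e; folding bit 4 with (0xb << 1) and then bit 3 with 0xb yields
   0x1e ^ 0x16 ^ 0xb = 0x3, so 6 * 5 = 3 in that field. */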
568 gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
570 uint64_t product, i, pp, a, b, one;
575 h = (gf_internal_t *) gf->scratch;
577 pp = h->prim_poly | (one << 32);
581 for (i = 0; i < GF_FIELD_WIDTH; i++) {
582 if (a & (one << i)) product ^= (b << i);
584 for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) {
585 if (product & (one << i)) product ^= (pp << (i-GF_FIELD_WIDTH));
591 int gf_w32_cfmgk_init(gf_t *gf)
593 SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
594 SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)
596 #if defined(INTEL_SSE4_PCLMUL)
597 if (gf_cpu_supports_intel_pclmul) {
600 h = (gf_internal_t *) gf->scratch;
601 SET_FUNCTION(gf,multiply,w32,gf_w32_cfmgk_multiply)
602 SET_FUNCTION(gf,multiply_region,w32,gf_w32_cfmgk_multiply_region_from_single)
604 uint64_t *q_plus = (uint64_t *) h->private;
605 uint64_t *g_star = (uint64_t *) h->private + 1;
607 uint64_t tmp = h->prim_poly << 32;
608 *q_plus = 1ULL << 32;
611 for(i = 63; i >= 32; i--)
612 if((1ULL << i) & tmp)
614 *q_plus |= 1ULL << (i-32);
615 tmp ^= h->prim_poly << (i-32);
618 *g_star = h->prim_poly & ((1ULL << 32) - 1);
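/* The loop above is a long division of x^64 by p(x); the implicit x^32 term of
   p(x) accounts for the initial 1ULL << 32.  q_plus ends up holding the
   quotient and g_star is p(x) with its leading x^32 term dropped -- the two
   constants needed by the reduction in gf_w32_cfmgk_multiply. */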
628 int gf_w32_cfm_init(gf_t *gf)
630 SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
631 SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)
633 /* Ben: We also check to see if the prim poly will work for pclmul. */
634 /* Ben: Check to see how many reduction steps it will take. */
636 #if defined(INTEL_SSE4_PCLMUL)
637 if (gf_cpu_supports_intel_pclmul) {
640 h = (gf_internal_t *) gf->scratch;
642 if ((0xfffe0000 & h->prim_poly) == 0){
643 SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
644 SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_2)
645 }else if ((0xffc00000 & h->prim_poly) == 0){
646 SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
647 SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_3)
648 }else if ((0xfe000000 & h->prim_poly) == 0){
649 SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
650 SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_4)
662 int gf_w32_shift_init(gf_t *gf)
664 SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
665 SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)
666 SET_FUNCTION(gf,multiply,w32,gf_w32_shift_multiply)
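/* GROUP multiplication (arguments g_s and g_r).  gf_w32_group_set_shift_tables
   fills shift[] with val multiplied by every g_s-bit value, so the multiply
   routines can consume the other operand g_s bits at a time:
   p = (p << g_s) ^ shift[next g_s bits].  Bits pushed above position 31 are
   folded back through a second table, reduce[], holding the 2^g_r precomputed
   reductions; when g_s == g_r both lookups happen in the same step (the
   "s_equals_r" variants). */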
672 gf_w32_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h)
679 for (i = 1; i < ((uint32_t)1 << h->arg1); i <<= 1) {
680 for (j = 0; j < i; j++) shift[i|j] = shift[j]^val;
681 if (val & GF_FIRST_BIT) {
691 void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
694 uint32_t p, l, ind, a32;
698 uint32_t *s32, *d32, *top;
699 struct gf_w32_group_data *gd;
700 gf_internal_t *h = (gf_internal_t *) gf->scratch;
702 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
703 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
705 gd = (struct gf_w32_group_data *) h->private;
707 gf_w32_group_set_shift_tables(gd->shift, val, h);
709 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
710 gf_do_initial_region_alignment(&rd);
712 s32 = (uint32_t *) rd.s_start;
713 d32 = (uint32_t *) rd.d_start;
714 top = (uint32_t *) rd.d_top;
717 if (leftover == 0) leftover = g_s;
729 while (bits_left > 0) {
734 p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
741 gf_do_final_region_alignment(&rd);
745 void gf_w32_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
747 uint32_t *s32, *d32, *top;
753 struct gf_w32_group_data *gd;
756 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
757 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
759 gf_internal_t *h = (gf_internal_t *) gf->scratch;
762 gd = (struct gf_w32_group_data *) h->private;
763 gf_w32_group_set_shift_tables(gd->shift, val, h);
765 leftover = GF_FIELD_WIDTH % g_s;
766 if (leftover == 0) leftover = g_s;
768 gd = (struct gf_w32_group_data *) h->private;
769 gf_w32_group_set_shift_tables(gd->shift, val, h);
771 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
772 gf_do_initial_region_alignment(&rd);
774 s32 = (uint32_t *) rd.s_start;
775 d32 = (uint32_t *) rd.d_start;
776 top = (uint32_t *) rd.d_top;
780 ind = a32 >> (GF_FIELD_WIDTH - leftover);
785 i = (GF_FIELD_WIDTH - leftover);
787 ind = a32 >> (GF_FIELD_WIDTH-g_s);
794 ind = a32 >> (GF_FIELD_WIDTH-g_s);
797 for (i = gd->tshift ; i >= 0; i -= g_r) {
798 l = p & (gd->rmask << i);
799 r = gd->reduce[l >> (i+32)];
809 gf_do_final_region_alignment(&rd);
815 gf_w32_group_s_equals_r_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
818 uint32_t p, l, ind, a32;
822 struct gf_w32_group_data *gd;
823 gf_internal_t *h = (gf_internal_t *) gf->scratch;
826 gd = (struct gf_w32_group_data *) h->private;
827 gf_w32_group_set_shift_tables(gd->shift, b, h);
830 if (leftover == 0) leftover = g_s;
841 while (bits_left > 0) {
846 p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
854 gf_w32_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
856 uint32_t p, l, ind, a32;
858 struct gf_w32_group_data *d44;
859 gf_internal_t *h = (gf_internal_t *) gf->scratch;
861 d44 = (struct gf_w32_group_data *) h->private;
862 gf_w32_group_set_shift_tables(d44->shift, b, h);
871 p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
875 p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
879 p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
883 p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
887 p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
891 p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
894 p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
901 gf_w32_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
908 struct gf_w32_group_data *gd;
910 gf_internal_t *h = (gf_internal_t *) gf->scratch;
913 gd = (struct gf_w32_group_data *) h->private;
914 gf_w32_group_set_shift_tables(gd->shift, b, h);
916 leftover = GF_FIELD_WIDTH % g_s;
917 if (leftover == 0) leftover = g_s;
920 ind = a32 >> (GF_FIELD_WIDTH - leftover);
925 i = (GF_FIELD_WIDTH - leftover);
927 ind = a32 >> (GF_FIELD_WIDTH-g_s);
934 ind = a32 >> (GF_FIELD_WIDTH-g_s);
937 for (i = gd->tshift ; i >= 0; i -= g_r) {
938 l = p & (gd->rmask << i);
939 r = gd->reduce[l >> (i+32)];
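/* BYTWO_b: scan the bits of a from the bottom up; b is doubled (multiplied by
   x modulo the polynomial) at each step and every set bit of a XORs the
   current b into the product.  In effect:

     prod = 0;
     while (a != 0) {
       if (a & 1) prod ^= b;
       a >>= 1;
       b = (b & 0x80000000) ? ((b << 1) ^ pp) : (b << 1);
     }
*/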
949 gf_w32_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
951 uint32_t prod, pp, bmask;
954 h = (gf_internal_t *) gf->scratch;
961 if (a & 1) prod ^= b;
963 if (a == 0) return prod;
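/* BYTWO_p: the Horner variant.  The bits of a are scanned from the top down;
   the running product is multiplied by x each step and b is XORed in whenever
   the current bit of a is set.  In effect:

     prod = 0;
     for (i = 31; i >= 0; i--) {
       prod = (prod & 0x80000000) ? ((prod << 1) ^ pp) : (prod << 1);
       if (a & (1u << i)) prod ^= b;
     }
*/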
975 gf_w32_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
977 uint32_t prod, pp, pmask, amask;
980 h = (gf_internal_t *) gf->scratch;
990 prod = ((prod << 1) ^ pp);
994 if (a & amask) prod ^= b;
1002 gf_w32_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1004 uint64_t *s64, *d64, t1, t2, ta, prod, amask;
1006 struct gf_w32_bytwo_data *btd;
1008 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1009 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1011 btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
1013 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
1014 gf_do_initial_region_alignment(&rd);
1016 s64 = (uint64_t *) rd.s_start;
1017 d64 = (uint64_t *) rd.d_start;
1020 while (s64 < (uint64_t *) rd.s_top) {
1024 while (amask != 0) {
1025 AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
1026 if (val & amask) prod ^= ta;
1034 while (s64 < (uint64_t *) rd.s_top) {
1038 while (amask != 0) {
1039 AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
1040 if (val & amask) prod ^= ta;
1048 gf_do_final_region_alignment(&rd);
1051 #define BYTWO_P_ONESTEP {\
1052 SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
1053 t1 = _mm_and_si128(v, one); \
1054 t1 = _mm_sub_epi32(t1, one); \
1055 t1 = _mm_and_si128(t1, ta); \
1056 prod = _mm_xor_si128(prod, t1); \
1057 v = _mm_srli_epi64(v, 1); }
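/* One BYTWO_p step on a full 128-bit register.  The conditional XOR is done
   without a branch: v holds the bits of val complemented, consumed from the
   low end (see how vrev is built below), so (v & 1) - 1 is all ones exactly
   when the next bit of val is set, and that mask selects ta for the XOR
   before v is shifted to expose the following bit. */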
1062 gf_w32_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1067 __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;
1068 struct gf_w32_bytwo_data *btd;
1071 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1072 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1074 btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
1076 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
1077 gf_do_initial_region_alignment(&rd);
1080 for (i = 0; i < 32; i++) {
1082 if (!(val & ((gf_val_32_t)1 << i))) vrev |= 1;
1085 s8 = (uint8_t *) rd.s_start;
1086 d8 = (uint8_t *) rd.d_start;
1088 pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
1089 m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
1090 m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);
1091 one = _mm_set1_epi32(1);
1093 while (d8 < (uint8_t *) rd.d_top) {
1094 prod = _mm_setzero_si128();
1095 v = _mm_set1_epi32(vrev);
1096 ta = _mm_load_si128((__m128i *) s8);
1097 tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
1098 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1099 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1100 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1101 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1102 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1103 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1104 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1105 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1106 _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
1110 gf_do_final_region_alignment(&rd);
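/* The non-SSE BYTWO_b region handler below special-cases small multipliers:
   the unrolled loop pairs handle val = 2, 3, 4 and 5 with one or two straight
   AB2 doublings (plus an extra XOR of the source word where needed), and only
   the final pair of loops falls back to the general bit-by-bit scan of val. */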
1116 gf_w32_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1118 uint64_t *s64, *d64, t1, t2, ta, tb, prod;
1119 struct gf_w32_bytwo_data *btd;
1122 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1123 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1125 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
1126 gf_do_initial_region_alignment(&rd);
1128 btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
1129 s64 = (uint64_t *) rd.s_start;
1130 d64 = (uint64_t *) rd.d_start;
1135 while (d64 < (uint64_t *) rd.d_top) {
1137 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1143 while (d64 < (uint64_t *) rd.d_top) {
1145 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1154 while (d64 < (uint64_t *) rd.d_top) {
1157 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1158 *d64 ^= (ta ^ prod);
1163 while (d64 < (uint64_t *) rd.d_top) {
1166 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1175 while (d64 < (uint64_t *) rd.d_top) {
1177 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1178 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1184 while (d64 < (uint64_t *) rd.d_top) {
1186 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1187 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1196 while (d64 < (uint64_t *) rd.d_top) {
1199 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1200 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1201 *d64 ^= (ta ^ prod);
1206 while (d64 < (uint64_t *) rd.d_top) {
1209 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1210 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1219 while (d64 < (uint64_t *) rd.d_top) {
1224 if (tb & 1) prod ^= ta;
1227 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1234 while (d64 < (uint64_t *) rd.d_top) {
1239 if (tb & 1) prod ^= ta;
1242 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1251 gf_do_final_region_alignment(&rd);
1257 gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w32_bytwo_data *btd)
1260 __m128i pp, m1, m2, t1, t2, va;
1262 s8 = (uint8_t *) rd->s_start;
1263 d8 = (uint8_t *) rd->d_start;
1265 pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
1266 m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
1267 m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);
1269 while (d8 < (uint8_t *) rd->d_top) {
1270 va = _mm_load_si128 ((__m128i *)(s8));
1271 SSE_AB2(pp, m1, m2, va, t1, t2);
1272 _mm_store_si128((__m128i *)d8, va);
1282 gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data *btd)
1285 __m128i pp, m1, m2, t1, t2, va, vb;
1287 s8 = (uint8_t *) rd->s_start;
1288 d8 = (uint8_t *) rd->d_start;
1290 pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
1291 m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
1292 m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);
1294 while (d8 < (uint8_t *) rd->d_top) {
1295 va = _mm_load_si128 ((__m128i *)(s8));
1296 SSE_AB2(pp, m1, m2, va, t1, t2);
1297 vb = _mm_load_si128 ((__m128i *)(d8));
1298 vb = _mm_xor_si128(vb, va);
1299 _mm_store_si128((__m128i *)d8, vb);
1310 gf_w32_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1314 __m128i pp, m1, m2, t1, t2, va, vb;
1315 struct gf_w32_bytwo_data *btd;
1318 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1319 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1321 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
1322 gf_do_initial_region_alignment(&rd);
1324 btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
1328 gf_w32_bytwo_b_sse_region_2_xor(&rd, btd);
1330 gf_w32_bytwo_b_sse_region_2_noxor(&rd, btd);
1332 gf_do_final_region_alignment(&rd);
1336 s8 = (uint8_t *) rd.s_start;
1337 d8 = (uint8_t *) rd.d_start;
1339 pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
1340 m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
1341 m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);
1343 while (d8 < (uint8_t *) rd.d_top) {
1344 va = _mm_load_si128 ((__m128i *)(s8));
1345 vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8));
1348 if (itb & 1) vb = _mm_xor_si128(vb, va);
1350 if (itb == 0) break;
1351 SSE_AB2(pp, m1, m2, va, t1, t2);
1353 _mm_store_si128((__m128i *)d8, vb);
1358 gf_do_final_region_alignment(&rd);
1363 int gf_w32_bytwo_init(gf_t *gf)
1366 uint64_t ip, m1, m2;
1367 struct gf_w32_bytwo_data *btd;
1369 h = (gf_internal_t *) gf->scratch;
1370 btd = (struct gf_w32_bytwo_data *) (h->private);
1371 ip = h->prim_poly & 0xffffffff;
1379 btd->prim_poly |= ip;
1382 ip <<= GF_FIELD_WIDTH;
1383 m1 <<= GF_FIELD_WIDTH;
1384 m2 <<= GF_FIELD_WIDTH;
1387 if (h->mult_type == GF_MULT_BYTWO_p) {
1388 SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
1390 if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
1391 SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_sse_multiply_region)
1394 SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region)
1395 if(h->region_type & GF_REGION_SIMD)
1401 SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_b_multiply)
1403 if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
1404 SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_sse_multiply_region)
1407 SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_nosse_multiply_region)
1408 if(h->region_type & GF_REGION_SIMD)
1415 SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
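/* SPLIT 8,8: seven 256x256 tables, where tables[i+j][x][y] is the (already
   reduced) product of byte x of a, weighted by 2^(8i), and byte y of b,
   weighted by 2^(8j).  A full 32x32 multiplication is then the XOR of 16
   table lookups, as in gf_w32_split_8_8_multiply below; the tables themselves
   are built in gf_w32_split_init. */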
1422 gf_w32_split_8_8_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
1424 uint32_t product, i, j, mask, tb;
1426 struct gf_w32_split_8_8_data *d8;
1428 h = (gf_internal_t *) gf->scratch;
1429 d8 = (struct gf_w32_split_8_8_data *) h->private;
1433 for (i = 0; i < 4; i++) {
1435 for (j = 0; j < 4; j++) {
1436 product ^= d8->tables[i+j][a32&mask][tb&mask];
1447 gf_w32_split_8_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
1450 uint32_t *s32, *d32, *top, p, a, v;
1451 struct gf_split_8_32_lazy_data *d8;
1452 struct gf_w32_split_8_8_data *d88;
1454 int i, j, k, change;
1458 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1459 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1461 h = (gf_internal_t *) gf->scratch;
1462 if (h->arg1 == 32 || h->arg2 == 32 || h->mult_type == GF_MULT_DEFAULT) {
1463 d8 = (struct gf_split_8_32_lazy_data *) h->private;
1464 for (i = 0; i < 4; i++) t[i] = d8->tables[i];
1465 change = (val != d8->last_value);
1466 if (change) d8->last_value = val;
1468 d88 = (struct gf_w32_split_8_8_data *) h->private;
1469 for (i = 0; i < 4; i++) t[i] = d88->region_tables[i];
1470 change = (val != d88->last_value);
1471 if (change) d88->last_value = val;
1475 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
1476 gf_do_initial_region_alignment(&rd);
1478 s32 = (uint32_t *) rd.s_start;
1479 d32 = (uint32_t *) rd.d_start;
1480 top = (uint32_t *) rd.d_top;
1484 for (i = 0; i < 4; i++) {
1486 for (j = 1; j < 256; j <<= 1) {
1487 for (k = 0; k < j; k++) {
1488 t[i][k^j] = (v ^ t[i][k]);
1490 v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
1496 p = (xor) ? *d32 : 0;
1509 gf_do_final_region_alignment(&rd);
1515 gf_w32_split_16_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
1518 uint32_t *s32, *d32, *top, p, a, v;
1519 struct gf_split_16_32_lazy_data *d16;
1521 int i, j, k, change;
1525 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1526 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1528 h = (gf_internal_t *) gf->scratch;
1529 d16 = (struct gf_split_16_32_lazy_data *) h->private;
1530 for (i = 0; i < 2; i++) t[i] = d16->tables[i];
1531 change = (val != d16->last_value);
1532 if (change) d16->last_value = val;
1536 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
1537 gf_do_initial_region_alignment(&rd);
1539 s32 = (uint32_t *) rd.s_start;
1540 d32 = (uint32_t *) rd.d_start;
1541 top = (uint32_t *) rd.d_top;
1545 for (i = 0; i < 2; i++) {
1547 for (j = 1; j < (1 << 16); j <<= 1) {
1548 for (k = 0; k < j; k++) {
1549 t[i][k^j] = (v ^ t[i][k]);
1551 v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
1557 p = (xor) ? *d32 : 0;
1560 while (a != 0 && i < 2) {
1570 gf_do_final_region_alignment(&rd);
1575 gf_w32_split_2_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
1578 struct gf_split_2_32_lazy_data *ld;
1580 uint32_t pp, v, v2, s, *s32, *d32, *top;
1583 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1584 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1586 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
1587 gf_do_initial_region_alignment(&rd);
1589 h = (gf_internal_t *) gf->scratch;
1592 ld = (struct gf_split_2_32_lazy_data *) h->private;
1594 if (ld->last_value != val) {
1596 for (i = 0; i < 16; i++) {
1598 if (v & GF_FIRST_BIT) v2 ^= pp;
1599 ld->tables[i][0] = 0;
1600 ld->tables[i][1] = v;
1601 ld->tables[i][2] = v2;
1602 ld->tables[i][3] = (v2 ^ v);
1604 if (v2 & GF_FIRST_BIT) v ^= pp;
1607 ld->last_value = val;
1609 s32 = (uint32_t *) rd.s_start;
1610 d32 = (uint32_t *) rd.d_start;
1611 top = (uint32_t *) rd.d_top;
1613 while (d32 != top) {
1614 v = (xor) ? *d32 : 0;
1618 v ^= ld->tables[i][s&3];
1626 gf_do_final_region_alignment(&rd);
1632 gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
1636 uint32_t pp, v, v2, *s32, *d32, *top;
1637 __m128i vi, si, pi, shuffler, tables[16], adder, xi, mask1, mask2;
1640 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1641 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1643 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
1644 gf_do_initial_region_alignment(&rd);
1646 h = (gf_internal_t *) gf->scratch;
1649 s32 = (uint32_t *) rd.s_start;
1650 d32 = (uint32_t *) rd.d_start;
1651 top = (uint32_t *) rd.d_top;
1654 for (i = 0; i < 16; i++) {
1656 if (v & GF_FIRST_BIT) v2 ^= pp;
1657 tables[i] = _mm_set_epi32(v2 ^ v, v2, v, 0);
1659 if (v2 & GF_FIRST_BIT) v ^= pp;
1662 shuffler = _mm_set_epi8(0xc, 0xc, 0xc, 0xc, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
1663 adder = _mm_set_epi8(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0);
1664 mask1 = _mm_set1_epi8(0x3);
1665 mask2 = _mm_set1_epi8(0xc);
1667 while (d32 != top) {
1668 pi = (xor) ? _mm_load_si128 ((__m128i *) d32) : _mm_setzero_si128();
1669 vi = _mm_load_si128((__m128i *) s32);
1672 for (i = 0; i < 4; i++) {
1673 si = _mm_shuffle_epi8(vi, shuffler);
1675 xi = _mm_and_si128(si, mask1);
1676 xi = _mm_slli_epi16(xi, 2);
1677 xi = _mm_xor_si128(xi, adder);
1678 pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
1681 xi = _mm_and_si128(si, mask2);
1682 xi = _mm_xor_si128(xi, adder);
1683 pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
1684 si = _mm_srli_epi16(si, 2);
1687 xi = _mm_and_si128(si, mask2);
1688 xi = _mm_xor_si128(xi, adder);
1689 pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
1690 si = _mm_srli_epi16(si, 2);
1693 xi = _mm_and_si128(si, mask2);
1694 xi = _mm_xor_si128(xi, adder);
1695 pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
1698 vi = _mm_srli_epi32(vi, 8);
1700 _mm_store_si128((__m128i *) d32, pi);
1705 gf_do_final_region_alignment(&rd);
1712 gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
1715 struct gf_split_4_32_lazy_data *ld;
1717 uint32_t pp, v, s, *s32, *d32, *top;
1720 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1721 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1723 h = (gf_internal_t *) gf->scratch;
1726 ld = (struct gf_split_4_32_lazy_data *) h->private;
1728 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
1729 gf_do_initial_region_alignment(&rd);
1731 if (ld->last_value != val) {
1733 for (i = 0; i < 8; i++) {
1734 ld->tables[i][0] = 0;
1735 for (j = 1; j < 16; j <<= 1) {
1736 for (k = 0; k < j; k++) {
1737 ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
1739 v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
1743 ld->last_value = val;
1745 s32 = (uint32_t *) rd.s_start;
1746 d32 = (uint32_t *) rd.d_start;
1747 top = (uint32_t *) rd.d_top;
1749 while (d32 != top) {
1750 v = (xor) ? *d32 : 0;
1754 v ^= ld->tables[i][s&0xf];
1762 gf_do_final_region_alignment(&rd);
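/* The SSSE3 variants below build eight lazy nibble tables (16 products each)
   and re-slice every table into four 16-byte PSHUFB tables, one per output
   byte, so _mm_shuffle_epi8 can look up 16 nibbles at a time.  The ALTMAP
   version assumes the region stores the four bytes of each word split across
   four consecutive 16-byte vectors (byte planes), which is what lets one
   shuffle per table serve 16 words. */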
1768 gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
1772 uint32_t pp, v, *s32, *d32, *top;
1773 __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3;
1774 struct gf_split_4_32_lazy_data *ld;
1778 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1779 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1781 h = (gf_internal_t *) gf->scratch;
1784 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64);
1785 gf_do_initial_region_alignment(&rd);
1787 s32 = (uint32_t *) rd.s_start;
1788 d32 = (uint32_t *) rd.d_start;
1789 top = (uint32_t *) rd.d_top;
1791 ld = (struct gf_split_4_32_lazy_data *) h->private;
1794 for (i = 0; i < 8; i++) {
1795 ld->tables[i][0] = 0;
1796 for (j = 1; j < 16; j <<= 1) {
1797 for (k = 0; k < j; k++) {
1798 ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
1800 v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
1802 for (j = 0; j < 4; j++) {
1803 for (k = 0; k < 16; k++) {
1804 btable[k] = (uint8_t) ld->tables[i][k];
1805 ld->tables[i][k] >>= 8;
1807 tables[i][j] = _mm_loadu_si128((__m128i *) btable);
1811 mask1 = _mm_set1_epi8(0xf);
1814 while (d32 != top) {
1815 p0 = _mm_load_si128 ((__m128i *) d32);
1816 p1 = _mm_load_si128 ((__m128i *) (d32+4));
1817 p2 = _mm_load_si128 ((__m128i *) (d32+8));
1818 p3 = _mm_load_si128 ((__m128i *) (d32+12));
1820 v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
1821 v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
1822 v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
1823 v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
1825 si = _mm_and_si128(v0, mask1);
1826 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
1827 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
1828 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
1829 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));
1831 v0 = _mm_srli_epi32(v0, 4);
1832 si = _mm_and_si128(v0, mask1);
1833 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
1834 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
1835 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
1836 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
1838 si = _mm_and_si128(v1, mask1);
1839 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
1840 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
1841 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
1842 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
1844 v1 = _mm_srli_epi32(v1, 4);
1845 si = _mm_and_si128(v1, mask1);
1846 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
1847 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
1848 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
1849 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
1851 si = _mm_and_si128(v2, mask1);
1852 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
1853 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
1854 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
1855 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
1857 v2 = _mm_srli_epi32(v2, 4);
1858 si = _mm_and_si128(v2, mask1);
1859 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
1860 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
1861 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
1862 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
1864 si = _mm_and_si128(v3, mask1);
1865 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si));
1866 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si));
1867 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si));
1868 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si));
1870 v3 = _mm_srli_epi32(v3, 4);
1871 si = _mm_and_si128(v3, mask1);
1872 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
1873 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
1874 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
1875 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
1877 _mm_store_si128((__m128i *) d32, p0);
1878 _mm_store_si128((__m128i *) (d32+4), p1);
1879 _mm_store_si128((__m128i *) (d32+8), p2);
1880 _mm_store_si128((__m128i *) (d32+12), p3);
1884 while (d32 != top) {
1886 v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
1887 v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
1888 v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
1889 v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
1891 si = _mm_and_si128(v0, mask1);
1892 p0 = _mm_shuffle_epi8(tables[0][0], si);
1893 p1 = _mm_shuffle_epi8(tables[0][1], si);
1894 p2 = _mm_shuffle_epi8(tables[0][2], si);
1895 p3 = _mm_shuffle_epi8(tables[0][3], si);
1897 v0 = _mm_srli_epi32(v0, 4);
1898 si = _mm_and_si128(v0, mask1);
1899 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
1900 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
1901 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
1902 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
1904 si = _mm_and_si128(v1, mask1);
1905 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
1906 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
1907 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
1908 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
1910 v1 = _mm_srli_epi32(v1, 4);
1911 si = _mm_and_si128(v1, mask1);
1912 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
1913 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
1914 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
1915 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
1917 si = _mm_and_si128(v2, mask1);
1918 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
1919 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
1920 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
1921 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
1923 v2 = _mm_srli_epi32(v2, 4);
1924 si = _mm_and_si128(v2, mask1);
1925 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
1926 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
1927 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
1928 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
1930 si = _mm_and_si128(v3, mask1);
1931 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si));
1932 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si));
1933 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si));
1934 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si));
1936 v3 = _mm_srli_epi32(v3, 4);
1937 si = _mm_and_si128(v3, mask1);
1938 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
1939 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
1940 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
1941 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
1943 _mm_store_si128((__m128i *) d32, p0);
1944 _mm_store_si128((__m128i *) (d32+4), p1);
1945 _mm_store_si128((__m128i *) (d32+8), p2);
1946 _mm_store_si128((__m128i *) (d32+12), p3);
1951 gf_do_final_region_alignment(&rd);
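/* Same computation as the ALTMAP routine above, but for the standard word
   layout: the _mm_srli/_mm_packus sequences first gather the 64 input bytes
   into byte planes, the nibble lookups proceed exactly as above, and the
   _mm_unpack sequences at the end interleave the four product planes back
   into ordinary 32-bit words before they are stored. */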
1959 gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
1963 uint32_t pp, v, *s32, *d32, *top, tmp_table[16];
1964 __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8;
1965 __m128i tv1, tv2, tv3, tv0;
1969 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1970 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1972 h = (gf_internal_t *) gf->scratch;
1975 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64);
1976 gf_do_initial_region_alignment(&rd);
1978 s32 = (uint32_t *) rd.s_start;
1979 d32 = (uint32_t *) rd.d_start;
1980 top = (uint32_t *) rd.d_top;
1983 for (i = 0; i < 8; i++) {
1985 for (j = 1; j < 16; j <<= 1) {
1986 for (k = 0; k < j; k++) {
1987 tmp_table[k^j] = (v ^ tmp_table[k]);
1989 v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
1991 for (j = 0; j < 4; j++) {
1992 for (k = 0; k < 16; k++) {
1993 btable[k] = (uint8_t) tmp_table[k];
1996 tables[i][j] = _mm_loadu_si128((__m128i *) btable);
2000 mask1 = _mm_set1_epi8(0xf);
2001 mask8 = _mm_set1_epi16(0xff);
2004 while (d32 != top) {
2005 v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
2006 v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
2007 v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
2008 v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
2010 p0 = _mm_srli_epi16(v0, 8);
2011 p1 = _mm_srli_epi16(v1, 8);
2012 p2 = _mm_srli_epi16(v2, 8);
2013 p3 = _mm_srli_epi16(v3, 8);
2015 tv0 = _mm_and_si128(v0, mask8);
2016 tv1 = _mm_and_si128(v1, mask8);
2017 tv2 = _mm_and_si128(v2, mask8);
2018 tv3 = _mm_and_si128(v3, mask8);
2020 v0 = _mm_packus_epi16(p1, p0);
2021 v1 = _mm_packus_epi16(tv1, tv0);
2022 v2 = _mm_packus_epi16(p3, p2);
2023 v3 = _mm_packus_epi16(tv3, tv2);
2025 p0 = _mm_srli_epi16(v0, 8);
2026 p1 = _mm_srli_epi16(v1, 8);
2027 p2 = _mm_srli_epi16(v2, 8);
2028 p3 = _mm_srli_epi16(v3, 8);
2030 tv0 = _mm_and_si128(v0, mask8);
2031 tv1 = _mm_and_si128(v1, mask8);
2032 tv2 = _mm_and_si128(v2, mask8);
2033 tv3 = _mm_and_si128(v3, mask8);
2035 v0 = _mm_packus_epi16(p2, p0);
2036 v1 = _mm_packus_epi16(p3, p1);
2037 v2 = _mm_packus_epi16(tv2, tv0);
2038 v3 = _mm_packus_epi16(tv3, tv1);
2040 si = _mm_and_si128(v0, mask1);
2041 p0 = _mm_shuffle_epi8(tables[6][0], si);
2042 p1 = _mm_shuffle_epi8(tables[6][1], si);
2043 p2 = _mm_shuffle_epi8(tables[6][2], si);
2044 p3 = _mm_shuffle_epi8(tables[6][3], si);
2046 v0 = _mm_srli_epi32(v0, 4);
2047 si = _mm_and_si128(v0, mask1);
2048 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
2049 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
2050 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
2051 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
2053 si = _mm_and_si128(v1, mask1);
2054 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
2055 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
2056 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
2057 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
2059 v1 = _mm_srli_epi32(v1, 4);
2060 si = _mm_and_si128(v1, mask1);
2061 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
2062 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
2063 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
2064 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
2066 si = _mm_and_si128(v2, mask1);
2067 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
2068 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
2069 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
2070 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
2072 v2 = _mm_srli_epi32(v2, 4);
2073 si = _mm_and_si128(v2, mask1);
2074 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
2075 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
2076 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
2077 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
2079 si = _mm_and_si128(v3, mask1);
2080 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
2081 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
2082 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
2083 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));
2085 v3 = _mm_srli_epi32(v3, 4);
2086 si = _mm_and_si128(v3, mask1);
2087 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
2088 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
2089 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
2090 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
2092 tv0 = _mm_unpackhi_epi8(p1, p3);
2093 tv1 = _mm_unpackhi_epi8(p0, p2);
2094 tv2 = _mm_unpacklo_epi8(p1, p3);
2095 tv3 = _mm_unpacklo_epi8(p0, p2);
2097 p0 = _mm_unpackhi_epi8(tv1, tv0);
2098 p1 = _mm_unpacklo_epi8(tv1, tv0);
2099 p2 = _mm_unpackhi_epi8(tv3, tv2);
2100 p3 = _mm_unpacklo_epi8(tv3, tv2);
2102 v0 = _mm_load_si128 ((__m128i *) d32);
2103 v1 = _mm_load_si128 ((__m128i *) (d32+4));
2104 v2 = _mm_load_si128 ((__m128i *) (d32+8));
2105 v3 = _mm_load_si128 ((__m128i *) (d32+12));
2107 p0 = _mm_xor_si128(p0, v0);
2108 p1 = _mm_xor_si128(p1, v1);
2109 p2 = _mm_xor_si128(p2, v2);
2110 p3 = _mm_xor_si128(p3, v3);
2112 _mm_store_si128((__m128i *) d32, p0);
2113 _mm_store_si128((__m128i *) (d32+4), p1);
2114 _mm_store_si128((__m128i *) (d32+8), p2);
2115 _mm_store_si128((__m128i *) (d32+12), p3);
2119 while (d32 != top) {
2120 v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
2121 v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
2122 v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
2123 v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
2125 p0 = _mm_srli_epi16(v0, 8);
2126 p1 = _mm_srli_epi16(v1, 8);
2127 p2 = _mm_srli_epi16(v2, 8);
2128 p3 = _mm_srli_epi16(v3, 8);
2130 tv0 = _mm_and_si128(v0, mask8);
2131 tv1 = _mm_and_si128(v1, mask8);
2132 tv2 = _mm_and_si128(v2, mask8);
2133 tv3 = _mm_and_si128(v3, mask8);
2135 v0 = _mm_packus_epi16(p1, p0);
2136 v1 = _mm_packus_epi16(tv1, tv0);
2137 v2 = _mm_packus_epi16(p3, p2);
2138 v3 = _mm_packus_epi16(tv3, tv2);
2140 p0 = _mm_srli_epi16(v0, 8);
2141 p1 = _mm_srli_epi16(v1, 8);
2142 p2 = _mm_srli_epi16(v2, 8);
2143 p3 = _mm_srli_epi16(v3, 8);
2145 tv0 = _mm_and_si128(v0, mask8);
2146 tv1 = _mm_and_si128(v1, mask8);
2147 tv2 = _mm_and_si128(v2, mask8);
2148 tv3 = _mm_and_si128(v3, mask8);
2150 v0 = _mm_packus_epi16(p2, p0);
2151 v1 = _mm_packus_epi16(p3, p1);
2152 v2 = _mm_packus_epi16(tv2, tv0);
2153 v3 = _mm_packus_epi16(tv3, tv1);
2155 si = _mm_and_si128(v0, mask1);
2156 p0 = _mm_shuffle_epi8(tables[6][0], si);
2157 p1 = _mm_shuffle_epi8(tables[6][1], si);
2158 p2 = _mm_shuffle_epi8(tables[6][2], si);
2159 p3 = _mm_shuffle_epi8(tables[6][3], si);
2161 v0 = _mm_srli_epi32(v0, 4);
2162 si = _mm_and_si128(v0, mask1);
2163 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
2164 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
2165 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
2166 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
2168 si = _mm_and_si128(v1, mask1);
2169 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
2170 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
2171 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
2172 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
2174 v1 = _mm_srli_epi32(v1, 4);
2175 si = _mm_and_si128(v1, mask1);
2176 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
2177 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
2178 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
2179 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
2181 si = _mm_and_si128(v2, mask1);
2182 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
2183 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
2184 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
2185 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
2187 v2 = _mm_srli_epi32(v2, 4);
2188 si = _mm_and_si128(v2, mask1);
2189 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
2190 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
2191 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
2192 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
2194 si = _mm_and_si128(v3, mask1);
2195 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
2196 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
2197 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
2198 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));
2200 v3 = _mm_srli_epi32(v3, 4);
2201 si = _mm_and_si128(v3, mask1);
2202 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
2203 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
2204 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
2205 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
2207 tv0 = _mm_unpackhi_epi8(p1, p3);
2208 tv1 = _mm_unpackhi_epi8(p0, p2);
2209 tv2 = _mm_unpacklo_epi8(p1, p3);
2210 tv3 = _mm_unpacklo_epi8(p0, p2);
2212 p0 = _mm_unpackhi_epi8(tv1, tv0);
2213 p1 = _mm_unpacklo_epi8(tv1, tv0);
2214 p2 = _mm_unpackhi_epi8(tv3, tv2);
2215 p3 = _mm_unpacklo_epi8(tv3, tv2);
2217 _mm_store_si128((__m128i *) d32, p0);
2218 _mm_store_si128((__m128i *) (d32+4), p1);
2219 _mm_store_si128((__m128i *) (d32+8), p2);
2220 _mm_store_si128((__m128i *) (d32+12), p3);
2224 gf_do_final_region_alignment(&rd);
2229 int gf_w32_split_init(gf_t *gf)
2232 struct gf_split_2_32_lazy_data *ld2;
2233 struct gf_split_4_32_lazy_data *ld4;
2234 struct gf_w32_split_8_8_data *d8;
2235 struct gf_split_8_32_lazy_data *d32;
2236 struct gf_split_16_32_lazy_data *d16;
2240 h = (gf_internal_t *) gf->scratch;
2244 SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
2246 /* JSP: First handle single multiplication:
2247 If args == 8, then we're doing split 8 8.
2248 Otherwise, if PCLMUL, we use that.
2249 Otherwise, we use bytwo_p.
2252 if (h->arg1 == 8 && h->arg2 == 8) {
2253 SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply)
2254 #if defined(INTEL_SSE4_PCLMUL)
2255 } else if (gf_cpu_supports_intel_pclmul) {
2256 if ((0xfffe0000 & h->prim_poly) == 0){
2257 SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
2258 } else if ((0xffc00000 & h->prim_poly) == 0){
2259 SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
2260 } else if ((0xfe000000 & h->prim_poly) == 0){
2261 SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
2265 SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
2268 /* Easy cases: 16/32 and 2/32 */
2270 if ((h->arg1 == 16 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 16)) {
2271 d16 = (struct gf_split_16_32_lazy_data *) h->private;
2272 d16->last_value = 0;
2273 SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_16_32_lazy_multiply_region)
2277 if ((h->arg1 == 2 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 2)) {
2278 ld2 = (struct gf_split_2_32_lazy_data *) h->private;
2279 ld2->last_value = 0;
2281 if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) {
2282 SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_sse_multiply_region)
2285 SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_multiply_region)
2286 if(h->region_type & GF_REGION_SIMD) return 0;
2293 /* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */
2296 if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) ||
2297 ((gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) && h->mult_type == GF_MULT_DEFAULT)) {
2298 ld4 = (struct gf_split_4_32_lazy_data *) h->private;
2299 ld4->last_value = 0;
2300 if ((h->region_type & GF_REGION_NOSIMD) || !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) {
2301 SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_multiply_region)
2302 } else if (gf_cpu_supports_arm_neon) {
2304 gf_w32_neon_split_init(gf);
2306 } else if (h->region_type & GF_REGION_ALTMAP) {
2308 SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_altmap_multiply_region)
2312 SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_multiply_region)
2318 /* 8/32 or Default + no SSE */
2320 if ((h->arg1 == 8 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 8) ||
2321 h->mult_type == GF_MULT_DEFAULT) {
2322 d32 = (struct gf_split_8_32_lazy_data *) h->private;
2323 d32->last_value = 0;
2324 SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_8_32_lazy_multiply_region)
2328 /* Finally, if args == 8, then we have to set up the tables here. */
2330 if (h->arg1 == 8 && h->arg2 == 8) {
2331 d8 = (struct gf_w32_split_8_8_data *) h->private;
2333 SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply)
2334 SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_8_32_lazy_multiply_region)
2336 for (exp = 0; exp < 7; exp++) {
2337 for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0;
2338 for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0;
2339 d8->tables[exp][1][1] = basep;
2340 for (i = 2; i < 256; i++) {
2342 p = d8->tables[exp][i^1][1];
2343 d8->tables[exp][i][1] = p ^ basep;
2345 p = d8->tables[exp][i>>1][1];
2346 d8->tables[exp][i][1] = GF_MULTBY_TWO(p);
2349 for (i = 1; i < 256; i++) {
2350 p = d8->tables[exp][i][1];
2351 for (j = 1; j < 256; j++) {
2353 d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p;
2355 d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]);
2359 for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep);
2364 /* If we get here, then the arguments were bad. */
2370 int gf_w32_group_init(gf_t *gf)
2372 uint32_t i, j, p, index;
2373 struct gf_w32_group_data *gd;
2374 gf_internal_t *h = (gf_internal_t *) gf->scratch;
2380 gd = (struct gf_w32_group_data *) h->private;
2381 gd->shift = (uint32_t *) (&(gd->memory));
2382 gd->reduce = gd->shift + (1 << g_s);
2384 gd->rmask = (1 << g_r) - 1;
2387 gd->tshift = 32 % g_s;
2388 if (gd->tshift == 0) gd->tshift = g_s;
2389 gd->tshift = (32 - gd->tshift);
2390 gd->tshift = ((gd->tshift-1)/g_r) * g_r;
2393 for (i = 0; i < ((uint32_t)1 << g_r); i++) {
p = 0;
index = 0;
2396 for (j = 0; j < g_r; j++) {
if (i & (1 << j)) {
2398 p ^= (h->prim_poly << j);
index ^= (1 << j);
2400 index ^= (h->prim_poly >> (32-j));
}
}
2403 gd->reduce[index] = p;
}
if (g_s == g_r) {
2407 SET_FUNCTION(gf,multiply,w32,gf_w32_group_s_equals_r_multiply)
2408 SET_FUNCTION(gf,multiply_region,w32,gf_w32_group_s_equals_r_multiply_region)
} else {
2410 SET_FUNCTION(gf,multiply,w32,gf_w32_group_multiply)
2411 SET_FUNCTION(gf,multiply_region,w32,gf_w32_group_multiply_region)
}
2413 SET_FUNCTION(gf,divide,w32,NULL)
2414 SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
return 1;
}
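/*
 * Note on the group method (rough sketch, not a formal spec): gd->shift is a
 * table of 1 << g_s entries that each multiply fills with the carry-less
 * products of the first operand by every g_s-bit value; the second operand is
 * then consumed g_s bits at a time, shifting the accumulator and XORing in
 * shift-table entries.  Bits that overflow past position 31 are folded back
 * g_r bits at a time using gd->reduce, which the loop above precomputes from
 * the primitive polynomial.
 */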
2422 gf_w32_composite_multiply_recursive(gf_t *gf, uint32_t a, uint32_t b)
2424 gf_internal_t *h = (gf_internal_t *) gf->scratch;
2425 gf_t *base_gf = h->base_gf;
2426 uint32_t b0 = b & 0x0000ffff;
2427 uint32_t b1 = (b & 0xffff0000) >> 16;
2428 uint32_t a0 = a & 0x0000ffff;
2429 uint32_t a1 = (a & 0xffff0000) >> 16;
uint32_t a1b1, rv;
2432 a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
2434 rv = ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16) | (base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1);
return rv;
}
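/*
 * For reference, the algebra implemented above: with a = a1*x + a0 and
 * b = b1*x + b0 over GF(2^16), and the composite polynomial
 * p(x) = x^2 + s*x + 1 (s = h->prim_poly),
 *
 *   a*b = a1b1*x^2 + (a1b0 + a0b1)*x + a0b0
 *       = (a1b0 + a0b1 + s*a1b1)*x + (a0b0 + a1b1),   since x^2 = s*x + 1,
 *
 * which is exactly the high and low 16-bit halves assembled into rv.
 */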
2438 /* JSP: This could be made faster. Someday, when I'm bored. */
2442 gf_w32_composite_multiply_inline(gf_t *gf, uint32_t a, uint32_t b)
2444 gf_internal_t *h = (gf_internal_t *) gf->scratch;
2445 uint32_t b0 = b & 0x0000ffff;
2446 uint32_t b1 = b >> 16;
2447 uint32_t a0 = a & 0x0000ffff;
2448 uint32_t a1 = a >> 16;
2449 uint32_t a1b1, prod;
2450 uint16_t *log, *alog;
2451 struct gf_w32_composite_data *cd;
2453 cd = (struct gf_w32_composite_data *) h->private;
log = cd->log;
alog = cd->alog;
2457 a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1);
2458 prod = GF_W16_INLINE_MULT(log, alog, a1, b0);
2459 prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1);
2460 prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly);
prod <<= 16;
2462 prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0);
prod ^= a1b1;
return prod;
}
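/* GF_W16_INLINE_MULT resolves each base-field product with a log/antilog
   table lookup, which is why gf_w32_composite_init below prefers this routine
   whenever the 16-bit base field exposes its log tables. */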
2468 * Composite field division trick (explained in 2007 tech report)
2470 * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1
2474 * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0)
2476 * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1
2478 * let d = b1c1 and d+1 = b0c0
2480 * solve s*b1c1+b1c0+b0c1 = 0
2482 * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
2492 gf_w32_composite_inverse(gf_t *gf, uint32_t a)
2494 gf_internal_t *h = (gf_internal_t *) gf->scratch;
2495 gf_t *base_gf = h->base_gf;
2496 uint16_t a0 = a & 0x0000ffff;
2497 uint16_t a1 = (a & 0xffff0000) >> 16;
2498 uint16_t c0, c1, d, tmp;
uint32_t c;
2500 uint16_t a0inv, a1inv;
if (a0 == 0) {
2503 a1inv = base_gf->inverse.w32(base_gf, a1);
2504 c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
c1 = a1inv;
2506 } else if (a1 == 0) {
2507 c0 = base_gf->inverse.w32(base_gf, a0);
c1 = 0;
} else {
2510 a1inv = base_gf->inverse.w32(base_gf, a1);
2511 a0inv = base_gf->inverse.w32(base_gf, a0);
2513 d = base_gf->multiply.w32(base_gf, a1, a0inv);
2515 tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly);
2516 tmp = base_gf->inverse.w32(base_gf, tmp);
2518 d = base_gf->multiply.w32(base_gf, d, tmp);
2520 c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv);
2521 c1 = base_gf->multiply.w32(base_gf, d, a1inv);
}
2524 c = c0 | (c1 << 16);
return c;
}
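/*
 * Illustrative self-check (not part of the library): for any nonzero a in a
 * composite w=32 field, multiply(a, inverse(a)) should be 1.  The .w32
 * function pointers are the standard gf_complete entry points; the trial
 * count is arbitrary.
 */
#if 0
static int gf_w32_composite_inverse_check(gf_t *gf, uint32_t trials)
{
  uint32_t a, inv;

  for (a = 1; a <= trials; a++) {
    inv = gf->inverse.w32(gf, a);
    if (gf->multiply.w32(gf, a, inv) != 1) return 0;   /* failure */
  }
  return 1;                                            /* all trials passed */
}
#endif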
2531 gf_w32_composite_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
2533 gf_internal_t *h = (gf_internal_t *) gf->scratch;
2534 gf_t *base_gf = h->base_gf;
2535 uint32_t b0 = val & 0x0000ffff;
2536 uint32_t b1 = (val & 0xffff0000) >> 16;
2537 uint32_t *s32, *d32, *top;
2538 uint16_t a0, a1, a1b1, *log, *alog;
uint32_t prod;
gf_region_data rd;
2541 struct gf_w32_composite_data *cd;
2543 cd = (struct gf_w32_composite_data *) h->private;
2547 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
2548 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
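/* What follows are four variants of the same per-word loop: base-field
   function calls (used when the 16-bit base field has no log table) versus
   inline log/antilog lookups, each in an xor-accumulate and an overwrite
   form. */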
2557 a0 = *s32 & 0x0000ffff;
2558 a1 = (*s32 & 0xffff0000) >> 16;
2559 a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
2561 *d32 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
2562 ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16));
2568 a0 = *s32 & 0x0000ffff;
2569 a1 = (*s32 & 0xffff0000) >> 16;
2570 a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
2572 *d32 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
2573 ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16));
2581 a0 = *s32 & 0x0000ffff;
2582 a1 = (*s32 & 0xffff0000) >> 16;
2583 a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1);
2585 prod = GF_W16_INLINE_MULT(log, alog, a1, b0);
2586 prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1);
2587 prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly);
2589 prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0);
2597 a0 = *s32 & 0x0000ffff;
2598 a1 = (*s32 & 0xffff0000) >> 16;
2599 a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1);
2601 prod = GF_W16_INLINE_MULT(log, alog, a1, b0);
2602 prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1);
2603 prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly);
2605 prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0);
2618 gf_w32_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
2620 gf_internal_t *h = (gf_internal_t *) gf->scratch;
2621 gf_t *base_gf = h->base_gf;
2622 uint16_t val0 = val & 0x0000ffff;
2623 uint16_t val1 = (val & 0xffff0000) >> 16;
gf_region_data rd;
int sub_reg_size;
2626 uint8_t *slow, *shigh;
2627 uint8_t *dlow, *dhigh, *top;
2629 /* JSP: I want the two pointers aligned wrt each other on 16 byte
2630 boundaries. So I'm going to make sure that the area on
2631 which the two operate is a multiple of 32. Of course, that
2632 junks up the mapping, but so be it -- that's why we have extract_word.... */
2634 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
2635 gf_do_initial_region_alignment(&rd);
2637 slow = (uint8_t *) rd.s_start;
2638 dlow = (uint8_t *) rd.d_start;
2639 top = (uint8_t *) rd.d_top;
2640 sub_reg_size = (top - dlow)/2;
2641 shigh = slow + sub_reg_size;
2642 dhigh = dlow + sub_reg_size;
2644 base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor);
2645 base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1);
2646 base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor);
2647 base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
2648 base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);
2650 gf_do_final_region_alignment(&rd);
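/* The five base-field region multiplies above assemble the same product as
   gf_w32_composite_multiply_region, but on the ALTMAP layout where the low
   and high 16-bit halves live in separate sub-regions:
     dlow  = a0*val0 ^ a1*val1
     dhigh = a0*val1 ^ a1*val0 ^ s*a1*val1   (s = h->prim_poly)           */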
2654 int gf_w32_composite_init(gf_t *gf)
2656 gf_internal_t *h = (gf_internal_t *) gf->scratch;
2657 struct gf_w32_composite_data *cd;
2659 if (h->base_gf == NULL) return 0;
2661 cd = (struct gf_w32_composite_data *) h->private;
2662 cd->log = gf_w16_get_log_table(h->base_gf);
2663 cd->alog = gf_w16_get_mult_alog_table(h->base_gf);
2665 if (h->region_type & GF_REGION_ALTMAP) {
2666 SET_FUNCTION(gf,multiply_region,w32,gf_w32_composite_multiply_region_alt)
2668 SET_FUNCTION(gf,multiply_region,w32,gf_w32_composite_multiply_region)
2671 if (cd->log == NULL) {
2672 SET_FUNCTION(gf,multiply,w32,gf_w32_composite_multiply_recursive)
2674 SET_FUNCTION(gf,multiply,w32,gf_w32_composite_multiply_inline)
2676 SET_FUNCTION(gf,divide,w32,NULL)
2677 SET_FUNCTION(gf,inverse,w32,gf_w32_composite_inverse)
return 1;
}
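/*
 * Illustrative usage sketch (not part of the library): building the composite
 * field GF((2^16)^2) through the public API.  gf_init_easy(), gf_init_hard()
 * and gf_free() are the standard gf_complete entry points; arg1 = 2 requests
 * a degree-2 extension of the 16-bit base field, and prim_poly = 0 lets the
 * library choose a default s for x^2 + s*x + 1.
 */
#if 0
static int gf_w32_composite_example(void)
{
  gf_t gf16, gf32;
  uint32_t c;

  if (gf_init_easy(&gf16, 16) == 0) return 0;
  if (gf_init_hard(&gf32, 32, GF_MULT_COMPOSITE, GF_REGION_DEFAULT,
                   GF_DIVIDE_DEFAULT, 0, 2, 0, &gf16, NULL) == 0) {
    gf_free(&gf16, 0);
    return 0;
  }
  c = gf32.multiply.w32(&gf32, 0x12345678, 0x9abcdef0);
  gf_free(&gf32, 0);
  gf_free(&gf16, 0);
  return c != 0;
}
#endif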
2684 int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
2688 case GF_MULT_BYTWO_p:
2689 case GF_MULT_BYTWO_b:
2690 return sizeof(gf_internal_t) + sizeof(struct gf_w32_bytwo_data) + 64;
case GF_MULT_GROUP:
2693 return sizeof(gf_internal_t) + sizeof(struct gf_w32_group_data) +
2694 sizeof(uint32_t) * (1 << arg1) +
2695 sizeof(uint32_t) * (1 << arg2) + 64;
2697 case GF_MULT_DEFAULT:
2699 case GF_MULT_SPLIT_TABLE:
2700 if (arg1 == 8 && arg2 == 8){
2701 return sizeof(gf_internal_t) + sizeof(struct gf_w32_split_8_8_data) + 64;
2703 if ((arg1 == 16 && arg2 == 32) || (arg2 == 16 && arg1 == 32)) {
2704 return sizeof(gf_internal_t) + sizeof(struct gf_split_16_32_lazy_data) + 64;
2706 if ((arg1 == 2 && arg2 == 32) || (arg2 == 2 && arg1 == 32)) {
2707 return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64;
2709 if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) ||
2710 (mult_type == GF_MULT_DEFAULT && !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon))) {
2711 return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64;
2713 if ((arg1 == 4 && arg2 == 32) ||
2714 (arg2 == 4 && arg1 == 32) ||
2715 mult_type == GF_MULT_DEFAULT) {
2716 return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64;
2719 case GF_MULT_CARRY_FREE:
2720 return sizeof(gf_internal_t);
2722 case GF_MULT_CARRY_FREE_GK:
2723 return sizeof(gf_internal_t) + sizeof(uint64_t)*2;
case GF_MULT_SHIFT:
2726 return sizeof(gf_internal_t);
2728 case GF_MULT_COMPOSITE:
2729 return sizeof(gf_internal_t) + sizeof(struct gf_w32_composite_data) + 64;
default:
return 0;
}
}
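/*
 * Illustrative sketch (not part of the library): the sizes returned above are
 * meant to be paired with caller-owned memory.  gf_scratch_size() and
 * gf_init_hard() are the standard gf_complete entry points; the SPLIT 8,8
 * arguments are just an example, error handling is minimal, and malloc/free
 * are assumed to be available through gf_int.h.
 */
#if 0
static int gf_w32_init_with_caller_memory(gf_t *gf)
{
  int sz;
  void *scratch;

  sz = gf_scratch_size(32, GF_MULT_SPLIT_TABLE, GF_REGION_DEFAULT,
                       GF_DIVIDE_DEFAULT, 8, 8);
  if (sz <= 0) return 0;
  scratch = malloc(sz);
  if (scratch == NULL) return 0;
  if (gf_init_hard(gf, 32, GF_MULT_SPLIT_TABLE, GF_REGION_DEFAULT,
                   GF_DIVIDE_DEFAULT, 0, 8, 8, NULL, scratch) == 0) {
    free(scratch);
    return 0;
  }
  return 1;   /* caller later calls gf_free(gf, 0) and then free(scratch) */
}
#endif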
2738 int gf_w32_init(gf_t *gf)
{
gf_internal_t *h;
2742 h = (gf_internal_t *) gf->scratch;
2744 /* Allen: set default primitive polynomial / irreducible polynomial if needed */
2746 if (h->prim_poly == 0) {
2747 if (h->mult_type == GF_MULT_COMPOSITE) {
2748 h->prim_poly = gf_composite_get_default_poly(h->base_gf);
2749 if (h->prim_poly == 0) return 0; /* This shouldn't happen */
} else {
2752 /* Allen: use the following primitive polynomial to make carryless multiply work more efficiently for GF(2^32).*/
2754 /* h->prim_poly = 0xc5; */
2756 /* Allen: The following is the traditional primitive polynomial for GF(2^32) */
2758 h->prim_poly = 0x400007;
}
}
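/* For reference: 0x400007 encodes x^22 + x^2 + x + 1 with the leading x^32
   term left implicit; the masking below likewise strips a leading term from
   any caller-supplied polynomial. */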
2762 /* No leading one */
2764 if(h->mult_type != GF_MULT_COMPOSITE) h->prim_poly &= 0xffffffff;
2766 SET_FUNCTION(gf,multiply,w32,NULL)
2767 SET_FUNCTION(gf,divide,w32,NULL)
2768 SET_FUNCTION(gf,inverse,w32,NULL)
2769 SET_FUNCTION(gf,multiply_region,w32,NULL)
2771 switch(h->mult_type) {
2772 case GF_MULT_CARRY_FREE: if (gf_w32_cfm_init(gf) == 0) return 0; break;
2773 case GF_MULT_CARRY_FREE_GK: if (gf_w32_cfmgk_init(gf) == 0) return 0; break;
2774 case GF_MULT_SHIFT: if (gf_w32_shift_init(gf) == 0) return 0; break;
2775 case GF_MULT_COMPOSITE: if (gf_w32_composite_init(gf) == 0) return 0; break;
2776 case GF_MULT_DEFAULT:
2777 case GF_MULT_SPLIT_TABLE: if (gf_w32_split_init(gf) == 0) return 0; break;
2778 case GF_MULT_GROUP: if (gf_w32_group_init(gf) == 0) return 0; break;
2779 case GF_MULT_BYTWO_p:
2780 case GF_MULT_BYTWO_b: if (gf_w32_bytwo_init(gf) == 0) return 0; break;
default: return 0;
}
2783 if (h->divide_type == GF_DIVIDE_EUCLID) {
2784 SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse)
2785 SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
2786 } else if (h->divide_type == GF_DIVIDE_MATRIX) {
2787 SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse)
2788 SET_FUNCTION(gf,inverse,w32,gf_w32_matrix)
}
2791 if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) {
2792 SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse)
}
2794 if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) {
2795 SET_FUNCTION(gf,inverse,w32,gf_w32_inverse_from_divide)
}
2797 if (h->region_type == GF_REGION_CAUCHY) {
2798 SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
2799 SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
2800 } else if (h->region_type & GF_REGION_ALTMAP) {
2801 if (h->mult_type == GF_MULT_COMPOSITE) {
2802 SET_FUNCTION(gf,extract_word,w32,gf_w32_composite_extract_word)
} else {
2804 SET_FUNCTION(gf,extract_word,w32,gf_w32_split_extract_word)
}
} else {
2807 SET_FUNCTION(gf,extract_word,w32,gf_w32_extract_word)