/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * Routines for 16-bit Galois fields
 */
#define AB2(ip, am1, am2, b, t1, t2) {\
  t1 = (b << 1) & am1; \
  t2 = b & am2; \
  t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
  b = (t1 ^ (t2 & ip));}
#define SSE_AB2(pp, m1, m2, va, t1, t2) {\
  t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
  t2 = _mm_and_si128(va, m2); \
  t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
  va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
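
/* AB2 and SSE_AB2 multiply every 16-bit word packed into a 64-bit (resp.
   128-bit) quantity by two in GF(2^16): t1 is the left-shifted value with the
   mask am1/m1 clearing bits that crossed a word boundary, t2 isolates each
   word's high bit with am2/m2, and the ((t2 << 1) - (t2 >> 15)) trick expands
   that bit into an all-ones word, which selects where the primitive
   polynomial ip/pp is XORed back in. */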
#define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf(" %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); }

#define GF_FIRST_BIT (1 << 15)
#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
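
/* Example, assuming the default primitive polynomial 0x1100b set in
   gf_w16_init(): GF_MULTBY_TWO(0x8000) = (0x8000 << 1) ^ 0x1100b
   = 0x10000 ^ 0x1100b = 0x100b, i.e. x^15 * x reduced by
   x^16 + x^12 + x^3 + x + 1. */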
gf_val_32_t gf_w16_inverse_from_divide (gf_t *gf, gf_val_32_t a)
  return gf->divide.w32(gf, 1, a);

gf_val_32_t gf_w16_divide_from_inverse (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
  b = gf->inverse.w32(gf, b);
  return gf->multiply.w32(gf, a, b);

gf_w16_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
  gf_do_initial_region_alignment(&rd);

  s16 = (uint16_t *) rd.s_start;
  d16 = (uint16_t *) rd.d_start;

  while (d16 < ((uint16_t *) rd.d_top)) {
    *d16 ^= gf->multiply.w32(gf, val, *s16);

  while (d16 < ((uint16_t *) rd.d_top)) {
    *d16 = gf->multiply.w32(gf, val, *s16);

  gf_do_final_region_alignment(&rd);
#if defined(INTEL_SSE4_PCLMUL)

gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  gf_internal_t * h = gf->scratch;
  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
  gf_do_initial_region_alignment(&rd);

  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);

  s16 = (uint16_t *) rd.s_start;
  d16 = (uint16_t *) rd.d_start;

  while (d16 < ((uint16_t *) rd.d_top)) {

    /* see gf_w16_clm_multiply() for an explanation of the method */

    b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
    result = _mm_clmulepi64_si128 (a, b, 0);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);

    *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));

  while (d16 < ((uint16_t *) rd.d_top)) {

    /* see gf_w16_clm_multiply() for an explanation of the method */

    b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
    result = _mm_clmulepi64_si128 (a, b, 0);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);

    *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0));

  gf_do_final_region_alignment(&rd);

#if defined(INTEL_SSE4_PCLMUL)

gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  gf_internal_t * h = gf->scratch;
  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
  gf_do_initial_region_alignment(&rd);

  s16 = (uint16_t *) rd.s_start;
  d16 = (uint16_t *) rd.d_start;

  while (d16 < ((uint16_t *) rd.d_top)) {

    /* see gf_w16_clm_multiply() for an explanation of the method */

    b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
    result = _mm_clmulepi64_si128 (a, b, 0);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);

    *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));

  while (d16 < ((uint16_t *) rd.d_top)) {

    /* see gf_w16_clm_multiply() for an explanation of the method */

    b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
    result = _mm_clmulepi64_si128 (a, b, 0);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);

    *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0));

  gf_do_final_region_alignment(&rd);

#if defined(INTEL_SSE4_PCLMUL)

gf_w16_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  gf_internal_t * h = gf->scratch;
  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
  gf_do_initial_region_alignment(&rd);

  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);

  s16 = (uint16_t *) rd.s_start;
  d16 = (uint16_t *) rd.d_start;

  while (d16 < ((uint16_t *) rd.d_top)) {

    /* see gf_w16_clm_multiply() for an explanation of the method */

    b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
    result = _mm_clmulepi64_si128 (a, b, 0);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);

    *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));

  while (d16 < ((uint16_t *) rd.d_top)) {

    /* see gf_w16_clm_multiply() for an explanation of the method */

    b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
    result = _mm_clmulepi64_si128 (a, b, 0);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);
    w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
    result = _mm_xor_si128 (result, w);

    *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0));

  gf_do_final_region_alignment(&rd);
gf_val_32_t gf_w16_euclid (gf_t *gf, gf_val_32_t b)
  gf_val_32_t e_i, e_im1, e_ip1;
  gf_val_32_t d_i, d_im1, d_ip1;
  gf_val_32_t y_i, y_im1, y_ip1;

  if (b == 0) return -1;
  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
  for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ;

  while (d_ip1 >= d_i) {
    c_i ^= (1 << (d_ip1 - d_i));
    e_ip1 ^= (e_i << (d_ip1 - d_i));
    if (e_ip1 == 0) return 0;
    while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;

  y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);

gf_val_32_t gf_w16_extract_word(gf_t *gf, void *start, int bytes, int index)
  r16 = (uint16_t *) start;

gf_val_32_t gf_w16_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
  h = (gf_internal_t *) gf->scratch;
  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
  r16 = (uint16_t *) start;
  if (r16 + index < (uint16_t *) rd.d_start) return r16[index];
  if (r16 + index >= (uint16_t *) rd.d_top) return r16[index];
  index -= (((uint16_t *) rd.d_start) - r16);
  r8 = (uint8_t *) rd.d_start;
  top = (uint8_t *) rd.d_top;
  sub_size = (top-r8)/2;

  a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index);
  b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index);
  return (a | (b << 8));

gf_val_32_t gf_w16_split_extract_word(gf_t *gf, void *start, int bytes, int index)
  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
  r16 = (uint16_t *) start;
  if (r16 + index < (uint16_t *) rd.d_start) return r16[index];
  if (r16 + index >= (uint16_t *) rd.d_top) return r16[index];
  index -= (((uint16_t *) rd.d_start) - r16);
  r8 = (uint8_t *) rd.d_start;
  r8 += ((index & 0xfffffff0)*2);

gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b)
  return gf_bitmatrix_inverse(b, 16, ((gf_internal_t *) (gf->scratch))->prim_poly);
/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm.  I only
   include it for completeness.  It does have the feature that it requires no
   extra memory. */
#if defined(INTEL_SSE4_PCLMUL)

gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
  gf_internal_t * h = gf->scratch;

  a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0);
  b = _mm_insert_epi32 (a, b16, 0);

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));

  /* Do the initial multiply */

  result = _mm_clmulepi64_si128 (a, b, 0);

  /* Ben: Do the prim_poly reduction twice.  We are guaranteed that we will
     only have to do the reduction at most twice, because (w-2)/z == 2, where
     z is the number of zeros after the leading 1 of the primitive polynomial.

     _mm_clmulepi64_si128 is the carryless multiply operation.  Here
     _mm_srli_si128 shifts the result to the right by 2 bytes.  This allows
     us to multiply the prim_poly by the leading bits of the result.  We
     then xor the result of that operation back with the result. */
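
  /* Each reduction multiplies the bits of result above bit 15 by prim_poly;
     the x^16 term cancels those bits exactly, so the part above bit 15
     shrinks by (16 - deg(low part of prim_poly)) bits per step.
     gf_w16_cfm_init() selects this 2-step variant only when
     (prim_poly & 0xfe00) == 0, i.e. the low part fits below bit 9: the
     highest set bit goes from at most 30 to at most 22 to at most 14, so two
     steps suffice. */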
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
  result = _mm_xor_si128 (result, w);

  /* Extracts the 32-bit value from result. */

  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));

#if defined(INTEL_SSE4_PCLMUL)

gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
  gf_internal_t * h = gf->scratch;

  a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0);
  b = _mm_insert_epi32 (a, b16, 0);

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));

  /* Do the initial multiply */

  result = _mm_clmulepi64_si128 (a, b, 0);

  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
  result = _mm_xor_si128 (result, w);

  /* Extracts the 32-bit value from result. */

  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));

#if defined(INTEL_SSE4_PCLMUL)

gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
  gf_internal_t * h = gf->scratch;

  a = _mm_insert_epi32 (_mm_setzero_si128(), a16, 0);
  b = _mm_insert_epi32 (a, b16, 0);

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));

  /* Do the initial multiply */

  result = _mm_clmulepi64_si128 (a, b, 0);

  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
  result = _mm_xor_si128 (result, w);

  /* Extracts the 32-bit value from result. */

  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
gf_w16_shift_multiply (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
  gf_val_32_t product, i, pp, a, b;

  h = (gf_internal_t *) gf->scratch;

  for (i = 0; i < GF_FIELD_WIDTH; i++) {
    if (a & (1 << i)) product ^= (b << i);

  for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) {
    if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH));
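
  /* Example, assuming pp = h->prim_poly = 0x1100b (the default): a = 0x8000,
     b = 2 gives a raw carryless product of 0x10000; the reduction loop sees
     bit 16 set and XORs in (pp << 0) = 0x1100b, leaving 0x100b -- the same
     answer GF_MULTBY_TWO(0x8000) gives. */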
int gf_w16_shift_init(gf_t *gf)
  SET_FUNCTION(gf,multiply,w32,gf_w16_shift_multiply)

int gf_w16_cfm_init(gf_t *gf)
#if defined(INTEL_SSE4_PCLMUL)
  if (gf_cpu_supports_intel_pclmul) {
    h = (gf_internal_t *) gf->scratch;

    /* Ben: Determining how many reductions to do */

    if ((0xfe00 & h->prim_poly) == 0) {
      SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_2)
      SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_2)
    } else if ((0xf000 & h->prim_poly) == 0) {
      SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_3)
      SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_3)
    } else if ((0xe000 & h->prim_poly) == 0) {
      SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_4)
      SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_4)

/* KMG: GF_MULT_LOGTABLE: */

gf_w16_log_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  struct gf_w16_logtable_data *ltd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
  gf_do_initial_region_alignment(&rd);

  ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
  s16 = (uint16_t *) rd.s_start;
  d16 = (uint16_t *) rd.d_start;

  lv = ltd->log_tbl[val];

  while (d16 < (uint16_t *) rd.d_top) {
    *d16 ^= (*s16 == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[*s16]]);

  while (d16 < (uint16_t *) rd.d_top) {
    *d16 = (*s16 == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[*s16]]);

  gf_do_final_region_alignment(&rd);
gf_w16_log_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
  struct gf_w16_logtable_data *ltd;

  ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
  return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl[(int) ltd->log_tbl[a] + (int) ltd->log_tbl[b]];

gf_w16_log_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
  struct gf_w16_logtable_data *ltd;

  if (a == 0 || b == 0) return 0;
  ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;

  log_sum = (int) ltd->log_tbl[a] - (int) ltd->log_tbl[b];
  return (ltd->d_antilog[log_sum]);

gf_w16_log_inverse(gf_t *gf, gf_val_32_t a)
  struct gf_w16_logtable_data *ltd;

  ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ltd->inv_tbl[a]);

int gf_w16_log_init(gf_t *gf)
  struct gf_w16_logtable_data *ltd;

  h = (gf_internal_t *) gf->scratch;

  for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++)
  ltd->d_antilog = ltd->antilog_tbl + GF_MULT_GROUP_SIZE;

  for (i = 0; i < GF_MULT_GROUP_SIZE; i++) {
    if (ltd->log_tbl[b] != 0) check = 1;
    ltd->antilog_tbl[i] = b;
    ltd->antilog_tbl[i+GF_MULT_GROUP_SIZE] = b;
    if (b & GF_FIELD_SIZE) {
      b = b ^ h->prim_poly;

  /* If you can't construct the log table, there's a problem.  This code is used for
     some other implementations (e.g. in SPLIT), so if the log table doesn't work in
     that instance, use CARRY_FREE / SHIFT instead. */

  if (h->mult_type != GF_MULT_LOG_TABLE) {
    if (gf_cpu_supports_intel_pclmul) {
      return gf_w16_cfm_init(gf);
    return gf_w16_shift_init(gf);
  _gf_errno = GF_E_LOGPOLY;

  ltd->inv_tbl[0] = 0;  /* Not really, but we need to fill it with something */
  for (i = 2; i < GF_FIELD_SIZE; i++) {
    ltd->inv_tbl[i] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[i]];

  SET_FUNCTION(gf,inverse,w32,gf_w16_log_inverse)
  SET_FUNCTION(gf,divide,w32,gf_w16_log_divide)
  SET_FUNCTION(gf,multiply,w32,gf_w16_log_multiply)
  SET_FUNCTION(gf,multiply_region,w32,gf_w16_log_multiply_region)
/* JSP: GF_MULT_SPLIT_TABLE: Using 8 multiplication tables to leverage SSE instructions. */

/* Ben: Does alternate-mapping multiplication using a split table in the
   lazy method, without SSE instructions. */
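
/* In the alternate mapping each 32-byte chunk stores the high bytes of 16
   consecutive 16-bit words followed by their 16 low bytes, so below the high
   byte of a word lives at *s8 / *d8 and its low byte at *(s8+16) / *(d8+16). */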
gf_w16_split_4_16_lazy_nosse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  uint64_t i, j, c, prod;
  uint8_t *s8, *d8, *top;
  uint16_t table[4][16];

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
  gf_do_initial_region_alignment(&rd);

  /* Ben: Constructs the lazy multiplication table */

  for (j = 0; j < 16; j++) {
    for (i = 0; i < 4; i++) {
      table[i][j] = gf->multiply.w32(gf, c, val);

  /* Ben: s8 is the start of the source, d8 the start of the dest, top the end of the dest region. */

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;
  top = (uint8_t *) rd.d_top;

  /* Ben: Multiplies across 16 two-byte quantities using the alternate mapping:
     high bits are on the left, low bits are on the right. */

  /* Ben: If the xor flag is set, the product should include what is in dest */
  prod = (xor) ? ((uint16_t)(*d8)<<8) ^ *(d8+16) : 0;

  /* Ben: XORs all 4 table lookups into the product variable */

  prod ^= ((table[0][*(s8+16)&0xf]) ^
           (table[1][(*(s8+16)&0xf0)>>4]) ^
           (table[2][*(s8)&0xf]) ^
           (table[3][(*(s8)&0xf0)>>4]));

  /* Ben: Stores the product in the destination and moves on */

  *d8 = (uint8_t)(prod >> 8);
  *(d8+16) = (uint8_t)(prod & 0x00ff);

  gf_do_final_region_alignment(&rd);
gf_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  uint64_t i, j, a, c, prod;
  uint16_t *s16, *d16, *top;
  uint16_t table[4][16];

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
  gf_do_initial_region_alignment(&rd);

  for (j = 0; j < 16; j++) {
    for (i = 0; i < 4; i++) {
      table[i][j] = gf->multiply.w32(gf, c, val);

  s16 = (uint16_t *) rd.s_start;
  d16 = (uint16_t *) rd.d_start;
  top = (uint16_t *) rd.d_top;

  prod = (xor) ? *d16 : 0;
  for (i = 0; i < 4; i++) {
    prod ^= table[i][a&0xf];

gf_w16_split_8_16_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  uint64_t j, k, v, a, prod, *s64, *d64, *top64;
  uint64_t htable[256], ltable[256];

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);

  h = (gf_internal_t *) gf->scratch;

  for (j = 1; j < 256; j <<= 1) {
    for (k = 0; k < j; k++) ltable[k^j] = (v ^ ltable[k]);
    v = GF_MULTBY_TWO(v);

  for (j = 1; j < 256; j <<= 1) {
    for (k = 0; k < j; k++) htable[k^j] = (v ^ htable[k]);
    v = GF_MULTBY_TWO(v);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top64 = (uint64_t *) rd.d_top;

  /* Does Unrolling Matter? -- Doesn't seem to.
  while (d64 != top64) {
    prod = htable[a >> 56];
    prod ^= ltable[a >> 56];
    prod ^= htable[a >> 56];
    prod ^= ltable[a >> 56];
    prod ^= htable[a >> 56];
    prod ^= ltable[a >> 56];
    prod ^= htable[a >> 56];
    prod ^= ltable[a >> 56];
    prod ^= ((xor) ? *d64 : 0);
  */

  while (d64 != top64) {
    for (j = 0; j < 4; j++) {
      prod ^= htable[a >> 56];
      prod ^= ltable[a >> 56];

    /* JSP: We can move the conditional outside the while loop, but we need to fully test it to understand which is better. */

    prod ^= ((xor) ? *d64 : 0);

  gf_do_final_region_alignment(&rd);
gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  struct gf_w16_lazytable_data *ltd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);

  h = (gf_internal_t *) gf->scratch;
  ltd = (struct gf_w16_lazytable_data *) h->private;

  ltd->lazytable[0] = 0;

  ltd->lazytable[c] = a;

  if (c & (1 << GF_FIELD_WIDTH)) c ^= pp;

  if (a & (1 << GF_FIELD_WIDTH)) a ^= pp;

  for (c = 1; c < GF_FIELD_SIZE; c++) {
    ltd->lazytable[c] = gf_w16_shift_multiply(gf, c, val);

  gf_two_byte_region_table_multiply(&rd, ltd->lazytable);
  gf_do_final_region_alignment(&rd);
gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  uint64_t i, j, *s64, *d64, *top64;
  __m128i mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4], tta, ttb, lmask;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
  gf_do_initial_region_alignment(&rd);

  for (j = 0; j < 16; j++) {
    for (i = 0; i < 4; i++) {
      prod = gf->multiply.w32(gf, c, val);
      low[i][j] = (prod & 0xff);
      high[i][j] = (prod >> 8);

  for (i = 0; i < 4; i++) {
    tlow[i] = _mm_loadu_si128((__m128i *)low[i]);
    thigh[i] = _mm_loadu_si128((__m128i *)high[i]);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top64 = (uint64_t *) rd.d_top;

  mask = _mm_set1_epi8 (0x0f);
  lmask = _mm_set1_epi16 (0xff);
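
  /* In this (non-ALTMAP) layout the 16-bit words are interleaved in memory,
     so each iteration first splits the two 128-bit loads into a vector of low
     bytes (tb) and a vector of high bytes (ta) via the srli/and/packus
     sequence, performs the four nibble-indexed shuffle lookups, and then
     re-interleaves the product bytes with unpackhi/unpacklo before storing. */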
  while (d64 != top64) {

    ta = _mm_load_si128((__m128i *) s64);
    tb = _mm_load_si128((__m128i *) (s64+2));

    tta = _mm_srli_epi16(ta, 8);
    ttb = _mm_srli_epi16(tb, 8);
    tpl = _mm_and_si128(tb, lmask);
    tph = _mm_and_si128(ta, lmask);

    tb = _mm_packus_epi16(tpl, tph);
    ta = _mm_packus_epi16(ttb, tta);

    ti = _mm_and_si128 (mask, tb);
    tph = _mm_shuffle_epi8 (thigh[0], ti);
    tpl = _mm_shuffle_epi8 (tlow[0], ti);

    tb = _mm_srli_epi16(tb, 4);
    ti = _mm_and_si128 (mask, tb);
    tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl);
    tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph);

    ti = _mm_and_si128 (mask, ta);
    tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl);
    tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph);

    ta = _mm_srli_epi16(ta, 4);
    ti = _mm_and_si128 (mask, ta);
    tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl);
    tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph);

    ta = _mm_unpackhi_epi8(tpl, tph);
    tb = _mm_unpacklo_epi8(tpl, tph);

    tta = _mm_load_si128((__m128i *) d64);
    ta = _mm_xor_si128(ta, tta);
    ttb = _mm_load_si128((__m128i *) (d64+2));
    tb = _mm_xor_si128(tb, ttb);
    _mm_store_si128 ((__m128i *)d64, ta);
    _mm_store_si128 ((__m128i *)(d64+2), tb);

  while (d64 != top64) {

    ta = _mm_load_si128((__m128i *) s64);
    tb = _mm_load_si128((__m128i *) (s64+2));

    tta = _mm_srli_epi16(ta, 8);
    ttb = _mm_srli_epi16(tb, 8);
    tpl = _mm_and_si128(tb, lmask);
    tph = _mm_and_si128(ta, lmask);

    tb = _mm_packus_epi16(tpl, tph);
    ta = _mm_packus_epi16(ttb, tta);

    ti = _mm_and_si128 (mask, tb);
    tph = _mm_shuffle_epi8 (thigh[0], ti);
    tpl = _mm_shuffle_epi8 (tlow[0], ti);

    tb = _mm_srli_epi16(tb, 4);
    ti = _mm_and_si128 (mask, tb);
    tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl);
    tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph);

    ti = _mm_and_si128 (mask, ta);
    tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl);
    tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph);

    ta = _mm_srli_epi16(ta, 4);
    ti = _mm_and_si128 (mask, ta);
    tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl);
    tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph);

    ta = _mm_unpackhi_epi8(tpl, tph);
    tb = _mm_unpacklo_epi8(tpl, tph);

    _mm_store_si128 ((__m128i *)d64, ta);
    _mm_store_si128 ((__m128i *)(d64+2), tb);

  gf_do_final_region_alignment(&rd);
gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  uint64_t i, j, *s64, *d64, *top64;
  uint8_t high[4][16];
  __m128i mask, ta, tb, ti, tpl, tph, tlow[4], thigh[4];

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
  gf_do_initial_region_alignment(&rd);

  for (j = 0; j < 16; j++) {
    for (i = 0; i < 4; i++) {
      prod = gf->multiply.w32(gf, c, val);
      low[i][j] = (prod & 0xff);
      high[i][j] = (prod >> 8);

  for (i = 0; i < 4; i++) {
    tlow[i] = _mm_loadu_si128((__m128i *)low[i]);
    thigh[i] = _mm_loadu_si128((__m128i *)high[i]);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top64 = (uint64_t *) rd.d_top;

  mask = _mm_set1_epi8 (0x0f);

  while (d64 != top64) {

    ta = _mm_load_si128((__m128i *) s64);
    tb = _mm_load_si128((__m128i *) (s64+2));

    ti = _mm_and_si128 (mask, tb);
    tph = _mm_shuffle_epi8 (thigh[0], ti);
    tpl = _mm_shuffle_epi8 (tlow[0], ti);

    tb = _mm_srli_epi16(tb, 4);
    ti = _mm_and_si128 (mask, tb);
    tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl);
    tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph);

    ti = _mm_and_si128 (mask, ta);
    tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl);
    tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph);

    ta = _mm_srli_epi16(ta, 4);
    ti = _mm_and_si128 (mask, ta);
    tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl);
    tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph);

    ta = _mm_load_si128((__m128i *) d64);
    tph = _mm_xor_si128(tph, ta);
    _mm_store_si128 ((__m128i *)d64, tph);
    tb = _mm_load_si128((__m128i *) (d64+2));
    tpl = _mm_xor_si128(tpl, tb);
    _mm_store_si128 ((__m128i *)(d64+2), tpl);

  while (d64 != top64) {

    ta = _mm_load_si128((__m128i *) s64);
    tb = _mm_load_si128((__m128i *) (s64+2));

    ti = _mm_and_si128 (mask, tb);
    tph = _mm_shuffle_epi8 (thigh[0], ti);
    tpl = _mm_shuffle_epi8 (tlow[0], ti);

    tb = _mm_srli_epi16(tb, 4);
    ti = _mm_and_si128 (mask, tb);
    tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[1], ti), tpl);
    tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[1], ti), tph);

    ti = _mm_and_si128 (mask, ta);
    tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[2], ti), tpl);
    tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[2], ti), tph);

    ta = _mm_srli_epi16(ta, 4);
    ti = _mm_and_si128 (mask, ta);
    tpl = _mm_xor_si128(_mm_shuffle_epi8 (tlow[3], ti), tpl);
    tph = _mm_xor_si128(_mm_shuffle_epi8 (thigh[3], ti), tph);

    _mm_store_si128 ((__m128i *)d64, tph);
    _mm_store_si128 ((__m128i *)(d64+2), tpl);

  gf_do_final_region_alignment(&rd);
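
/* Split 8,8: writing a = a1*x^8 + a0 and b = b1*x^8 + b0 (bytes), we have
   a*b = a1*b1*x^16 + (a1*b0 + a0*b1)*x^8 + a0*b0 in GF(2^16).
   gf_w16_split_init() fills tables[k][i][j] with the product i*j pre-shifted
   by 8*k bits and reduced (basep advances by eight doublings per level), so
   the four lookups below -- tables[0] on the low bytes, tables[1] on the two
   cross terms, tables[2] on the high bytes -- can simply be XORed together
   with no further reduction. */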
gf_w16_split_8_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
  uint32_t alow, blow;
  struct gf_w16_split_8_8_data *d8;

  h = (gf_internal_t *) gf->scratch;
  d8 = (struct gf_w16_split_8_8_data *) h->private;

  return d8->tables[0][alow][blow] ^
         d8->tables[1][alow][b] ^
         d8->tables[1][a][blow] ^
         d8->tables[2][a][b];

int gf_w16_split_init(gf_t *gf)
  struct gf_w16_split_8_8_data *d8;
  uint32_t p, basep, tmp;

  h = (gf_internal_t *) gf->scratch;

  if (h->arg1 == 8 && h->arg2 == 8) {
    d8 = (struct gf_w16_split_8_8_data *) h->private;

    for (exp = 0; exp < 3; exp++) {
      for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0;
      for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0;
      d8->tables[exp][1][1] = basep;
      for (i = 2; i < 256; i++) {
        p = d8->tables[exp][i^1][1];
        d8->tables[exp][i][1] = p ^ basep;
        p = d8->tables[exp][i>>1][1];
        d8->tables[exp][i][1] = GF_MULTBY_TWO(p);
      for (i = 1; i < 256; i++) {
        p = d8->tables[exp][i][1];
        for (j = 1; j < 256; j++) {
          d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p;
          tmp = d8->tables[exp][i][j>>1];
          d8->tables[exp][i][j] = GF_MULTBY_TWO(tmp);
      for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep);
    SET_FUNCTION(gf,multiply,w32,gf_w16_split_8_8_multiply)
    SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region)

  /* We'll be using LOG for multiplication, unless the pp isn't primitive.
     In that case, we'll be using SHIFT. */

  gf_w16_log_init(gf);

  if (gf_cpu_supports_intel_ssse3) {
    SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_multiply_region)
  if (gf_cpu_supports_arm_neon) {
    gf_w16_neon_split_init(gf);
  SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region)
#if defined(INTEL_SSSE3) || defined(ARM_NEON)

  if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) {
    SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region)
  } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) {
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
    if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
      if (h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSIMD)
        SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region)
      else if (h->region_type & GF_REGION_NOSIMD)
        SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region)
#if defined(INTEL_SSSE3)
      else if (h->region_type & GF_REGION_ALTMAP && gf_cpu_supports_intel_ssse3)
        SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_altmap_multiply_region)

    if (h->region_type & GF_REGION_SIMD)
    else if (h->region_type & GF_REGION_ALTMAP)
      SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region)
    SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region)
#if defined(INTEL_SSSE3) || defined(ARM_NEON)

int gf_w16_table_init(gf_t *gf)
  gf_w16_log_init(gf);

  SET_FUNCTION(gf,multiply_region,w32,gf_w16_table_lazy_multiply_region)
gf_w16_log_zero_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  uint16_t *s16, *d16, *top16;
  struct gf_w16_zero_logtable_data *ltd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
  gf_do_initial_region_alignment(&rd);

  ltd = (struct gf_w16_zero_logtable_data*) ((gf_internal_t *) gf->scratch)->private;
  s16 = (uint16_t *) rd.s_start;
  d16 = (uint16_t *) rd.d_start;
  top16 = (uint16_t *) rd.d_top;
  bytes = top16 - d16;

  lv = ltd->log_tbl[val];

  for (i = 0; i < bytes; i++) {
    d16[i] ^= (ltd->antilog_tbl[lv + ltd->log_tbl[s16[i]]]);

  for (i = 0; i < bytes; i++) {
    d16[i] = (ltd->antilog_tbl[lv + ltd->log_tbl[s16[i]]]);

  /* This isn't necessary. */

  gf_do_final_region_alignment(&rd);

/* Here -- double-check Kevin */
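
/* The "zero" log variants below avoid the explicit a == 0 / b == 0 test of
   gf_w16_log_multiply(): gf_w16_log_zero_init() gives 0 a far out-of-range
   logarithm and zero-fills _antilog_tbl before writing the valid entries, so
   the intent is that a lookup involving a zero operand lands in a zeroed slot
   and yields 0 without a branch. */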
gf_w16_log_zero_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
  struct gf_w16_zero_logtable_data *ltd;

  ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
  return ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]];

gf_w16_log_zero_divide (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
  struct gf_w16_zero_logtable_data *ltd;

  if (a == 0 || b == 0) return 0;
  ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private;

  log_sum = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE);
  return (ltd->antilog_tbl[log_sum]);

gf_w16_log_zero_inverse (gf_t *gf, gf_val_32_t a)
  struct gf_w16_zero_logtable_data *ltd;

  ltd = (struct gf_w16_zero_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ltd->inv_tbl[a]);

gf_w16_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
  uint32_t prod, pp, pmask, amask;

  h = (gf_internal_t *) gf->scratch;

  while (amask != 0) {
    prod = ((prod << 1) ^ pp);
    if (a & amask) prod ^= b;

gf_w16_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
  uint32_t prod, pp, bmask;

  h = (gf_internal_t *) gf->scratch;

  if (a & 1) prod ^= b;
  if (a == 0) return prod;
  b = ((b << 1) ^ pp);

gf_w16_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  uint64_t *s64, *d64, t1, t2, ta, prod, amask;
  struct gf_w16_bytwo_data *btd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;

  while (s64 < (uint64_t *) rd.s_top) {
    while (amask != 0) {
      AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
      if (val & amask) prod ^= ta;

  while (s64 < (uint64_t *) rd.s_top) {
    while (amask != 0) {
      AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
      if (val & amask) prod ^= ta;

  gf_do_final_region_alignment(&rd);

#define BYTWO_P_ONESTEP {\
      SSE_AB2(pp, m1, m2, prod, t1, t2); \
      t1 = _mm_and_si128(v, one); \
      t1 = _mm_sub_epi16(t1, one); \
      t1 = _mm_and_si128(t1, ta); \
      prod = _mm_xor_si128(prod, t1); \
      v = _mm_srli_epi64(v, 1); }
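
/* One step of the bytwo_p SSE loop: SSE_AB2 doubles every 16-bit word of the
   accumulator prod, ((v & 1) - 1) becomes all ones in the words whose low bit
   of v is clear (and zero where it is set), so ta is XORed into exactly those
   words, and v is then shifted right one bit.  v holds the complemented copy
   of the multiplier built in gf_w16_bytwo_p_sse_multiply_region (presumably
   bit-reversed so that the high bit of val is processed first). */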
gf_w16_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;
  struct gf_w16_bytwo_data *btd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  for (i = 0; i < 16; i++) {
    if (!(val & (1 << i))) vrev |= 1;

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  pp = _mm_set1_epi16(btd->prim_poly&0xffff);
  m1 = _mm_set1_epi16((btd->mask1)&0xffff);
  m2 = _mm_set1_epi16((btd->mask2)&0xffff);
  one = _mm_set1_epi16(1);

  while (d8 < (uint8_t *) rd.d_top) {
    prod = _mm_setzero_si128();
    v = _mm_set1_epi16(vrev);
    ta = _mm_load_si128((__m128i *) s8);
    tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
    _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));

  gf_do_final_region_alignment(&rd);

gf_w16_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w16_bytwo_data *btd)
  __m128i pp, m1, m2, t1, t2, va;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi16(btd->prim_poly&0xffff);
  m1 = _mm_set1_epi16((btd->mask1)&0xffff);
  m2 = _mm_set1_epi16((btd->mask2)&0xffff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, m2, va, t1, t2);
    _mm_store_si128((__m128i *)d8, va);

gf_w16_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w16_bytwo_data *btd)
  __m128i pp, m1, m2, t1, t2, va, vb;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi16(btd->prim_poly&0xffff);
  m1 = _mm_set1_epi16((btd->mask1)&0xffff);
  m2 = _mm_set1_epi16((btd->mask2)&0xffff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, m2, va, t1, t2);
    vb = _mm_load_si128 ((__m128i *)(d8));
    vb = _mm_xor_si128(vb, va);
    _mm_store_si128((__m128i *)d8, vb);

gf_w16_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  __m128i pp, m1, m2, t1, t2, va, vb;
  struct gf_w16_bytwo_data *btd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;

  gf_w16_bytwo_b_sse_region_2_xor(&rd, btd);
  gf_w16_bytwo_b_sse_region_2_noxor(&rd, btd);
  gf_do_final_region_alignment(&rd);

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  pp = _mm_set1_epi16(btd->prim_poly&0xffff);
  m1 = _mm_set1_epi16((btd->mask1)&0xffff);
  m2 = _mm_set1_epi16((btd->mask2)&0xffff);

  while (d8 < (uint8_t *) rd.d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8));
    if (itb & 1) vb = _mm_xor_si128(vb, va);
    if (itb == 0) break;
    SSE_AB2(pp, m1, m2, va, t1, t2);
    _mm_store_si128((__m128i *)d8, vb);

  gf_do_final_region_alignment(&rd);
gf_w16_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  uint64_t *s64, *d64, t1, t2, ta, tb, prod;
  struct gf_w16_bytwo_data *btd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  btd = (struct gf_w16_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;

  while (d64 < (uint64_t *) rd.d_top) {
    AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);

  while (d64 < (uint64_t *) rd.d_top) {
    AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);

  while (d64 < (uint64_t *) rd.d_top) {
    AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
    *d64 ^= (ta ^ prod);

  while (d64 < (uint64_t *) rd.d_top) {
    AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);

  while (d64 < (uint64_t *) rd.d_top) {
    AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
    AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);

  while (d64 < (uint64_t *) rd.d_top) {
    AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
    AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);

  while (d64 < (uint64_t *) rd.d_top) {
    AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
    AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
    *d64 ^= (ta ^ prod);

  while (d64 < (uint64_t *) rd.d_top) {
    AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
    AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);

  while (d64 < (uint64_t *) rd.d_top) {
    if (tb & 1) prod ^= ta;
    AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);

  while (d64 < (uint64_t *) rd.d_top) {
    if (tb & 1) prod ^= ta;
    AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);

  gf_do_final_region_alignment(&rd);

int gf_w16_bytwo_init(gf_t *gf)
  uint64_t ip, m1, m2;
  struct gf_w16_bytwo_data *btd;

  h = (gf_internal_t *) gf->scratch;
  btd = (struct gf_w16_bytwo_data *) (h->private);
  ip = h->prim_poly & 0xffff;

  btd->prim_poly |= ip;

  ip <<= GF_FIELD_WIDTH;
  m1 <<= GF_FIELD_WIDTH;
  m2 <<= GF_FIELD_WIDTH;

  if (h->mult_type == GF_MULT_BYTWO_p) {
    SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_p_multiply)
    if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
      SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_sse_multiply_region)
    SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_nosse_multiply_region)
    if (h->region_type & GF_REGION_SIMD)

  SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_b_multiply)
  if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
    SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_sse_multiply_region)
  SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_nosse_multiply_region)
  if (h->region_type & GF_REGION_SIMD)

int gf_w16_log_zero_init(gf_t *gf)
  struct gf_w16_zero_logtable_data *ltd;

  h = (gf_internal_t *) gf->scratch;

  ltd->log_tbl[0] = (-GF_MULT_GROUP_SIZE) + 1;

  bzero(&(ltd->_antilog_tbl[0]), sizeof(ltd->_antilog_tbl));

  ltd->antilog_tbl = &(ltd->_antilog_tbl[GF_FIELD_SIZE * 2]);

  for (i = 0; i < GF_MULT_GROUP_SIZE; i++) {
    ltd->log_tbl[b] = (uint16_t)i;
    ltd->antilog_tbl[i] = (uint16_t)b;
    ltd->antilog_tbl[i+GF_MULT_GROUP_SIZE] = (uint16_t)b;
    if (b & GF_FIELD_SIZE) {
      b = b ^ h->prim_poly;

  ltd->inv_tbl[0] = 0;  /* Not really, but we need to fill it with something */
  ltd->inv_tbl[1] = 1;
  for (i = 2; i < GF_FIELD_SIZE; i++) {
    ltd->inv_tbl[i] = ltd->antilog_tbl[GF_MULT_GROUP_SIZE-ltd->log_tbl[i]];

  SET_FUNCTION(gf,inverse,w32,gf_w16_log_zero_inverse)
  SET_FUNCTION(gf,divide,w32,gf_w16_log_zero_divide)
  SET_FUNCTION(gf,multiply,w32,gf_w16_log_zero_multiply)
  SET_FUNCTION(gf,multiply_region,w32,gf_w16_log_zero_multiply_region)
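
/* Composite GF((2^8)^2): writing a = a1*x + a0 and b = b1*x + b0 over GF(2^8),
   with p(x) = x^2 + s*x + 1 and s = h->prim_poly (see the division comment
   further below),

     a*b = a1*b1*x^2 + (a1*b0 + a0*b1)*x + a0*b0
         = (a1*b0 + a0*b1 + s*a1*b1)*x + (a0*b0 + a1*b1),

   which is what the recursive and inline multipliers below compute: low byte
   a0*b0 ^ a1*b1, high byte a1*b0 ^ a0*b1 ^ s*a1*b1. */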
gf_w16_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint8_t b0 = b & 0x00ff;
  uint8_t b1 = (b & 0xff00) >> 8;
  uint8_t a0 = a & 0x00ff;
  uint8_t a1 = (a & 0xff00) >> 8;

  a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

  rv = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
        ((base_gf->multiply.w32(base_gf, a1, b0) ^
          base_gf->multiply.w32(base_gf, a0, b1) ^
          base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8));

gf_w16_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  uint8_t b0 = b & 0x00ff;
  uint8_t b1 = (b & 0xff00) >> 8;
  uint8_t a0 = a & 0x00ff;
  uint8_t a1 = (a & 0xff00) >> 8;
  struct gf_w16_composite_data *cd;

  cd = (struct gf_w16_composite_data *) h->private;
  mt = cd->mult_table;

  a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1);

  rv = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
        ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^
          GF_W8_INLINE_MULTDIV(mt, a0, b1) ^
          GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8));
/*
 * Composite field division trick (explained in the 2007 tech report)
 *
 * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1
 *
 * Let c = b^-1.  Then
 *
 * c*b = (s*b1c1+b1c0+b0c1)x + (b1c1+b0c0)
 *
 * We want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1
 *
 * Let d = b1c1, so that d+1 = b0c0
 *
 * Solve s*b1c1+b1c0+b0c1 = 0
 *
 * Solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
 *
 * Then c0 = (d+1)*b0^-1 and c1 = d*b1^-1, as computed below.
 */
gf_w16_composite_inverse(gf_t *gf, gf_val_32_t a)
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint8_t a0 = a & 0x00ff;
  uint8_t a1 = (a & 0xff00) >> 8;
  uint8_t c0, c1, d, tmp;
  uint8_t a0inv, a1inv;

    a1inv = base_gf->inverse.w32(base_gf, a1);
    c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
  } else if (a1 == 0) {
    c0 = base_gf->inverse.w32(base_gf, a0);
    a1inv = base_gf->inverse.w32(base_gf, a1);
    a0inv = base_gf->inverse.w32(base_gf, a0);

    d = base_gf->multiply.w32(base_gf, a1, a0inv);

    tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly);
    tmp = base_gf->inverse.w32(base_gf, tmp);

    d = base_gf->multiply.w32(base_gf, d, tmp);

    c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv);
    c1 = base_gf->multiply.w32(base_gf, d, a1inv);

gf_w16_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint8_t b0 = val & 0x00ff;
  uint8_t b1 = (val & 0xff00) >> 8;
  uint16_t *s16, *d16, *top;
  uint8_t a0, a1, a1b1, *mt;
  struct gf_w16_composite_data *cd;

  cd = (struct gf_w16_composite_data *) h->private;
  mt = cd->mult_table;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);

  a0 = (*s16) & 0x00ff;
  a1 = ((*s16) & 0xff00) >> 8;
  a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

  (*d16) ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
             ((base_gf->multiply.w32(base_gf, a1, b0) ^
               base_gf->multiply.w32(base_gf, a0, b1) ^
               base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8));

  a0 = (*s16) & 0x00ff;
  a1 = ((*s16) & 0xff00) >> 8;
  a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

  (*d16) = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
            ((base_gf->multiply.w32(base_gf, a1, b0) ^
              base_gf->multiply.w32(base_gf, a0, b1) ^
              base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 8));

  a0 = (*s16) & 0x00ff;
  a1 = ((*s16) & 0xff00) >> 8;
  a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1);

  (*d16) ^= ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
             ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^
               GF_W8_INLINE_MULTDIV(mt, a0, b1) ^
               GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8));

  a0 = (*s16) & 0x00ff;
  a1 = ((*s16) & 0xff00) >> 8;
  a1b1 = GF_W8_INLINE_MULTDIV(mt, a1, b1);

  (*d16) = ((GF_W8_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
            ((GF_W8_INLINE_MULTDIV(mt, a1, b0) ^
              GF_W8_INLINE_MULTDIV(mt, a0, b1) ^
              GF_W8_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 8));
gf_w16_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint8_t val0 = val & 0x00ff;
  uint8_t val1 = (val & 0xff00) >> 8;
  uint8_t *slow, *shigh;
  uint8_t *dlow, *dhigh, *top;

  /* JSP: I want the two pointers aligned wrt each other on 16 byte
     boundaries.  So I'm going to make sure that the area on
     which the two operate is a multiple of 32.  Of course, that
     junks up the mapping, but so be it -- that's why we have extract_word.... */
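
  /* The five base-field region calls below apply the composite product
     formula term by term (see the note above
     gf_w16_composite_multiply_recursive):
       dlow  = val0*slow ^ val1*shigh
       dhigh = val1*slow ^ val0*shigh ^ (prim_poly*val1)*shigh */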
  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
  gf_do_initial_region_alignment(&rd);

  slow = (uint8_t *) rd.s_start;
  dlow = (uint8_t *) rd.d_start;
  top = (uint8_t *) rd.d_top;
  sub_reg_size = (top - dlow)/2;
  shigh = slow + sub_reg_size;
  dhigh = dlow + sub_reg_size;

  base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor);
  base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1);
  base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor);
  base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
  base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);

  gf_do_final_region_alignment(&rd);

int gf_w16_composite_init(gf_t *gf)
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  struct gf_w16_composite_data *cd;

  if (h->base_gf == NULL) return 0;

  cd = (struct gf_w16_composite_data *) h->private;
  cd->mult_table = gf_w8_get_mult_table(h->base_gf);

  if (h->region_type & GF_REGION_ALTMAP) {
    SET_FUNCTION(gf,multiply_region,w32,gf_w16_composite_multiply_region_alt)
  SET_FUNCTION(gf,multiply_region,w32,gf_w16_composite_multiply_region)

  if (cd->mult_table == NULL) {
    SET_FUNCTION(gf,multiply,w32,gf_w16_composite_multiply_recursive)
  SET_FUNCTION(gf,multiply,w32,gf_w16_composite_multiply_inline)

  SET_FUNCTION(gf,divide,w32,NULL)
  SET_FUNCTION(gf,inverse,w32,gf_w16_composite_inverse)

gf_w16_group_4_set_shift_tables(uint16_t *shift, uint16_t val, gf_internal_t *h)
  for (i = 0; i < 16; i += 2) {
    j = (shift[i>>1] << 1);
    if (j & (1 << 16)) j ^= h->prim_poly;
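
/* GROUP 4,4: gf_w16_group_4_set_shift_tables() fills shift[i] with the field
   product i*val for the sixteen 4-bit values i, and gf_w16_group_init() fills
   reduce[t] with what the four bits t shifted out above bit 15 fold back to
   under the primitive polynomial.  The multiply and region routines below
   then consume the other operand a nibble at a time: shift the partial
   product left by 4, add shift[nibble], and fold the overflow with reduce[]. */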
gf_w16_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
  uint16_t p, l, ind, r, a16;

  struct gf_w16_group_4_4_data *d44;
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  d44 = (struct gf_w16_group_4_4_data *) h->private;
  gf_w16_group_4_set_shift_tables(d44->shift, b, h);

  p = d44->shift[ind];
  p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4));
  p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4));
  p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4));

void gf_w16_group_4_4_region_multiply(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  uint16_t p, l, ind, r, a16, p16;
  struct gf_w16_group_4_4_data *d44;
  uint16_t *s16, *d16, *top;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  d44 = (struct gf_w16_group_4_4_data *) h->private;
  gf_w16_group_4_set_shift_tables(d44->shift, val, h);

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
  gf_do_initial_region_alignment(&rd);

  s16 = (uint16_t *) rd.s_start;
  d16 = (uint16_t *) rd.d_start;
  top = (uint16_t *) rd.d_top;

  p16 = (xor) ? *d16 : 0;
  p = d44->shift[ind];
  p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4));
  p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4));
  p = (d44->shift[ind] ^ d44->reduce[l] ^ (r << 4));

  gf_do_final_region_alignment(&rd);

int gf_w16_group_init(gf_t *gf)
  struct gf_w16_group_4_4_data *d44;
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  d44 = (struct gf_w16_group_4_4_data *) h->private;

  for (i = 0; i < 16; i++) {
    for (j = 0; j < 4; j++) {
      if (i & (1 << j)) p ^= (h->prim_poly << j);
    d44->reduce[p>>16] = (p&0xffff);

  SET_FUNCTION(gf,multiply,w32,gf_w16_group_4_4_multiply)
  SET_FUNCTION(gf,divide,w32,NULL)
  SET_FUNCTION(gf,inverse,w32,NULL)
  SET_FUNCTION(gf,multiply_region,w32,gf_w16_group_4_4_region_multiply)
int gf_w16_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
      return sizeof(gf_internal_t) + sizeof(struct gf_w16_lazytable_data) + 64;

    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:
      return sizeof(gf_internal_t) + sizeof(struct gf_w16_bytwo_data);

    case GF_MULT_LOG_ZERO:
      return sizeof(gf_internal_t) + sizeof(struct gf_w16_zero_logtable_data) + 64;

    case GF_MULT_LOG_TABLE:
      return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64;

    case GF_MULT_DEFAULT:
    case GF_MULT_SPLIT_TABLE:
      if (arg1 == 8 && arg2 == 8) {
        return sizeof(gf_internal_t) + sizeof(struct gf_w16_split_8_8_data) + 64;
      } else if ((arg1 == 8 && arg2 == 16) || (arg2 == 8 && arg1 == 16)) {
        return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64;
      } else if (mult_type == GF_MULT_DEFAULT ||
                 (arg1 == 4 && arg2 == 16) || (arg2 == 4 && arg1 == 16)) {
        return sizeof(gf_internal_t) + sizeof(struct gf_w16_logtable_data) + 64;

      return sizeof(gf_internal_t) + sizeof(struct gf_w16_group_4_4_data) + 64;

    case GF_MULT_CARRY_FREE:
      return sizeof(gf_internal_t);

      return sizeof(gf_internal_t);

    case GF_MULT_COMPOSITE:
      return sizeof(gf_internal_t) + sizeof(struct gf_w16_composite_data) + 64;
int gf_w16_init(gf_t *gf)
  h = (gf_internal_t *) gf->scratch;

  /* Allen: set default primitive polynomial / irreducible polynomial if needed */

  if (h->prim_poly == 0) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
      if (h->prim_poly == 0) return 0;

      /* Allen: use the following primitive polynomial to make
         carryless multiply work more efficiently for GF(2^16).

         h->prim_poly = 0x1002d;

         The following is the traditional primitive polynomial for GF(2^16) */

      h->prim_poly = 0x1100b;

  if (h->mult_type != GF_MULT_COMPOSITE) h->prim_poly |= (1 << 16);

  SET_FUNCTION(gf,multiply,w32,NULL)
  SET_FUNCTION(gf,divide,w32,NULL)
  SET_FUNCTION(gf,inverse,w32,NULL)
  SET_FUNCTION(gf,multiply_region,w32,NULL)

  switch(h->mult_type) {
    case GF_MULT_LOG_ZERO:    if (gf_w16_log_zero_init(gf) == 0) return 0; break;
    case GF_MULT_LOG_TABLE:   if (gf_w16_log_init(gf) == 0) return 0; break;
    case GF_MULT_DEFAULT:
    case GF_MULT_SPLIT_TABLE: if (gf_w16_split_init(gf) == 0) return 0; break;
    case GF_MULT_TABLE:       if (gf_w16_table_init(gf) == 0) return 0; break;
    case GF_MULT_CARRY_FREE:  if (gf_w16_cfm_init(gf) == 0) return 0; break;
    case GF_MULT_SHIFT:       if (gf_w16_shift_init(gf) == 0) return 0; break;
    case GF_MULT_COMPOSITE:   if (gf_w16_composite_init(gf) == 0) return 0; break;
    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:     if (gf_w16_bytwo_init(gf) == 0) return 0; break;
    case GF_MULT_GROUP:       if (gf_w16_group_init(gf) == 0) return 0; break;

  if (h->divide_type == GF_DIVIDE_EUCLID) {
    SET_FUNCTION(gf,divide,w32,gf_w16_divide_from_inverse)
    SET_FUNCTION(gf,inverse,w32,gf_w16_euclid)
  } else if (h->divide_type == GF_DIVIDE_MATRIX) {
    SET_FUNCTION(gf,divide,w32,gf_w16_divide_from_inverse)
    SET_FUNCTION(gf,inverse,w32,gf_w16_matrix)

  if (gf->divide.w32 == NULL) {
    SET_FUNCTION(gf,divide,w32,gf_w16_divide_from_inverse)
    if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w16_euclid)

  if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w16_inverse_from_divide)

  if (h->region_type & GF_REGION_ALTMAP) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      SET_FUNCTION(gf,extract_word,w32,gf_w16_composite_extract_word)
    SET_FUNCTION(gf,extract_word,w32,gf_w16_split_extract_word)
  } else if (h->region_type == GF_REGION_CAUCHY) {
    SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
    SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
  SET_FUNCTION(gf,extract_word,w32,gf_w16_extract_word)

  if (gf->multiply_region.w32 == NULL) {
    SET_FUNCTION(gf,multiply_region,w32,gf_w16_multiply_region_from_single)
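
/* Usage sketch (illustrative, not part of this file): callers normally reach
   gf_w16_init() through the top-level API declared in gf_complete.h rather
   than calling it directly.  Assuming that API:

     gf_t gf;
     if (!gf_init_easy(&gf, 16)) return;                   // default GF(2^16)
     uint32_t c = gf.multiply.w32(&gf, 0x8000, 2);         // 0x100b with the default polynomial
     gf.multiply_region.w32(&gf, src, dst, c, bytes, 1);   // dst ^= c * src
     gf_free(&gf, 0);
*/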
/* Inline setup functions */

uint16_t *gf_w16_get_log_table(gf_t *gf)
  struct gf_w16_logtable_data *ltd;

  if (gf->multiply.w32 == gf_w16_log_multiply) {
    ltd = (struct gf_w16_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
    return (uint16_t *) ltd->log_tbl;

uint16_t *gf_w16_get_mult_alog_table(gf_t *gf)
  struct gf_w16_logtable_data *ltd;

  h = (gf_internal_t *) gf->scratch;
  if (gf->multiply.w32 == gf_w16_log_multiply) {
    ltd = (struct gf_w16_logtable_data *) h->private;
    return (uint16_t *) ltd->antilog_tbl;

uint16_t *gf_w16_get_div_alog_table(gf_t *gf)
  struct gf_w16_logtable_data *ltd;

  h = (gf_internal_t *) gf->scratch;
  if (gf->multiply.w32 == gf_w16_log_multiply) {
    ltd = (struct gf_w16_logtable_data *) h->private;
    return (uint16_t *) ltd->d_antilog;