/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * gf_w8.c
 *
 * Routines for 8-bit Galois fields
 */

#include "gf_int.h"
#include "gf_w8.h"
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "gf_cpu.h"
#define AB2(ip, am1 ,am2, b, t1, t2) {\
  t1 = (b << 1) & am1;\
  t2 = b & am2; \
  t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
  b = (t1 ^ (t2 & ip));}
#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\
          t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
          t2 = _mm_and_si128(va, m2); \
          t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
          va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
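/* A note on what these macros compute (a sketch, assuming the mask setup
   done in gf_w8_bytwo_init below): AB2 doubles every byte packed into the
   64-bit word b in GF(2^8), and SSE_AB2 does the same for sixteen bytes in
   an xmm register.  am1 keeps the seven low bits of each byte after the
   shift, am2 isolates each byte's high bit, and the shift/subtract turns
   each high bit into a 0x00-or-0xff byte mask that selects the reduction
   polynomial ip.  For one byte under the default polynomial 0x11d:
   0x80 * 2 = ((0x80 << 1) & 0xfe) ^ (0xff & 0x1d) = 0x1d,
   i.e. x^7 * x = x^8 = 0x1d (mod 0x11d). */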
#define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf(" %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); }
static
inline
uint32_t gf_w8_inverse_from_divide (gf_t *gf, uint32_t a)
{
  return gf->divide.w32(gf, 1, a);
}
static
inline
uint32_t gf_w8_divide_from_inverse (gf_t *gf, uint32_t a, uint32_t b)
{
  b = gf->inverse.w32(gf, b);
  return gf->multiply.w32(gf, a, b);
}
static
uint32_t gf_w8_euclid (gf_t *gf, uint32_t b)
{
  uint32_t e_i, e_im1, e_ip1;
  uint32_t d_i, d_im1, d_ip1;
  uint32_t y_i, y_im1, y_ip1;
  uint32_t c_i;

  if (b == 0) return -1;
  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
  e_i = b;
  d_im1 = 8;
  for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ;
  y_i = 1;
  y_im1 = 0;

  while (e_i != 1) {

    e_ip1 = e_im1;
    d_ip1 = d_im1;
    c_i = 0;

    while (d_ip1 >= d_i) {
      c_i ^= (1 << (d_ip1 - d_i));
      e_ip1 ^= (e_i << (d_ip1 - d_i));
      if (e_ip1 == 0) return 0;
      while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
    }

    y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
    y_im1 = y_i;
    y_i = y_ip1;

    e_im1 = e_i;
    d_im1 = d_i;
    e_i = e_ip1;
    d_i = d_ip1;
  }

  return y_i;
}
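/* gf_w8_euclid is the extended Euclidean algorithm on bit-polynomials:
   the e_* values are remainders (starting from the primitive polynomial
   and b), the d_* values are their degrees, c_i is the quotient built up
   bit by bit, and the y_* values are the Bezout coefficients, combined
   using the field's own multiply.  When the remainder reaches 1, we have
   y_i * b = 1 (mod p), so y_i is b's inverse. */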
static
gf_val_32_t gf_w8_extract_word(gf_t *gf, void *start, int bytes, int index)
{
  uint8_t *r8;

  r8 = (uint8_t *) start;
  return r8[index];
}
static
gf_val_32_t gf_w8_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
{
  int sub_size;
  gf_internal_t *h;
  uint8_t *r8, *top;
  uint8_t a, b;
  gf_region_data rd;

  h = (gf_internal_t *) gf->scratch;
  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
  r8 = (uint8_t *) start;
  if (r8 + index < (uint8_t *) rd.d_start) return r8[index];
  if (r8 + index >= (uint8_t *) rd.d_top) return r8[index];
  index -= (((uint8_t *) rd.d_start) - r8);
  r8 = (uint8_t *) rd.d_start;
  top = (uint8_t *) rd.d_top;
  sub_size = (top-r8)/2;

  a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index);
  b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index);
  return (a | (b << 4));
}
static
uint32_t gf_w8_matrix (gf_t *gf, uint32_t b)
{
  return gf_bitmatrix_inverse(b, 8, ((gf_internal_t *) (gf->scratch))->prim_poly);
}
#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
{
  gf_val_32_t rv = 0;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
  b = _mm_insert_epi32 (a, b8, 0);

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));

  /* Do the initial multiply */

  result = _mm_clmulepi64_si128 (a, b, 0);

  /* Ben: Do prim_poly reduction twice.  We are guaranteed that we will only
     have to do the reduction at most twice, because (w-2)/z == 2, where
     z is equal to the number of zeros after the leading 1.

     _mm_clmulepi64_si128 is the carryless multiply operation.  Here
     _mm_srli_si128 shifts the result to the right by 1 byte.  This allows
     us to multiply the prim_poly by the leading bits of the result.  We
     then xor the result of that operation back with the result. */

  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);

  /* Extracts 32 bit value from result. */

  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));

  return rv;
}
#endif
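/* A quick degree argument for why two reductions suffice here (a sketch,
   assuming a polynomial like the default 0x11d, whose bits 5-7 are zero):
   the carryless product of two 8-bit values has degree at most 14.  The
   first step multiplies the overflow bits (degrees 8-14) by the polynomial
   and xors; the x^8 term cancels those bits exactly, and what it leaves
   behind has degree at most 6+4 = 10.  The second step reduces the
   remaining overflow (degrees 8-10) below degree 8. */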
#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
{
  gf_val_32_t rv = 0;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
  b = _mm_insert_epi32 (a, b8, 0);

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));

  /* Do the initial multiply */

  result = _mm_clmulepi64_si128 (a, b, 0);

  /* Do prim_poly reduction three times. */

  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);

  /* Extracts 32 bit value from result. */

  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));

  return rv;
}
#endif
#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
{
  gf_val_32_t rv = 0;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
  b = _mm_insert_epi32 (a, b8, 0);

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));

  /* Do the initial multiply */

  result = _mm_clmulepi64_si128 (a, b, 0);

  /* Do prim_poly reduction four times. */

  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
  result = _mm_xor_si128 (result, w);

  /* Extracts 32 bit value from result. */

  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));

  return rv;
}
#endif
static
void
gf_w8_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  gf_region_data rd;
  uint8_t *s8;
  uint8_t *d8;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
  gf_do_initial_region_alignment(&rd);

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  if (xor) {
    while (d8 < ((uint8_t *) rd.d_top)) {
      *d8 ^= gf->multiply.w32(gf, val, *s8);
      d8++;
      s8++;
    }
  } else {
    while (d8 < ((uint8_t *) rd.d_top)) {
      *d8 = gf->multiply.w32(gf, val, *s8);
      d8++;
      s8++;
    }
  }
  gf_do_final_region_alignment(&rd);
}
#if defined(INTEL_SSE4_PCLMUL)
static
void
gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  gf_region_data rd;
  uint8_t *s8;
  uint8_t *d8;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
  gf_do_initial_region_alignment(&rd);

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  if (xor) {
    while (d8 < ((uint8_t *) rd.d_top)) {
      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
      d8++;
      s8++;
    }
  } else {
    while (d8 < ((uint8_t *) rd.d_top)) {
      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
      d8++;
      s8++;
    }
  }
  gf_do_final_region_alignment(&rd);
}
#endif
#if defined(INTEL_SSE4_PCLMUL)
static
void
gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  gf_region_data rd;
  uint8_t *s8;
  uint8_t *d8;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
  gf_do_initial_region_alignment(&rd);

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  if (xor) {
    while (d8 < ((uint8_t *) rd.d_top)) {
      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
      d8++;
      s8++;
    }
  } else {
    while (d8 < ((uint8_t *) rd.d_top)) {
      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
      d8++;
      s8++;
    }
  }
  gf_do_final_region_alignment(&rd);
}
#endif
#if defined(INTEL_SSE4_PCLMUL)
static
void
gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  gf_region_data rd;
  uint8_t *s8;
  uint8_t *d8;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
  gf_do_initial_region_alignment(&rd);

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  if (xor) {
    while (d8 < ((uint8_t *) rd.d_top)) {
      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
      d8++;
      s8++;
    }
  } else {
    while (d8 < ((uint8_t *) rd.d_top)) {
      b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
      result = _mm_xor_si128 (result, w);
      *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
      d8++;
      s8++;
    }
  }
  gf_do_final_region_alignment(&rd);
}
#endif
/* ------------------------------------------------------------
   IMPLEMENTATION: SHIFT:

   JSP: The world's dumbest multiplication algorithm.  I only
   include it for completeness.  It does have the feature that it requires
   no extra memory.
*/

static
inline
uint32_t
gf_w8_shift_multiply (gf_t *gf, uint32_t a8, uint32_t b8)
{
  uint16_t product, i, pp, a, b;
  gf_internal_t *h;

  a = a8;
  b = b8;
  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  product = 0;

  for (i = 0; i < GF_FIELD_WIDTH; i++) {
    if (a & (1 << i)) product ^= (b << i);
  }
  for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) {
    if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH));
  }
  return product;
}
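/* Worked example (a sketch, using the default polynomial 0x11d):
   a = 0x80, b = 0x02.  The carry-free product is 0x02 << 7 = 0x100.
   Bit 8 is set, so the reduction loop xors in pp << 0 = 0x11d, giving
   0x100 ^ 0x11d = 0x1d, i.e. x^7 * x = x^8 = 0x1d (mod 0x11d). */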
static
int gf_w8_cfm_init(gf_t *gf)
{
#if defined(INTEL_SSE4_PCLMUL)
  if (gf_cpu_supports_intel_pclmul) {
    gf_internal_t *h;

    h = (gf_internal_t *) gf->scratch;

    if ((0xe0 & h->prim_poly) == 0) {
      SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_2)
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_2)
    } else if ((0xc0 & h->prim_poly) == 0) {
      SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_3)
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_3)
    } else if ((0x80 & h->prim_poly) == 0) {
      SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_4)
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_4)
    } else {
      return 0;
    }
    return 1;
  }
#elif defined(ARM_NEON)
  if (gf_cpu_supports_arm_neon) {
    return gf_w8_neon_cfm_init(gf);
  }
#endif

  return 0;
}
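/* The masks above pick the cheapest reduction count the polynomial allows:
   if bits 5-7 of the polynomial (just below the leading x^8 term) are all
   zero, the overflow shrinks fast enough that two reductions always
   suffice; with only bits 6-7 zero it takes three, and with only bit 7
   zero it takes four.  Polynomials with bit 7 set are not supported by
   this method. */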
static
int gf_w8_shift_init(gf_t *gf)
{
  SET_FUNCTION(gf,multiply,w32,gf_w8_shift_multiply)    /* The others will be set automatically */
  return 1;
}
/* ------------------------------------------------------------
   IMPLEMENTATION: LOG_TABLE:

   JSP: Kevin wrote this, and I'm converting it to my structure.
*/

static
inline
uint32_t
gf_w8_logzero_multiply (gf_t *gf, uint32_t a, uint32_t b)
{
  struct gf_w8_logzero_table_data *ltd;

  ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]];
}
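/* The "log zero" trick: log_tbl[0] is set to an index past the end of the
   real antilog entries (512 in the extended variant), and the tail of
   antilog_tbl is zero-filled by gf_w8_log_init.  Any product involving 0
   then lands in the zeroed region and reads back 0, so the hot path needs
   no branch on zero operands.  The small variant keeps one explicit zero
   check (see gf_w8_logzero_small_multiply below). */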
static
inline
uint32_t
gf_w8_logzero_divide (gf_t *gf, uint32_t a, uint32_t b)
{
  struct gf_w8_logzero_table_data *ltd;

  ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return ltd->div_tbl[ltd->log_tbl[a] - ltd->log_tbl[b]];
}

static
inline
uint32_t
gf_w8_logzero_small_multiply (gf_t *gf, uint32_t a, uint32_t b)
{
  struct gf_w8_logzero_small_table_data *std;

  std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private;
  if (b == 0) return 0;
  return std->antilog_tbl[std->log_tbl[a] + std->log_tbl[b]];
}

static
inline
uint32_t
gf_w8_logzero_small_divide (gf_t *gf, uint32_t a, uint32_t b)
{
  struct gf_w8_logzero_small_table_data *std;

  std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return std->div_tbl[std->log_tbl[a] - std->log_tbl[b]];
}

static
inline
uint32_t
gf_w8_log_multiply (gf_t *gf, uint32_t a, uint32_t b)
{
  struct gf_w8_logtable_data *ltd;

  ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
  return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl[(unsigned)(ltd->log_tbl[a] + ltd->log_tbl[b])];
}

static
inline
uint32_t
gf_w8_log_divide (gf_t *gf, uint32_t a, uint32_t b)
{
  int log_sum = 0;
  struct gf_w8_logtable_data *ltd;

  if (a == 0 || b == 0) return 0;
  ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;

  log_sum = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE);
  return (ltd->antilog_tbl[log_sum]);
}

static
uint32_t
gf_w8_log_inverse (gf_t *gf, uint32_t a)
{
  struct gf_w8_logtable_data *ltd;

  ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ltd->inv_tbl[a]);
}

static
uint32_t
gf_w8_logzero_inverse (gf_t *gf, uint32_t a)
{
  struct gf_w8_logzero_table_data *ltd;

  ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ltd->inv_tbl[a]);
}

static
uint32_t
gf_w8_logzero_small_inverse (gf_t *gf, uint32_t a)
{
  struct gf_w8_logzero_small_table_data *std;

  std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return (std->inv_tbl[a]);
}
static
void
gf_w8_log_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  int i;
  uint8_t lv;
  uint8_t *s8, *d8;
  struct gf_w8_logtable_data *ltd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;

  lv = ltd->log_tbl[val];

  if (xor) {
    for (i = 0; i < bytes; i++) {
      d8[i] ^= (s8[i] == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[s8[i]]]);
    }
  } else {
    for (i = 0; i < bytes; i++) {
      d8[i] = (s8[i] == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[s8[i]]]);
    }
  }
}
static
void
gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  int i;
  uint8_t lv;
  uint8_t *s8, *d8;
  struct gf_w8_logzero_table_data *ltd;
  struct gf_w8_logzero_small_table_data *std;
  short *log;
  uint8_t *alt;
  gf_internal_t *h;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  if (h->mult_type == GF_MULT_LOG_ZERO) {
    std = (struct gf_w8_logzero_small_table_data *) h->private;
    log = std->log_tbl;
    alt = std->antilog_tbl;
  } else {
    ltd = (struct gf_w8_logzero_table_data *) h->private;
    log = ltd->log_tbl;
    alt = ltd->antilog_tbl;
  }
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;

  lv = log[val];

  if (xor) {
    for (i = 0; i < bytes; i++) {
      d8[i] ^= (alt[lv + log[s8[i]]]);
    }
  } else {
    for (i = 0; i < bytes; i++) {
      d8[i] = (alt[lv + log[s8[i]]]);
    }
  }
}
static
int gf_w8_log_init(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_w8_logtable_data *ltd = NULL;
  struct gf_w8_logzero_table_data *ztd = NULL;
  struct gf_w8_logzero_small_table_data *std = NULL;
  uint8_t *alt;
  uint8_t *inv;
  int i, b;
  int check = 0;

  h = (gf_internal_t *) gf->scratch;
  if (h->mult_type == GF_MULT_LOG_TABLE) {
    ltd = (struct gf_w8_logtable_data *) h->private;
    alt = ltd->antilog_tbl;
    inv = ltd->inv_tbl;
  } else if (h->mult_type == GF_MULT_LOG_ZERO) {
    std = (struct gf_w8_logzero_small_table_data *) h->private;
    alt = std->antilog_tbl;
    std->div_tbl = (alt + 255);
    inv = std->inv_tbl;
  } else {
    ztd = (struct gf_w8_logzero_table_data *) h->private;
    alt = ztd->antilog_tbl;
    ztd->inv_tbl = (alt + 512 + 256);
    ztd->div_tbl = (alt + 255);
    inv = ztd->inv_tbl;
  }

  for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) {
    if (h->mult_type == GF_MULT_LOG_TABLE)
      ltd->log_tbl[i] = 0;
    else if (h->mult_type == GF_MULT_LOG_ZERO)
      std->log_tbl[i] = 0;
    else
      ztd->log_tbl[i] = 0;
  }

  if (h->mult_type == GF_MULT_LOG_TABLE) {
    ltd->log_tbl[0] = 0;
  } else if (h->mult_type == GF_MULT_LOG_ZERO) {
    std->log_tbl[0] = 510;
  } else {
    ztd->log_tbl[0] = 512;
  }

  b = 1;
  for (i = 0; i < GF_MULT_GROUP_SIZE; i++) {
    if (h->mult_type == GF_MULT_LOG_TABLE) {
      if (ltd->log_tbl[b] != 0) check = 1;
      ltd->log_tbl[b] = i;
    } else if (h->mult_type == GF_MULT_LOG_ZERO) {
      if (std->log_tbl[b] != 0) check = 1;
      std->log_tbl[b] = i;
    } else {
      if (ztd->log_tbl[b] != 0) check = 1;
      ztd->log_tbl[b] = i;
    }
    alt[i] = b;
    alt[i+GF_MULT_GROUP_SIZE] = b;
    b <<= 1;
    if (b & GF_FIELD_SIZE) {
      b = b ^ h->prim_poly;
    }
  }
  if (check) {
    _gf_errno = GF_E_LOGPOLY;
    return 0;
  }

  if (h->mult_type == GF_MULT_LOG_ZERO) bzero(alt+510, 255);

  if (h->mult_type == GF_MULT_LOG_ZERO_EXT) {
    bzero(alt+512, 255);
    alt[512+512] = 0;
  }

  inv[0] = 0;  /* Not really, but we need to fill it with something */
  i = 1;
  b = GF_MULT_GROUP_SIZE;
  do {
    inv[i] = alt[b];
    i <<= 1;
    if (i & (1 << 8)) i ^= h->prim_poly;
    b--;
  } while (i != 1);

  if (h->mult_type == GF_MULT_LOG_TABLE) {
    SET_FUNCTION(gf,inverse,w32,gf_w8_log_inverse)
    SET_FUNCTION(gf,divide,w32,gf_w8_log_divide)
    SET_FUNCTION(gf,multiply,w32,gf_w8_log_multiply)
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_log_multiply_region)
  } else if (h->mult_type == GF_MULT_LOG_ZERO) {
    SET_FUNCTION(gf,inverse,w32,gf_w8_logzero_small_inverse)
    SET_FUNCTION(gf,divide,w32,gf_w8_logzero_small_divide)
    SET_FUNCTION(gf,multiply,w32,gf_w8_logzero_small_multiply)
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_logzero_multiply_region)
  } else {
    SET_FUNCTION(gf,inverse,w32,gf_w8_logzero_inverse)
    SET_FUNCTION(gf,divide,w32,gf_w8_logzero_divide)
    SET_FUNCTION(gf,multiply,w32,gf_w8_logzero_multiply)
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_logzero_multiply_region)
  }
  return 1;
}
/* ------------------------------------------------------------
   IMPLEMENTATION: FULL_TABLE:

   JSP: Kevin wrote this, and I'm converting it to my structure.
*/

static
gf_val_32_t
gf_w8_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_single_table_data *ftd;

  ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ftd->multtable[a][b]);
}

static
gf_val_32_t
gf_w8_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_single_table_data *ftd;

  ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ftd->divtable[a][b]);
}

static
gf_val_32_t
gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_default_data *ftd;

  ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ftd->multtable[a][b]);
}

#if defined(INTEL_SSSE3) || defined(ARM_NEON)
static
gf_val_32_t
gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_default_data *ftd;

  ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ftd->divtable[a][b]);
}
#endif

static
gf_val_32_t
gf_w8_double_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_double_table_data *ftd;

  ftd = (struct gf_w8_double_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ftd->mult[a][b]);
}

static
gf_val_32_t
gf_w8_double_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_double_table_data *ftd;

  ftd = (struct gf_w8_double_table_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ftd->div[a][b]);
}
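/* Memory trade-offs of the table variants, roughly: the single table keeps
   full 256x256 multiply and divide tables (64 KB each, one lookup per
   byte).  The double table indexes a 16-bit row by two source bytes at
   once, which costs on the order of 32 MB of uint16_t entries; the lazy
   variant instead stores a 64 KB single-byte table and expands the row
   for one val into a 128 KB two-byte table at multiply_region time. */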
static
void
gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint16_t *base;
  uint32_t b, c, vc, vb;
  gf_internal_t *h;
  struct gf_w8_double_table_data *dtd;
  struct gf_w8_double_table_lazy_data *ltd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) (gf->scratch);
  if (h->region_type & GF_REGION_LAZY) {
    ltd = (struct gf_w8_double_table_lazy_data *) h->private;
    base = ltd->mult;
    for (b = 0; b < GF_FIELD_SIZE; b++) {
      vb = (ltd->smult[val][b] << 8);
      for (c = 0; c < GF_FIELD_SIZE; c++) {
        vc = ltd->smult[val][c];
        base[(b << 8)| c] = (vb | vc);
      }
    }
  } else {
    dtd = (struct gf_w8_double_table_data *) h->private;
    base = &(dtd->mult[val][0]);
  }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);
  gf_two_byte_region_table_multiply(&rd, base);
  gf_do_final_region_alignment(&rd);
}
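/* gf_two_byte_region_table_multiply is the support routine shared by the
   table implementations: it walks the region 16 bits at a time, using each
   pair of source bytes as one index into base[], whose 16-bit entries hold
   both products, halving the number of lookups versus a byte-at-a-time
   table. */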
static
gf_val_32_t
gf_w8_double_table_lazy_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_double_table_lazy_data *ftd;

  ftd = (struct gf_w8_double_table_lazy_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ftd->smult[a][b]);
}

static
gf_val_32_t
gf_w8_double_table_lazy_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_double_table_lazy_data *ftd;

  ftd = (struct gf_w8_double_table_lazy_data *) ((gf_internal_t *) gf->scratch)->private;
  return (ftd->div[a][b]);
}
static
void
gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  int i;
  uint8_t *s8, *d8;
  struct gf_w8_single_table_data *ftd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;

  if (xor) {
    for (i = 0; i < bytes; i++) {
      d8[i] ^= ftd->multtable[s8[i]][val];
    }
  } else {
    for (i = 0; i < bytes; i++) {
      d8[i] = ftd->multtable[s8[i]][val];
    }
  }
}
#ifdef INTEL_SSSE3
static
void
gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint8_t *bh, *bl, *sptr, *dptr;
  __m128i loset, t1, r, va, mth, mtl;
  struct gf_w8_half_table_data *htd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) (gf->scratch))->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  bh = (uint8_t *) htd->high;
  bh += (val << 4);
  bl = (uint8_t *) htd->low;
  bl += (val << 4);

  sptr = rd.s_start;
  dptr = rd.d_start;

  mth = _mm_loadu_si128 ((__m128i *)(bh));
  mtl = _mm_loadu_si128 ((__m128i *)(bl));
  loset = _mm_set1_epi8 (0x0f);

  if (xor) {
    while (sptr < (uint8_t *) rd.s_top) {
      va = _mm_load_si128 ((__m128i *)(sptr));
      t1 = _mm_and_si128 (loset, va);
      r = _mm_shuffle_epi8 (mtl, t1);
      va = _mm_srli_epi64 (va, 4);
      t1 = _mm_and_si128 (loset, va);
      r = _mm_xor_si128 (r, _mm_shuffle_epi8 (mth, t1));
      va = _mm_load_si128 ((__m128i *)(dptr));
      r = _mm_xor_si128 (r, va);
      _mm_store_si128 ((__m128i *)(dptr), r);
      dptr += 16;
      sptr += 16;
    }
  } else {
    while (sptr < (uint8_t *) rd.s_top) {
      va = _mm_load_si128 ((__m128i *)(sptr));
      t1 = _mm_and_si128 (loset, va);
      r = _mm_shuffle_epi8 (mtl, t1);
      va = _mm_srli_epi64 (va, 4);
      t1 = _mm_and_si128 (loset, va);
      r = _mm_xor_si128 (r, _mm_shuffle_epi8 (mth, t1));
      _mm_store_si128 ((__m128i *)(dptr), r);
      dptr += 16;
      sptr += 16;
    }
  }

  gf_do_final_region_alignment(&rd);
}
#endif
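/* A sketch of how the SSSE3 kernel above works: htd->low[val] and
   htd->high[val] are 16-byte tables holding val times every possible low
   nibble and high nibble.  _mm_shuffle_epi8 treats each byte of t1 as an
   index into that table, so a single shuffle performs sixteen nibble
   lookups in parallel; xoring the low-nibble and high-nibble products
   rebuilds the full 8-bit product for all sixteen bytes at once. */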
/* ------------------------------------------------------------
   IMPLEMENTATION: SPLIT_TABLE (4,8):
*/

static
gf_val_32_t
gf_w8_split_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  struct gf_w8_half_table_data *htd;
  htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) gf->scratch)->private;

  return htd->high[b][a>>4] ^ htd->low[b][a&0xf];
}

static
void
gf_w8_split_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  int i;
  uint8_t *s8, *d8;
  struct gf_w8_half_table_data *htd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) gf->scratch)->private;
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;

  if (xor) {
    for (i = 0; i < bytes; i++) {
      d8[i] ^= (htd->high[val][s8[i]>>4] ^ htd->low[val][s8[i]&0xf]);
    }
  } else {
    for (i = 0; i < bytes; i++) {
      d8[i] = (htd->high[val][s8[i]>>4] ^ htd->low[val][s8[i]&0xf]);
    }
  }
}
static
int gf_w8_split_init(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_w8_half_table_data *htd;
  int a, b;

  h = (gf_internal_t *) gf->scratch;
  htd = (struct gf_w8_half_table_data *)h->private;

  bzero(htd->high, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE);
  bzero(htd->low, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE);

  for (a = 1; a < GF_FIELD_SIZE; a++) {
    for (b = 1; b < GF_HALF_SIZE; b++) {
      htd->low[a][b] = gf_w8_shift_multiply(gf,a,b);
      htd->high[a][b] = gf_w8_shift_multiply(gf,a,b<<4);
    }
  }

  SET_FUNCTION(gf,multiply,w32,gf_w8_split_multiply)

#if defined(INTEL_SSSE3)
  if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) {
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
  } else {
#elif defined(ARM_NEON)
  if (gf_cpu_supports_arm_neon && !(h->region_type & GF_REGION_NOSIMD)) {
    gf_w8_neon_split_init(gf);
  } else {
#endif
  SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region)
  if(h->region_type & GF_REGION_SIMD)
    return 0;
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
  }
#endif

  return 1;
}
/* JSP: This is disgusting, but it is what it is.  If there is no SSE,
   then the default is equivalent to single table.  If there is SSE, then
   we use the "gf_w8_default_data" which is a hybrid of SPLIT & TABLE. */
static
int gf_w8_table_init(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_w8_single_table_data *ftd = NULL;
  struct gf_w8_double_table_data *dtd = NULL;
  struct gf_w8_double_table_lazy_data *ltd = NULL;
  struct gf_w8_default_data *dd = NULL;
  int a, b, c, prod, scase;

  h = (gf_internal_t *) gf->scratch;

  if (h->mult_type == GF_MULT_DEFAULT &&
      (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) {
    dd = (struct gf_w8_default_data *)h->private;
    scase = 3;
    bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
    bzero(dd->low, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
    bzero(dd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
    bzero(dd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
  } else if (h->mult_type == GF_MULT_DEFAULT ||
             h->region_type == 0 || (h->region_type & GF_REGION_CAUCHY)) {
    ftd = (struct gf_w8_single_table_data *)h->private;
    scase = 0;
    bzero(ftd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
    bzero(ftd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
  } else if (h->region_type == GF_REGION_DOUBLE_TABLE) {
    dtd = (struct gf_w8_double_table_data *)h->private;
    scase = 1;
    bzero(dtd->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
    bzero(dtd->mult, sizeof(uint16_t) * GF_FIELD_SIZE * GF_FIELD_SIZE * GF_FIELD_SIZE);
  } else if (h->region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) {
    ltd = (struct gf_w8_double_table_lazy_data *)h->private;
    scase = 2;
    bzero(ltd->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
    bzero(ltd->smult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
  } else {
    fprintf(stderr, "Internal error in gf_w8_table_init\n");
    assert(0);
  }

  for (a = 1; a < GF_FIELD_SIZE; a++) {
    for (b = 1; b < GF_FIELD_SIZE; b++) {
      prod = gf_w8_shift_multiply(gf,a,b);
      switch (scase) {
      case 0:
        ftd->multtable[a][b] = prod;
        ftd->divtable[prod][b] = a;
        break;
      case 1:
        dtd->div[prod][b] = a;
        for (c = 0; c < GF_FIELD_SIZE; c++) {
          dtd->mult[a][(c<<8)|b] |= prod;
          dtd->mult[a][(b<<8)|c] |= (prod<<8);
        }
        break;
      case 2:
        ltd->div[prod][b] = a;
        ltd->smult[a][b] = prod;
        break;
      case 3:
        dd->multtable[a][b] = prod;
        dd->divtable[prod][b] = a;
        if ((b & 0xf) == b) { dd->low[a][b] = prod; }
        if ((b & 0xf0) == b) { dd->high[a][b>>4] = prod; }
        break;
      }
    }
  }

  SET_FUNCTION(gf,inverse,w32,NULL) /* Will set from divide */
  switch (scase) {
  case 0:
    SET_FUNCTION(gf,divide,w32,gf_w8_table_divide)
    SET_FUNCTION(gf,multiply,w32,gf_w8_table_multiply)
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_table_multiply_region)
    break;
  case 1:
    SET_FUNCTION(gf,divide,w32,gf_w8_double_table_divide)
    SET_FUNCTION(gf,multiply,w32,gf_w8_double_table_multiply)
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_double_table_multiply_region)
    break;
  case 2:
    SET_FUNCTION(gf,divide,w32,gf_w8_double_table_lazy_divide)
    SET_FUNCTION(gf,multiply,w32,gf_w8_double_table_lazy_multiply)
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_double_table_multiply_region)
    break;
  case 3:
#if defined(INTEL_SSSE3) || defined(ARM_NEON)
    if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
      SET_FUNCTION(gf,divide,w32,gf_w8_default_divide)
      SET_FUNCTION(gf,multiply,w32,gf_w8_default_multiply)
#if defined(INTEL_SSSE3)
      if (gf_cpu_supports_intel_ssse3) {
        SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
      }
#elif defined(ARM_NEON)
      if (gf_cpu_supports_arm_neon) {
        gf_w8_neon_split_init(gf);
      }
#endif
    }
#endif
    break;
  }
  return 1;
}
static
void
gf_w8_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint8_t val0 = val & 0x0f;
  uint8_t val1 = (val & 0xf0) >> 4;
  int sub_reg_size;
  gf_region_data rd;

  if (val == 0) {
    if (xor) return;
    bzero(dest, bytes);
    return;
  }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
  gf_do_initial_region_alignment(&rd);

  sub_reg_size = ((uint8_t *)rd.d_top - (uint8_t *)rd.d_start) / 2;

  base_gf->multiply_region.w32(base_gf, rd.s_start, rd.d_start, val0, sub_reg_size, xor);
  base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, rd.d_start, val1, sub_reg_size, 1);
  base_gf->multiply_region.w32(base_gf, rd.s_start, (uint8_t *)rd.d_start+sub_reg_size, val1, sub_reg_size, xor);
  base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, (uint8_t *)rd.d_start+sub_reg_size, val0, sub_reg_size, 1);
  base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, (uint8_t *)rd.d_start+sub_reg_size, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);

  gf_do_final_region_alignment(&rd);
}
static
gf_val_32_t
gf_w8_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint8_t b0 = b & 0x0f;
  uint8_t b1 = (b & 0xf0) >> 4;
  uint8_t a0 = a & 0x0f;
  uint8_t a1 = (a & 0xf0) >> 4;
  uint8_t a1b1;

  a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

  return ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
          ((base_gf->multiply.w32(base_gf, a1, b0) ^
            base_gf->multiply.w32(base_gf, a0, b1) ^
            base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4));
}
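/* Composite multiplication treats a and b as degree-1 polynomials over
   GF(2^4): a = a1*x + a0, b = b1*x + b0.  Reducing their product modulo
   p(x) = x^2 + s*x + 1 (s is h->prim_poly here) folds the a1*b1*x^2 term
   back in, giving (a0*b0 + a1*b1) for the low nibble and
   (a1*b0 + a0*b1 + s*a1*b1) for the high nibble, which is exactly what
   the return expression above computes. */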
static
gf_val_32_t
gf_w8_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  uint8_t b0 = b & 0x0f;
  uint8_t b1 = (b & 0xf0) >> 4;
  uint8_t a0 = a & 0x0f;
  uint8_t a1 = (a & 0xf0) >> 4;
  uint8_t a1b1, *mt;
  struct gf_w8_composite_data *cd;

  cd = (struct gf_w8_composite_data *) h->private;
  mt = cd->mult_table;

  a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1);

  return ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
          ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^
            GF_W4_INLINE_MULTDIV(mt, a0, b1) ^
            GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4));
}
/*
 * Composite field division trick (explained in 2007 tech report)
 *
 * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1
 *
 * let c = b^-1
 *
 * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0)
 *
 * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1
 *
 * let d = b1c1 and d+1 = b0c0
 *
 * solve s*b1c1+b1c0+b0c1 = 0
 *
 * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
 *
 * c0 = (d+1)b0^-1
 * c1 = d*b1^-1
 */
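/* Sanity check of the solution above: with c0 = (d+1)*b0^-1 and
   c1 = d*b1^-1, the constant term is b1c1 + b0c0 = d + (d+1) = 1 (all
   arithmetic is over GF(2), so d + d = 0), and substituting d back into
   s*b1c1 + b1c0 + b0c1 makes the x term vanish, so c really is b^-1. */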
static
gf_val_32_t
gf_w8_composite_inverse(gf_t *gf, gf_val_32_t a)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint8_t a0 = a & 0x0f;
  uint8_t a1 = (a & 0xf0) >> 4;
  uint8_t c0, c1, c, d, tmp;
  uint8_t a0inv, a1inv;

  if (a0 == 0) {
    a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf;
    c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
    c1 = a1inv;
  } else if (a1 == 0) {
    c0 = base_gf->inverse.w32(base_gf, a0);
    c1 = 0;
  } else {
    a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf;
    a0inv = base_gf->inverse.w32(base_gf, a0) & 0xf;

    d = base_gf->multiply.w32(base_gf, a1, a0inv) & 0xf;

    tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly) & 0xf;
    tmp = base_gf->inverse.w32(base_gf, tmp) & 0xf;

    d = base_gf->multiply.w32(base_gf, d, tmp) & 0xf;

    c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv) & 0xf;
    c1 = base_gf->multiply.w32(base_gf, d, a1inv) & 0xf;
  }

  c = c0 | (c1 << 4);

  return c;
}
static
void
gf_w8_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  gf_region_data rd;
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint8_t b0 = val & 0x0f;
  uint8_t b1 = (val & 0xf0) >> 4;
  uint8_t *s8;
  uint8_t *d8;
  uint8_t *mt;
  uint8_t a0, a1, a1b1;
  struct gf_w8_composite_data *cd;

  cd = (struct gf_w8_composite_data *) h->private;

  if (val == 0) {
    if (xor) return;
    bzero(dest, bytes);
    return;
  }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
  gf_do_initial_region_alignment(&rd);

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  mt = cd->mult_table;
  if (mt == NULL) {
    if (xor) {
      while (d8 < (uint8_t *) rd.d_top) {
        a0 = *s8 & 0x0f;
        a1 = (*s8 & 0xf0) >> 4;
        a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

        *d8 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
                ((base_gf->multiply.w32(base_gf, a1, b0) ^
                  base_gf->multiply.w32(base_gf, a0, b1) ^
                  base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4));
        s8++;
        d8++;
      }
    } else {
      while (d8 < (uint8_t *) rd.d_top) {
        a0 = *s8 & 0x0f;
        a1 = (*s8 & 0xf0) >> 4;
        a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

        *d8 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
               ((base_gf->multiply.w32(base_gf, a1, b0) ^
                 base_gf->multiply.w32(base_gf, a0, b1) ^
                 base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4));
        s8++;
        d8++;
      }
    }
  } else {
    if (xor) {
      while (d8 < (uint8_t *) rd.d_top) {
        a0 = *s8 & 0x0f;
        a1 = (*s8 & 0xf0) >> 4;
        a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1);

        *d8 ^= ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
                ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^
                  GF_W4_INLINE_MULTDIV(mt, a0, b1) ^
                  GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4));
        s8++;
        d8++;
      }
    } else {
      while (d8 < (uint8_t *) rd.d_top) {
        a0 = *s8 & 0x0f;
        a1 = (*s8 & 0xf0) >> 4;
        a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1);

        *d8 = ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
               ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^
                 GF_W4_INLINE_MULTDIV(mt, a0, b1) ^
                 GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4));
        s8++;
        d8++;
      }
    }
  }
  gf_do_final_region_alignment(&rd);
}
static
int gf_w8_composite_init(gf_t *gf)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  struct gf_w8_composite_data *cd;

  if (h->base_gf == NULL) return 0;

  cd = (struct gf_w8_composite_data *) h->private;
  cd->mult_table = gf_w4_get_mult_table(h->base_gf);

  if (h->region_type & GF_REGION_ALTMAP) {
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_composite_multiply_region_alt)
  } else {
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_composite_multiply_region)
  }

  if (cd->mult_table == NULL) {
    SET_FUNCTION(gf,multiply,w32,gf_w8_composite_multiply_recursive)
  } else {
    SET_FUNCTION(gf,multiply,w32,gf_w8_composite_multiply_inline)
  }
  SET_FUNCTION(gf,divide,w32,NULL)
  SET_FUNCTION(gf,inverse,w32,gf_w8_composite_inverse)

  return 1;
}
/* ------------------------------------------------------------
   IMPLEMENTATION: BYTWO:
*/

static
inline
gf_val_32_t
gf_w8_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  uint32_t prod, pp, pmask, amask;
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  prod = 0;
  pmask = 0x80;
  amask = 0x80;

  while (amask != 0) {
    if (prod & pmask) {
      prod = ((prod << 1) ^ pp);
    } else {
      prod <<= 1;
    }
    if (a & amask) prod ^= b;
    amask >>= 1;
  }
  return prod;
}
static
inline
gf_val_32_t
gf_w8_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  uint32_t prod, pp, bmask;
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  prod = 0;
  bmask = 0x80;

  while (1) {
    if (a & 1) prod ^= b;
    a >>= 1;
    if (a == 0) return prod;
    if (b & bmask) {
      b = ((b << 1) ^ pp);
    } else {
      b <<= 1;
    }
  }
}
static
void
gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint64_t *s64, *d64, t1, t2, ta, prod, amask;
  gf_region_data rd;
  struct gf_w8_bytwo_data *btd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;

  if (xor) {
    while (s64 < (uint64_t *) rd.s_top) {
      prod = 0;
      amask = 0x80;
      ta = *s64;
      while (amask != 0) {
        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
        if (val & amask) prod ^= ta;
        amask >>= 1;
      }
      *d64 ^= prod;
      d64++;
      s64++;
    }
  } else {
    while (s64 < (uint64_t *) rd.s_top) {
      prod = 0;
      amask = 0x80;
      ta = *s64;
      while (amask != 0) {
        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
        if (val & amask) prod ^= ta;
        amask >>= 1;
      }
      *d64 = prod;
      d64++;
      s64++;
    }
  }
  gf_do_final_region_alignment(&rd);
}
#define BYTWO_P_ONESTEP {\
      SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
      t1 = _mm_and_si128(v, one); \
      t1 = _mm_sub_epi8(t1, one); \
      t1 = _mm_and_si128(t1, ta); \
      prod = _mm_xor_si128(prod, t1); \
      v = _mm_srli_epi64(v, 1); }
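/* BYTWO_P_ONESTEP performs one Horner step, prod = prod*2 ^ (bit ? ta : 0),
   on all sixteen bytes at once.  The bit test is branch-free:
   (v & 1) - 1 is 0x00 for bytes whose low bit is 1 and 0xff for bytes
   whose low bit is 0, and that mask selects ta.  So ta is accumulated when
   the stored bit is 0, which is why gf_w8_bytwo_p_sse_multiply_region
   loads v with the bits of val complemented and reversed (vrev below):
   a set bit of val becomes a clear stored bit, and the walk proceeds from
   val's high bit down. */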
#ifdef INTEL_SSE2
static
void
gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  int i;
  uint8_t *s8, *d8;
  uint8_t vrev;
  __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;
  struct gf_w8_bytwo_data *btd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  vrev = 0;
  for (i = 0; i < 8; i++) {
    vrev <<= 1;
    if (!(val & (1 << i))) vrev |= 1;
  }

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);
  m2 = _mm_set1_epi8((btd->mask2)&0xff);
  one = _mm_set1_epi8(1);

  while (d8 < (uint8_t *) rd.d_top) {
    prod = _mm_setzero_si128();
    v = _mm_set1_epi8(vrev);
    ta = _mm_load_si128((__m128i *) s8);
    tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
    BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP;
    _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
    d8 += 16;
    s8 += 16;
  }

  gf_do_final_region_alignment(&rd);
}
#endif
#ifdef INTEL_SSE2
static
void
gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, m2, t1, t2, va;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);
  m2 = _mm_set1_epi8((btd->mask2)&0xff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, m2, va, t1, t2);
    _mm_store_si128((__m128i *)d8, va);
    d8 += 16;
    s8 += 16;
  }
}
#endif
#ifdef INTEL_SSE2
static
void
gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, m2, t1, t2, va, vb;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);
  m2 = _mm_set1_epi8((btd->mask2)&0xff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, m2, va, t1, t2);
    vb = _mm_load_si128 ((__m128i *)(d8));
    vb = _mm_xor_si128(vb, va);
    _mm_store_si128((__m128i *)d8, vb);
    d8 += 16;
    s8 += 16;
  }
}
#endif
#ifdef INTEL_SSE2
static
void
gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  int itb;
  uint8_t *d8, *s8;
  __m128i pp, m1, m2, t1, t2, va, vb;
  struct gf_w8_bytwo_data *btd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;

  if (val == 2) {
    if (xor) {
      gf_w8_bytwo_b_sse_region_2_xor(&rd, btd);
    } else {
      gf_w8_bytwo_b_sse_region_2_noxor(&rd, btd);
    }
    gf_do_final_region_alignment(&rd);
    return;
  }

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  pp = _mm_set1_epi8(btd->prim_poly&0xff);
  m1 = _mm_set1_epi8((btd->mask1)&0xff);
  m2 = _mm_set1_epi8((btd->mask2)&0xff);

  while (d8 < (uint8_t *) rd.d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8));
    itb = val;
    while (1) {
      if (itb & 1) vb = _mm_xor_si128(vb, va);
      itb >>= 1;
      if (itb == 0) break;
      SSE_AB2(pp, m1, m2, va, t1, t2);
    }
    _mm_store_si128((__m128i *)d8, vb);
    d8 += 16;
    s8 += 16;
  }

  gf_do_final_region_alignment(&rd);
}
#endif
static
void
gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint64_t *s64, *d64, t1, t2, ta, tb, prod;
  struct gf_w8_bytwo_data *btd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;

  /* Each case below unrolls "val times eight packed bytes" as a fixed
     sequence of AB2 doublings and xors; the default case handles any
     other val with a generic double-and-add loop. */

  switch (val) {
  case 2:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= ta;
        d64++; s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = ta;
        d64++; s64++;
      }
    }
    break;
  case 3:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64; prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= (ta ^ prod);
        d64++; s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64; prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = (ta ^ prod);
        d64++; s64++;
      }
    }
    break;
  case 4:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= ta;
        d64++; s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = ta;
        d64++; s64++;
      }
    }
    break;
  case 5:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64; prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= (ta ^ prod);
        d64++; s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64; prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = (ta ^ prod);
        d64++; s64++;
      }
    }
    break;
  case 6:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= (ta ^ prod);
        d64++; s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = (ta ^ prod);
        d64++; s64++;
      }
    }
    break;
  case 7:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64; prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= (ta ^ prod);
        d64++; s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64; prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = (ta ^ prod);
        d64++; s64++;
      }
    }
    break;
  case 8:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= ta;
        d64++; s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = ta;
        d64++; s64++;
      }
    }
    break;
  case 9:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64; prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= (ta ^ prod);
        d64++; s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64; prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = (ta ^ prod);
        d64++; s64++;
      }
    }
    break;
  case 10:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= (ta ^ prod);
        d64++; s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = (ta ^ prod);
        d64++; s64++;
      }
    }
    break;
  case 11:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64; prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= (ta ^ prod);
        d64++; s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64; prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = (ta ^ prod);
        d64++; s64++;
      }
    }
    break;
  case 12:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= (ta ^ prod);
        d64++; s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = (ta ^ prod);
        d64++; s64++;
      }
    }
    break;
  case 13:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64; prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= (ta ^ prod);
        d64++; s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64; prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = (ta ^ prod);
        d64++; s64++;
      }
    }
    break;
  case 14:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= (ta ^ prod);
        d64++; s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = (ta ^ prod);
        d64++; s64++;
      }
    }
    break;
  case 15:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64; prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= (ta ^ prod);
        d64++; s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64; prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        prod ^= ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = (ta ^ prod);
        d64++; s64++;
      }
    }
    break;
  default:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        prod = *d64;
        ta = *s64;
        tb = val;
        while (1) {
          if (tb & 1) prod ^= ta;
          tb >>= 1;
          if (tb == 0) break;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        }
        *d64 = prod;
        d64++; s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        prod = 0;
        ta = *s64;
        tb = val;
        while (1) {
          if (tb & 1) prod ^= ta;
          tb >>= 1;
          if (tb == 0) break;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        }
        *d64 = prod;
        d64++; s64++;
      }
    }
    break;
  }

  gf_do_final_region_alignment(&rd);
}
static
int gf_w8_bytwo_init(gf_t *gf)
{
  gf_internal_t *h;
  uint64_t ip, m1, m2;
  struct gf_w8_bytwo_data *btd;

  h = (gf_internal_t *) gf->scratch;
  btd = (struct gf_w8_bytwo_data *) (h->private);
  ip = h->prim_poly & 0xff;
  m1 = 0xfe;
  m2 = 0x80;
  btd->prim_poly = 0;
  btd->mask1 = 0;
  btd->mask2 = 0;

  while (ip != 0) {
    btd->prim_poly |= ip;
    btd->mask1 |= m1;
    btd->mask2 |= m2;
    ip <<= GF_FIELD_WIDTH;
    m1 <<= GF_FIELD_WIDTH;
    m2 <<= GF_FIELD_WIDTH;
  }

  if (h->mult_type == GF_MULT_BYTWO_p) {
    SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_p_multiply)
#ifdef INTEL_SSE2
    if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_sse_multiply_region)
    } else {
#endif
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_nosse_multiply_region)
      if(h->region_type & GF_REGION_SIMD)
        return 0;
#ifdef INTEL_SSE2
    }
#endif
  } else {
    SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_b_multiply)
#ifdef INTEL_SSE2
    if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_sse_multiply_region)
    } else {
#endif
      SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_nosse_multiply_region)
      if(h->region_type & GF_REGION_SIMD)
        return 0;
#ifdef INTEL_SSE2
    }
#endif
  }
  return 1;
}
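/* For the default polynomial 0x11d the loop above packs, per 64-bit word:
   btd->prim_poly = 0x1d1d1d1d1d1d1d1d, btd->mask1 = 0xfefefefefefefefe,
   and btd->mask2 = 0x8080808080808080, so AB2/SSE_AB2 can double all
   eight (or sixteen) bytes of a region word in a handful of operations. */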
/* ------------------------------------------------------------
   GENERAL ROUTINES.

   You don't need to error check here or in init, because it's done
   for you in gf_error_check().
*/
int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
  switch(mult_type)
  {
    case GF_MULT_DEFAULT:
      if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
        return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
      }
      return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
    case GF_MULT_TABLE:
      if (region_type == GF_REGION_CAUCHY) {
        return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
      }
      if (region_type == GF_REGION_DEFAULT) {
        return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
      }
      if (region_type & GF_REGION_DOUBLE_TABLE) {
        if (region_type == GF_REGION_DOUBLE_TABLE) {
          return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_data) + 64;
        } else if (region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) {
          return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_lazy_data) + 64;
        } else {
          return 0;
        }
      }
      return 0;
    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:
      return sizeof(gf_internal_t) + sizeof(struct gf_w8_bytwo_data);
    case GF_MULT_SPLIT_TABLE:
      if ((arg1 == 4 && arg2 == 8) || (arg1 == 8 && arg2 == 4)) {
        return sizeof(gf_internal_t) + sizeof(struct gf_w8_half_table_data) + 64;
      }
      return 0;
    case GF_MULT_LOG_TABLE:
      return sizeof(gf_internal_t) + sizeof(struct gf_w8_logtable_data) + 64;
    case GF_MULT_LOG_ZERO:
      return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_small_table_data) + 64;
    case GF_MULT_LOG_ZERO_EXT:
      return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_table_data) + 64;
    case GF_MULT_CARRY_FREE:
      return sizeof(gf_internal_t);
    case GF_MULT_SHIFT:
      return sizeof(gf_internal_t);
    case GF_MULT_COMPOSITE:
      return sizeof(gf_internal_t) + sizeof(struct gf_w8_composite_data) + 64;
    default:
      return 0;
  }
  return 0;
}
int gf_w8_init(gf_t *gf)
{
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;

  /* Allen: set default primitive polynomial / irreducible polynomial if needed */

  if (h->prim_poly == 0) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
      if (h->prim_poly == 0) return 0;   /* JSP: This shouldn't happen, but just in case. */
    } else {
      h->prim_poly = 0x11d;
    }
  }
  if (h->mult_type != GF_MULT_COMPOSITE) {
    h->prim_poly |= 0x100;
  }

  SET_FUNCTION(gf,multiply,w32,NULL)
  SET_FUNCTION(gf,divide,w32,NULL)
  SET_FUNCTION(gf,inverse,w32,NULL)
  SET_FUNCTION(gf,multiply_region,w32,NULL)
  SET_FUNCTION(gf,extract_word,w32,gf_w8_extract_word)

  switch(h->mult_type) {
    case GF_MULT_DEFAULT:
    case GF_MULT_TABLE:        if (gf_w8_table_init(gf) == 0) return 0; break;
    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:      if (gf_w8_bytwo_init(gf) == 0) return 0; break;
    case GF_MULT_LOG_ZERO:
    case GF_MULT_LOG_ZERO_EXT:
    case GF_MULT_LOG_TABLE:    if (gf_w8_log_init(gf) == 0) return 0; break;
    case GF_MULT_CARRY_FREE:   if (gf_w8_cfm_init(gf) == 0) return 0; break;
    case GF_MULT_SHIFT:        if (gf_w8_shift_init(gf) == 0) return 0; break;
    case GF_MULT_SPLIT_TABLE:  if (gf_w8_split_init(gf) == 0) return 0; break;
    case GF_MULT_COMPOSITE:    if (gf_w8_composite_init(gf) == 0) return 0; break;
    default: return 0;
  }

  if (h->divide_type == GF_DIVIDE_EUCLID) {
    SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse)
    SET_FUNCTION(gf,inverse,w32,gf_w8_euclid)
  } else if (h->divide_type == GF_DIVIDE_MATRIX) {
    SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse)
    SET_FUNCTION(gf,inverse,w32,gf_w8_matrix)
  }

  if (gf->divide.w32 == NULL) {
    SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse)
    if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w8_euclid)
  }

  if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w8_inverse_from_divide)

  if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) {
    SET_FUNCTION(gf,extract_word,w32,gf_w8_composite_extract_word)
  }

  if (h->region_type == GF_REGION_CAUCHY) {
    SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
    SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
  }

  if (gf->multiply_region.w32 == NULL) {
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_multiply_region_from_single)
  }

  return 1;
}
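/* A quick usage sketch (the public entry points live in gf_complete.h;
   callers never invoke gf_w8_init directly):

     gf_t gf;
     gf_init_easy(&gf, 8);
     uint8_t p = gf.multiply.w32(&gf, 0x53, 0xca);

   gf_init_easy selects GF_MULT_DEFAULT, which routes through
   gf_w8_table_init above. */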
/* Inline setup functions */
uint8_t *gf_w8_get_mult_table(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_w8_default_data *ftd;
  struct gf_w8_single_table_data *std;

  h = (gf_internal_t *) gf->scratch;
  if (gf->multiply.w32 == gf_w8_default_multiply) {
    ftd = (struct gf_w8_default_data *) h->private;
    return (uint8_t *) ftd->multtable;
  } else if (gf->multiply.w32 == gf_w8_table_multiply) {
    std = (struct gf_w8_single_table_data *) h->private;
    return (uint8_t *) std->multtable;
  }
  return NULL;
}
uint8_t *gf_w8_get_div_table(gf_t *gf)
{
  struct gf_w8_default_data *ftd;
  struct gf_w8_single_table_data *std;

  if (gf->multiply.w32 == gf_w8_default_multiply) {
    ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private;
    return (uint8_t *) ftd->divtable;
  } else if (gf->multiply.w32 == gf_w8_table_multiply) {
    std = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
    return (uint8_t *) std->divtable;
  }
  return NULL;
}