IDA_new/gf-complete/src/gf_w128.c

   1 /*
   2  * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
   3  * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
   4  * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
   5  *
   6  * gf_w128.c
   7  *
   8  * Routines for 128-bit Galois fields
   9  */
  10
  11 #include "gf_int.h"
  12 #include <stdio.h>
  13 #include <stdlib.h>
  14 #include "gf_cpu.h"
  15
  16 #define GF_FIELD_WIDTH (128)
  17
  18 #define two_x(a) {\
  19   a[0] <<= 1; \
  20   if (a[1] & 1ULL << 63) a[0] ^= 1; \
  21   a[1] <<= 1; }
  22
  23 #define a_get_b(a, i, b, j) {\
  24   a[i] = b[j]; \
  25   a[i + 1] = b[j + 1];}
  26
  27 #define set_zero(a, i) {\
  28   a[i] = 0; \
  29   a[i + 1] = 0;}
  30
  31 struct gf_w128_split_4_128_data {
  32   uint64_t last_value[2];
  33   uint64_t tables[2][32][16];
  34 };
  35
  36 struct gf_w128_split_8_128_data {
  37   uint64_t last_value[2];
  38   uint64_t tables[2][16][256];
  39 };
  40
  41 typedef struct gf_group_tables_s {
  42   gf_val_128_t m_table;
  43   gf_val_128_t r_table;
  44 } gf_group_tables_t;
  45
  46 #define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }
  47
  48 static
  49 void
  50 gf_w128_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,
  51 int xor)
  52 {
  53     uint32_t i;
  54     gf_val_128_t s128;
  55     gf_val_128_t d128;
  56     uint64_t c128[2];
  57     gf_region_data rd;
  58
  59     /* We only do this to check on alignment. */
  60     gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
  61
  62     if (val[0] == 0) {
  63       if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
  64       if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
  65     }
  66
  67     set_zero(c128, 0);
  68
  69     s128 = (gf_val_128_t) src;
  70     d128 = (gf_val_128_t) dest;
  71
  72     if (xor) {
  73       for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
  74         gf->multiply.w128(gf, &s128[i], val, c128);
  75         d128[i] ^= c128[0];
  76         d128[i+1] ^= c128[1];
  77       }
  78     } else {
  79       for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
  80         gf->multiply.w128(gf, &s128[i], val, &d128[i]);
  81       }
  82     }
  83 }
  84
  85 #if defined(INTEL_SSE4_PCLMUL)
  86 static
  87 void
  88 gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,
  89 int xor)
  90 {
  91     uint32_t i;
  92     gf_val_128_t s128;
  93     gf_val_128_t d128;
  94     gf_region_data rd;
  95     __m128i     a,b;
  96     __m128i     result0,result1;
  97     __m128i     prim_poly;
  98     __m128i     c,d,e,f;
  99     gf_internal_t * h = gf->scratch;
 100     prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
 101     /* We only do this to check on alignment. */
 102     gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
 103
 104     if (val[0] == 0) {
 105       if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
 106       if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
 107     }
 108
 109     s128 = (gf_val_128_t) src;
 110     d128 = (gf_val_128_t) dest;
 111
 112     if (xor) {
 113       for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
 114         a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
 115         b = _mm_insert_epi64 (a, val[1], 0);
 116         a = _mm_insert_epi64 (a, s128[i], 1);
 117         b = _mm_insert_epi64 (b, val[0], 1);
 118
 119         c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
 120         f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
 121         e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
 122         d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
 123
 124         /* now reusing a and b as temporary variables*/
 125         result0 = _mm_setzero_si128();
 126         result1 = result0;
 127
 128         result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
 129         a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
 130         result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));
 131
 132         a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
 133         result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
 134         result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
 135         /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce. */
 136
 137         a = _mm_srli_si128 (result0, 8);
 138         b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
 139         result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
 140         result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
 141
 142         a = _mm_insert_epi64 (result0, 0, 1);
 143         b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
 144         result1 = _mm_xor_si128 (result1, b);
 145         d128[i] ^= (uint64_t)_mm_extract_epi64(result1,1);
 146         d128[i+1] ^= (uint64_t)_mm_extract_epi64(result1,0);
 147       }
 148     } else {
 149       for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
 150         a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
 151         b = _mm_insert_epi64 (a, val[1], 0);
 152         a = _mm_insert_epi64 (a, s128[i], 1);
 153         b = _mm_insert_epi64 (b, val[0], 1);
 154
 155         c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
 156         f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
 157         e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
 158         d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
 159
 160         /* now reusing a and b as temporary variables*/
 161         result0 = _mm_setzero_si128();
 162         result1 = result0;
 163
 164         result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
 165         a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
 166         result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));
 167
 168         a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
 169         result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
 170         result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
 171         /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/
 172
 173         a = _mm_srli_si128 (result0, 8);
 174         b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
 175         result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
 176         result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
 177
 178         a = _mm_insert_epi64 (result0, 0, 1);
 179         b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
 180         result1 = _mm_xor_si128 (result1, b);
 181         d128[i] = (uint64_t)_mm_extract_epi64(result1,1);
 182         d128[i+1] = (uint64_t)_mm_extract_epi64(result1,0);
 183       }
 184     }
 185 }
 186 #endif
 187
 188 /*
 189  * Some w128 notes:
 190  * --Big Endian
 191  * --return values allocated beforehand
 192  */
 193
 194 #define GF_W128_IS_ZERO(val) (val[0] == 0 && val[1] == 0)
 195
 196 void
 197 gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
 198 {
 199   /* ordered highest bit to lowest l[0] l[1] r[0] r[1] */
 200   uint64_t pl[2], pr[2], ppl[2], ppr[2], i, a[2], bl[2], br[2], one, lbit;
 201   gf_internal_t *h;
 202
 203   h = (gf_internal_t *) gf->scratch;
 204
 205   if (GF_W128_IS_ZERO(a128) || GF_W128_IS_ZERO(b128)) {
 206     set_zero(c128, 0);
 207     return;
 208   }
 209
 210   a_get_b(a, 0, a128, 0);
 211   a_get_b(br, 0, b128, 0);
 212   set_zero(bl, 0);
 213
 214   one = 1;
 215   lbit = (one << 63);
 216
 217   set_zero(pl, 0);
 218   set_zero(pr, 0);
 219
 220   /* Allen: a*b for right half of a */
 221   for (i = 0; i < GF_FIELD_WIDTH/2; i++) {
 222     if (a[1] & (one << i)) {
 223       pl[1] ^= bl[1];
 224       pr[0] ^= br[0];
 225       pr[1] ^= br[1];
 226     }
 227     bl[1] <<= 1;
 228     if (br[0] & lbit) bl[1] ^= 1;
 229     br[0] <<= 1;
 230     if (br[1] & lbit) br[0] ^= 1;
 231     br[1] <<= 1;
 232   }
 233
 234   /* Allen: a*b for left half of a */
 235   for (i = 0; i < GF_FIELD_WIDTH/2; i++) {
 236     if (a[0] & (one << i)) {
 237       pl[0] ^= bl[0];
 238       pl[1] ^= bl[1];
 239       pr[0] ^= br[0];
 240     }
 241     bl[0] <<= 1;
 242     if (bl[1] & lbit) bl[0] ^= 1;
 243     bl[1] <<= 1;
 244     if (br[0] & lbit) bl[1] ^= 1;
 245     br[0] <<= 1;
 246   }
 247
 248   /* Allen: do first half of reduction (based on left quarter of initial product) */
 249   one = lbit >> 1;
 250   ppl[0] = one; /* Allen: introduce leading one of primitive polynomial */
 251   ppl[1] = h->prim_poly >> 2;
 252   ppr[0] = h->prim_poly << (GF_FIELD_WIDTH/2-2);
 253   ppr[1] = 0;
 254   while (one != 0) {
 255     if (pl[0] & one) {
 256       pl[0] ^= ppl[0];
 257       pl[1] ^= ppl[1];
 258       pr[0] ^= ppr[0];
 259       pr[1] ^= ppr[1];
 260     }
 261     one >>= 1;
 262     ppr[1] >>= 1;
 263     if (ppr[0] & 1) ppr[1] ^= lbit;
 264     ppr[0] >>= 1;
 265     if (ppl[1] & 1) ppr[0] ^= lbit;
 266     ppl[1] >>= 1;
 267     if (ppl[0] & 1) ppl[1] ^= lbit;
 268     ppl[0] >>= 1;
 269   }
 270
 271   /* Allen: final half of reduction */
 272   one = lbit;
 273   while (one != 0) {
 274     if (pl[1] & one) {
 275       pl[1] ^= ppl[1];
 276       pr[0] ^= ppr[0];
 277       pr[1] ^= ppr[1];
 278     }
 279     one >>= 1;
 280     ppr[1] >>= 1;
 281     if (ppr[0] & 1) ppr[1] ^= lbit;
 282     ppr[0] >>= 1;
 283     if (ppl[1] & 1) ppr[0] ^= lbit;
 284     ppl[1] >>= 1;
 285   }
 286
 287   /* Allen: if we really want to optimize this we can just be using c128 instead of pr all along */
 288   c128[0] = pr[0];
 289   c128[1] = pr[1];
 290
 291   return;
 292 }
 293
 294 #if defined(INTEL_SSE4_PCLMUL)
 295
 296 void
 297 gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
 298 {
 299     __m128i     a,b;
 300     __m128i     result0,result1;
 301     __m128i     prim_poly;
 302     __m128i     c,d,e,f;
 303     gf_internal_t * h = gf->scratch;
 304
 305     a = _mm_insert_epi64 (_mm_setzero_si128(), a128[1], 0);
 306     b = _mm_insert_epi64 (a, b128[1], 0);
 307     a = _mm_insert_epi64 (a, a128[0], 1);
 308     b = _mm_insert_epi64 (b, b128[0], 1);
 309
 310     prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
 311
 312     /* we need to test algorithm 2 later*/
 313     c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
 314     f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
 315     e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
 316     d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
 317
 318     /* now reusing a and b as temporary variables*/
 319     result0 = _mm_setzero_si128();
 320     result1 = result0;
 321
 322     result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
 323     a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
 324     result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));
 325
 326     a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
 327     result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
 328     result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
 329     /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/
 330
 331     a = _mm_srli_si128 (result0, 8);
 332     b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
 333     result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
 334     result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
 335
 336     a = _mm_insert_epi64 (result0, 0, 1);
 337     b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
 338     result1 = _mm_xor_si128 (result1, b);
 339
 340     c128[0] = (uint64_t)_mm_extract_epi64(result1,1);
 341     c128[1] = (uint64_t)_mm_extract_epi64(result1,0);
 342 }
 343 #endif
 344
 345 void
 346 gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
 347 {
 348   uint64_t amask[2], pmask, pp, prod[2]; /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
 349   uint64_t topbit; /* this is used as a boolean value */
 350   gf_internal_t *h;
 351
 352   h = (gf_internal_t *) gf->scratch;
 353   pp = h->prim_poly;
 354   prod[0] = 0;
 355   prod[1] = 0;
 356   pmask = 0x8000000000000000ULL;
 357   amask[0] = 0x8000000000000000ULL;
 358   amask[1] = 0;
 359
 360   while (amask[1] != 0 || amask[0] != 0) {
 361     topbit = (prod[0] & pmask);
 362     prod[0] <<= 1;
 363     if (prod[1] & pmask) prod[0] ^= 1;
 364     prod[1] <<= 1;
 365     if (topbit) prod[1] ^= pp;
 366     if ((a128[0] & amask[0]) || (a128[1] & amask[1])) {
 367       prod[0] ^= b128[0];
 368       prod[1] ^= b128[1];
 369     }
 370     amask[1] >>= 1;
 371     if (amask[0] & 1) amask[1] ^= pmask;
 372     amask[0] >>= 1;
 373   }
 374   c128[0] = prod [0];
 375   c128[1] = prod [1];
 376   return;
 377 }
 378
 379 #if defined(INTEL_SSE4)
 380 void
 381 gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
 382 {
 383   int i;
 384   __m128i a, b, pp, prod, amask, u_middle_one;
 385   /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
 386   uint32_t topbit, middlebit, pmask; /* this is used as a boolean value */
 387   gf_internal_t *h;
 388
 389
 390   h = (gf_internal_t *) gf->scratch;
 391   pp = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
 392   prod = _mm_setzero_si128();
 393   a = _mm_insert_epi64(prod, a128[1], 0x0);
 394   a = _mm_insert_epi64(a, a128[0], 0x1);
 395   b = _mm_insert_epi64(prod, b128[1], 0x0);
 396   b = _mm_insert_epi64(b, b128[0], 0x1);
 397   pmask = 0x80000000;
 398   amask = _mm_insert_epi32(prod, 0x80000000, 0x3);
 399   u_middle_one = _mm_insert_epi32(prod, 1, 0x2);
 400
 401   for (i = 0; i < 64; i++) {
 402     topbit = (_mm_extract_epi32(prod, 0x3) & pmask);
 403     middlebit = (_mm_extract_epi32(prod, 0x1) & pmask);
 404     prod = _mm_slli_epi64(prod, 1); /* this instruction loses the middle bit */
 405     if (middlebit) {
 406       prod = _mm_xor_si128(prod, u_middle_one);
 407     }
 408     if (topbit) {
 409       prod = _mm_xor_si128(prod, pp);
 410     }
 411     if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 1))) {
 412       prod = _mm_xor_si128(prod, b);
 413     }
 414     amask = _mm_srli_epi64(amask, 1); /*so does this one, but we can just replace after loop*/
 415   }
 416   amask = _mm_insert_epi32(amask, (gf_val_32_t)1 << 31, 0x1);
 417   for (i = 64; i < 128; i++) {
 418     topbit = (_mm_extract_epi32(prod, 0x3) & pmask);
 419     middlebit = (_mm_extract_epi32(prod, 0x1) & pmask);
 420     prod = _mm_slli_epi64(prod, 1);
 421     if (middlebit) prod = _mm_xor_si128(prod, u_middle_one);
 422     if (topbit) prod = _mm_xor_si128(prod, pp);
 423     if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 0))) {
 424       prod = _mm_xor_si128(prod, b);
 425     }
 426     amask = _mm_srli_epi64(amask, 1);
 427   }
 428   c128[0] = (uint64_t)_mm_extract_epi64(prod, 1);
 429   c128[1] = (uint64_t)_mm_extract_epi64(prod, 0);
 430   return;
 431 }
 432 #endif
 433
 434
 435 /* Ben: This slow function implements sse instrutions for bytwo_b because why not */
 436 #if defined(INTEL_SSE4)
 437 void
 438 gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
 439 {
 440   __m128i a, b, lmask, hmask, pp, c, middle_one;
 441   gf_internal_t *h;
 442   uint64_t topbit, middlebit;
 443
 444   h = (gf_internal_t *) gf->scratch;
 445
 446   c = _mm_setzero_si128();
 447   lmask = _mm_insert_epi64(c, 1ULL << 63, 0);
 448   hmask = _mm_insert_epi64(c, 1ULL << 63, 1);
 449   b = _mm_insert_epi64(c, a128[0], 1);
 450   b = _mm_insert_epi64(b, a128[1], 0);
 451   a = _mm_insert_epi64(c, b128[0], 1);
 452   a = _mm_insert_epi64(a, b128[1], 0);
 453   pp = _mm_insert_epi64(c, h->prim_poly, 0);
 454   middle_one = _mm_insert_epi64(c, 1, 0x1);
 455
 456   while (1) {
 457     if (_mm_extract_epi32(a, 0x0) & 1) {
 458       c = _mm_xor_si128(c, b);
 459     }
 460     middlebit = (_mm_extract_epi32(a, 0x2) & 1);
 461     a = _mm_srli_epi64(a, 1);
 462     if (middlebit) a = _mm_xor_si128(a, lmask);
 463     if ((_mm_extract_epi64(a, 0x1) == 0ULL) && (_mm_extract_epi64(a, 0x0) == 0ULL)){
 464       c128[0] = _mm_extract_epi64(c, 0x1);
 465       c128[1] = _mm_extract_epi64(c, 0x0);
 466       return;
 467     }
 468     topbit = (_mm_extract_epi64(_mm_and_si128(b, hmask), 1));
 469     middlebit = (_mm_extract_epi64(_mm_and_si128(b, lmask), 0));
 470     b = _mm_slli_epi64(b, 1);
 471     if (middlebit) b = _mm_xor_si128(b, middle_one);
 472     if (topbit) b = _mm_xor_si128(b, pp);
 473   }
 474 }
 475 #endif
 476
 477 void
 478 gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
 479 {
 480   uint64_t bmask, pp;
 481   gf_internal_t *h;
 482   uint64_t a[2], b[2], c[2];
 483
 484   h = (gf_internal_t *) gf->scratch;
 485
 486   bmask = (1ULL << 63);
 487   set_zero(c, 0);
 488   b[0] = a128[0];
 489   b[1] = a128[1];
 490   a[0] = b128[0];
 491   a[1] = b128[1];
 492
 493   while (1) {
 494     if (a[1] & 1) {
 495       c[0] ^= b[0];
 496       c[1] ^= b[1];
 497     }
 498     a[1] >>= 1;
 499     if (a[0] & 1) a[1] ^= bmask;
 500     a[0] >>= 1;
 501     if (a[1] == 0 && a[0] == 0) {
 502       c128[0] = c[0];
 503       c128[1] = c[1];
 504       return;
 505     }
 506     pp = (b[0] & bmask);
 507     b[0] <<= 1;
 508     if (b[1] & bmask) b[0] ^= 1;
 509     b[1] <<= 1;
 510     if (pp) b[1] ^= h->prim_poly;
 511   }
 512 }
 513
 514 static
 515 void
 516 gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
 517 {
 518   int i, j, k;
 519   uint64_t pp;
 520   gf_internal_t *h;
 521   uint64_t *s64, *d64, *top;
 522   gf_region_data rd;
 523   uint64_t v[2], s;
 524   struct gf_w128_split_4_128_data *ld;
 525
 526   /* We only do this to check on alignment. */
 527   gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
 528
 529   if (val[0] == 0) {
 530     if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
 531     if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
 532   }
 533
 534   h = (gf_internal_t *) gf->scratch;
 535   ld = (struct gf_w128_split_4_128_data *) h->private;
 536
 537   s64 = (uint64_t *) rd.s_start;
 538   d64 = (uint64_t *) rd.d_start;
 539   top = (uint64_t *) rd.d_top;
 540
 541   if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
 542     v[0] = val[0];
 543     v[1] = val[1];
 544     for (i = 0; i < 32; i++) {
 545       ld->tables[0][i][0] = 0;
 546       ld->tables[1][i][0] = 0;
 547       for (j = 1; j < 16; j <<= 1) {
 548         for (k = 0; k < j; k++) {
 549           ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
 550           ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
 551         }
 552         pp = (v[0] & (1ULL << 63));
 553         v[0] <<= 1;
 554         if (v[1] & (1ULL << 63)) v[0] ^= 1;
 555         v[1] <<= 1;
 556         if (pp) v[1] ^= h->prim_poly;
 557       }
 558     }
 559   }
 560   ld->last_value[0] = val[0];
 561   ld->last_value[1] = val[1];
 562
 563 /*
 564   for (i = 0; i < 32; i++) {
 565     for (j = 0; j < 16; j++) {
 566       printf("%2d %2d %016llx %016llx\n", i, j, ld->tables[0][i][j], ld->tables[1][i][j]);
 567     }
 568     printf("\n");
 569   }
 570  */
 571   while (d64 < top) {
 572     v[0] = (xor) ? d64[0] : 0;
 573     v[1] = (xor) ? d64[1] : 0;
 574     s = s64[1];
 575     i = 0;
 576     while (s != 0) {
 577       v[0] ^= ld->tables[0][i][s&0xf];
 578       v[1] ^= ld->tables[1][i][s&0xf];
 579       s >>= 4;
 580       i++;
 581     }
 582     s = s64[0];
 583     i = 16;
 584     while (s != 0) {
 585       v[0] ^= ld->tables[0][i][s&0xf];
 586       v[1] ^= ld->tables[1][i][s&0xf];
 587       s >>= 4;
 588       i++;
 589     }
 590     d64[0] = v[0];
 591     d64[1] = v[1];
 592     s64 += 2;
 593     d64 += 2;
 594   }
 595 }
 596
 597 #if defined(INTEL_SSSE3) && defined(INTEL_SSE4)
 598 static
 599 void
 600 gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
 601 {
 602   gf_internal_t *h;
 603   int i, j, k;
 604   uint64_t pp, v[2], s, *s64, *d64, *top;
 605   __m128i p, tables[32][16];
 606   struct gf_w128_split_4_128_data *ld;
 607   gf_region_data rd;
 608
 609   if (val[0] == 0) {
 610     if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
 611     if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
 612   }
 613
 614   h = (gf_internal_t *) gf->scratch;
 615
 616   /* We only do this to check on alignment. */
 617   gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 16);
 618
 619   /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */
 620
 621   gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);
 622
 623   s64 = (uint64_t *) rd.s_start;
 624   d64 = (uint64_t *) rd.d_start;
 625   top = (uint64_t *) rd.d_top;
 626
 627   ld = (struct gf_w128_split_4_128_data *) h->private;
 628
 629   if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
 630     v[0] = val[0];
 631     v[1] = val[1];
 632     for (i = 0; i < 32; i++) {
 633       ld->tables[0][i][0] = 0;
 634       ld->tables[1][i][0] = 0;
 635       for (j = 1; j < 16; j <<= 1) {
 636         for (k = 0; k < j; k++) {
 637           ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
 638           ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
 639         }
 640         pp = (v[0] & (1ULL << 63));
 641         v[0] <<= 1;
 642         if (v[1] & (1ULL << 63)) v[0] ^= 1;
 643         v[1] <<= 1;
 644         if (pp) v[1] ^= h->prim_poly;
 645       }
 646     }
 647   }
 648
 649   ld->last_value[0] = val[0];
 650   ld->last_value[1] = val[1];
 651
 652   for (i = 0; i < 32; i++) {
 653     for (j = 0; j < 16; j++) {
 654       v[0] = ld->tables[0][i][j];
 655       v[1] = ld->tables[1][i][j];
 656       tables[i][j] = _mm_loadu_si128((__m128i *) v);
 657
 658 /*
 659       printf("%2d %2d: ", i, j);
 660       MM_PRINT8("", tables[i][j]); */
 661     }
 662   }
 663
 664   while (d64 != top) {
 665
 666     if (xor) {
 667       p = _mm_load_si128 ((__m128i *) d64);
 668     } else {
 669       p = _mm_setzero_si128();
 670     }
 671     s = *s64;
 672     s64++;
 673     for (i = 0; i < 16; i++) {
 674       j = (s&0xf);
 675       s >>= 4;
 676       p = _mm_xor_si128(p, tables[16+i][j]);
 677     }
 678     s = *s64;
 679     s64++;
 680     for (i = 0; i < 16; i++) {
 681       j = (s&0xf);
 682       s >>= 4;
 683       p = _mm_xor_si128(p, tables[i][j]);
 684     }
 685     _mm_store_si128((__m128i *) d64, p);
 686     d64 += 2;
 687   }
 688
 689   /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */
 690
 691   gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
 692 }
 693 #endif
 694
 695 #if defined(INTEL_SSSE3) && defined(INTEL_SSE4)
 696 static
 697 void
 698 gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
 699 {
 700   gf_internal_t *h;
 701   int i, j, k;
 702   uint64_t pp, v[2], *s64, *d64, *top;
 703   __m128i si, tables[32][16], p[16], v0, mask1;
 704   struct gf_w128_split_4_128_data *ld;
 705   uint8_t btable[16];
 706   gf_region_data rd;
 707
 708   if (val[0] == 0) {
 709     if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
 710     if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
 711   }
 712
 713   h = (gf_internal_t *) gf->scratch;
 714
 715   /* We only do this to check on alignment. */
 716   gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 256);
 717
 718   /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */
 719
 720   gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);
 721
 722   s64 = (uint64_t *) rd.s_start;
 723   d64 = (uint64_t *) rd.d_start;
 724   top = (uint64_t *) rd.d_top;
 725
 726   ld = (struct gf_w128_split_4_128_data *) h->private;
 727
 728   if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
 729     v[0] = val[0];
 730     v[1] = val[1];
 731     for (i = 0; i < 32; i++) {
 732       ld->tables[0][i][0] = 0;
 733       ld->tables[1][i][0] = 0;
 734       for (j = 1; j < 16; j <<= 1) {
 735         for (k = 0; k < j; k++) {
 736           ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
 737           ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
 738         }
 739         pp = (v[0] & (1ULL << 63));
 740         v[0] <<= 1;
 741         if (v[1] & (1ULL << 63)) v[0] ^= 1;
 742         v[1] <<= 1;
 743         if (pp) v[1] ^= h->prim_poly;
 744       }
 745     }
 746   }
 747
 748   ld->last_value[0] = val[0];
 749   ld->last_value[1] = val[1];
 750
 751   for (i = 0; i < 32; i++) {
 752     for (j = 0; j < 16; j++) {
 753       for (k = 0; k < 16; k++) {
 754         btable[k] = (uint8_t) ld->tables[1-(j/8)][i][k];
 755         ld->tables[1-(j/8)][i][k] >>= 8;
 756       }
 757       tables[i][j] = _mm_loadu_si128((__m128i *) btable);
 758 /*
 759       printf("%2d %2d: ", i, j);
 760       MM_PRINT8("", tables[i][j]);
 761  */
 762     }
 763   }
 764
 765
 766   mask1 = _mm_set1_epi8(0xf);
 767
 768   while (d64 != top) {
 769
 770     if (xor) {
 771       for (i = 0; i < 16; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2));
 772     } else {
 773       for (i = 0; i < 16; i++) p[i] = _mm_setzero_si128();
 774     }
 775     i = 0;
 776     for (k = 0; k < 16; k++) {
 777       v0 = _mm_load_si128((__m128i *) s64);
 778       s64 += 2;
 779
 780       si = _mm_and_si128(v0, mask1);
 781
 782       for (j = 0; j < 16; j++) {
 783         p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
 784       }
 785       i++;
 786       v0 = _mm_srli_epi32(v0, 4);
 787       si = _mm_and_si128(v0, mask1);
 788       for (j = 0; j < 16; j++) {
 789         p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
 790       }
 791       i++;
 792     }
 793     for (i = 0; i < 16; i++) {
 794       _mm_store_si128((__m128i *) d64, p[i]);
 795       d64 += 2;
 796     }
 797   }
 798   /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */
 799
 800   gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
 801 }
 802 #endif
 803
 804 static
 805 void
 806 gf_w128_split_8_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
 807 {
 808   int i, j, k;
 809   uint64_t pp;
 810   gf_internal_t *h;
 811   uint64_t *s64, *d64, *top;
 812   gf_region_data rd;
 813   uint64_t v[2], s;
 814   struct gf_w128_split_8_128_data *ld;
 815
 816   /* Check on alignment. Ignore it otherwise. */
 817   gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
 818
 819   if (val[0] == 0) {
 820     if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
 821     if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
 822   }
 823
 824   h = (gf_internal_t *) gf->scratch;
 825   ld = (struct gf_w128_split_8_128_data *) h->private;
 826
 827   s64 = (uint64_t *) rd.s_start;
 828   d64 = (uint64_t *) rd.d_start;
 829   top = (uint64_t *) rd.d_top;
 830
 831   if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
 832     v[0] = val[0];
 833     v[1] = val[1];
 834     for (i = 0; i < 16; i++) {
 835       ld->tables[0][i][0] = 0;
 836       ld->tables[1][i][0] = 0;
 837       for (j = 1; j < (1 << 8); j <<= 1) {
 838         for (k = 0; k < j; k++) {
 839           ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
 840           ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
 841         }
 842         pp = (v[0] & (1ULL << 63));
 843         v[0] <<= 1;
 844         if (v[1] & (1ULL << 63)) v[0] ^= 1;
 845         v[1] <<= 1;
 846         if (pp) v[1] ^= h->prim_poly;
 847       }
 848     }
 849   }
 850   ld->last_value[0] = val[0];
 851   ld->last_value[1] = val[1];
 852
 853   while (d64 < top) {
 854     v[0] = (xor) ? d64[0] : 0;
 855     v[1] = (xor) ? d64[1] : 0;
 856     s = s64[1];
 857     i = 0;
 858     while (s != 0) {
 859       v[0] ^= ld->tables[0][i][s&0xff];
 860       v[1] ^= ld->tables[1][i][s&0xff];
 861       s >>= 8;
 862       i++;
 863     }
 864     s = s64[0];
 865     i = 8;
 866     while (s != 0) {
 867       v[0] ^= ld->tables[0][i][s&0xff];
 868       v[1] ^= ld->tables[1][i][s&0xff];
 869       s >>= 8;
 870       i++;
 871     }
 872     d64[0] = v[0];
 873     d64[1] = v[1];
 874     s64 += 2;
 875     d64 += 2;
 876   }
 877 }
 878
 879 void
 880 gf_w128_bytwo_b_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
 881 {
 882   uint64_t bmask, pp;
 883   gf_internal_t *h;
 884   uint64_t a[2], c[2], b[2], *s64, *d64, *top;
 885   gf_region_data rd;
 886
 887   /* We only do this to check on alignment. */
 888   gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
 889
 890   if (val[0] == 0) {
 891     if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
 892     if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
 893   }
 894
 895   h = (gf_internal_t *) gf->scratch;
 896   s64 = (uint64_t *) rd.s_start;
 897   d64 = (uint64_t *) rd.d_start;
 898   top = (uint64_t *) rd.d_top;
 899   bmask = (1ULL << 63);
 900
 901   while (d64 < top) {
 902     set_zero(c, 0);
 903     b[0] = s64[0];
 904     b[1] = s64[1];
 905     a[0] = val[0];
 906     a[1] = val[1];
 907
 908     while (a[0] != 0) {
 909       if (a[1] & 1) {
 910         c[0] ^= b[0];
 911         c[1] ^= b[1];
 912       }
 913       a[1] >>= 1;
 914       if (a[0] & 1) a[1] ^= bmask;
 915       a[0] >>= 1;
 916       pp = (b[0] & bmask);
 917       b[0] <<= 1;
 918       if (b[1] & bmask) b[0] ^= 1;
 919       b[1] <<= 1;
 920       if (pp) b[1] ^= h->prim_poly;
 921     }
 922     while (1) {
 923       if (a[1] & 1) {
 924         c[0] ^= b[0];
 925         c[1] ^= b[1];
 926       }
 927       a[1] >>= 1;
 928       if (a[1] == 0) break;
 929       pp = (b[0] & bmask);
 930       b[0] <<= 1;
 931       if (b[1] & bmask) b[0] ^= 1;
 932       b[1] <<= 1;
 933       if (pp) b[1] ^= h->prim_poly;
 934     }
 935     if (xor) {
 936       d64[0] ^= c[0];
 937       d64[1] ^= c[1];
 938     } else {
 939       d64[0] = c[0];
 940       d64[1] = c[1];
 941     }
 942     s64 += 2;
 943     d64 += 2;
 944   }
 945 }
 946
 947 static
 948 void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128)
 949 {
 950   int i, j;
 951   int g_m;
 952   uint64_t prim_poly, lbit;
 953   gf_internal_t *scratch;
 954   gf_group_tables_t *gt;
 955   uint64_t a128[2];
 956   scratch = (gf_internal_t *) gf->scratch;
 957   gt = scratch->private;
 958   g_m = scratch->arg1;
 959   prim_poly = scratch->prim_poly;
 960
 961
 962   set_zero(gt->m_table, 0);
 963   a_get_b(gt->m_table, 2, b128, 0);
 964   lbit = 1;
 965   lbit <<= 63;
 966
 967   for (i = 2; i < (1 << g_m); i <<= 1) {
 968     a_get_b(a128, 0, gt->m_table, 2 * (i >> 1));
 969     two_x(a128);
 970     a_get_b(gt->m_table, 2 * i, a128, 0);
 971     if (gt->m_table[2 * (i >> 1)] & lbit) gt->m_table[(2 * i) + 1] ^= prim_poly;
 972     for (j = 0; j < i; j++) {
 973       gt->m_table[(2 * i) + (2 * j)] = gt->m_table[(2 * i)] ^ gt->m_table[(2 * j)];
 974       gt->m_table[(2 * i) + (2 * j) + 1] = gt->m_table[(2 * i) + 1] ^ gt->m_table[(2 * j) + 1];
 975     }
 976   }
 977   return;
 978 }
 979
 980 void
 981 gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
 982 {
 983   int i;
 984   /* index_r, index_m, total_m (if g_r > g_m) */
 985   int i_r, i_m, t_m;
 986   int mask_m, mask_r;
 987   int g_m, g_r;
 988   uint64_t p_i[2], a[2];
 989   gf_internal_t *scratch;
 990   gf_group_tables_t *gt;
 991
 992   scratch = (gf_internal_t *) gf->scratch;
 993   gt = scratch->private;
 994   g_m = scratch->arg1;
 995   g_r = scratch->arg2;
 996
 997   mask_m = (1 << g_m) - 1;
 998   mask_r = (1 << g_r) - 1;
 999
1000   if (b128[0] != gt->m_table[2] || b128[1] != gt->m_table[3]) {
1001     gf_w128_group_m_init(gf, b128);
1002   }
1003
1004   p_i[0] = 0;
1005   p_i[1] = 0;
1006   a[0] = a128[0];
1007   a[1] = a128[1];
1008
1009   t_m = 0;
1010   i_r = 0;
1011
1012   /* Top 64 bits */
1013   for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
1014     i_m = (a[0] >> (i * g_m)) & mask_m;
1015     i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
1016     p_i[0] <<= g_m;
1017     p_i[0] ^= (p_i[1] >> (64-g_m));
1018     p_i[1] <<= g_m;
1019     p_i[0] ^= gt->m_table[2 * i_m];
1020     p_i[1] ^= gt->m_table[(2 * i_m) + 1];
1021     t_m += g_m;
1022     if (t_m == g_r) {
1023       p_i[1] ^= gt->r_table[i_r];
1024       t_m = 0;
1025       i_r = 0;
1026     } else {
1027       i_r <<= g_m;
1028     }
1029   }
1030
1031   for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
1032     i_m = (a[1] >> (i * g_m)) & mask_m;
1033     i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
1034     p_i[0] <<= g_m;
1035     p_i[0] ^= (p_i[1] >> (64-g_m));
1036     p_i[1] <<= g_m;
1037     p_i[0] ^= gt->m_table[2 * i_m];
1038     p_i[1] ^= gt->m_table[(2 * i_m) + 1];
1039     t_m += g_m;
1040     if (t_m == g_r) {
1041       p_i[1] ^= gt->r_table[i_r];
1042       t_m = 0;
1043       i_r = 0;
1044     } else {
1045       i_r <<= g_m;
1046     }
1047   }
1048   c128[0] = p_i[0];
1049   c128[1] = p_i[1];
1050 }
1051
1052 static
1053 void
1054 gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
1055 {
1056   int i;
1057   int i_r, i_m, t_m;
1058   int mask_m, mask_r;
1059   int g_m, g_r;
1060   uint64_t p_i[2], a[2];
1061   gf_internal_t *scratch;
1062   gf_group_tables_t *gt;
1063   gf_region_data rd;
1064   uint64_t *a128, *c128, *top;
1065
1066   /* We only do this to check on alignment. */
1067   gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
1068
1069   if (val[0] == 0) {
1070     if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
1071     if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1072   }
1073
1074   scratch = (gf_internal_t *) gf->scratch;
1075   gt = scratch->private;
1076   g_m = scratch->arg1;
1077   g_r = scratch->arg2;
1078
1079   mask_m = (1 << g_m) - 1;
1080   mask_r = (1 << g_r) - 1;
1081
1082   if (val[0] != gt->m_table[2] || val[1] != gt->m_table[3]) {
1083     gf_w128_group_m_init(gf, val);
1084   }
1085
1086   a128 = (uint64_t *) src;
1087   c128 = (uint64_t *) dest;
1088   top = (uint64_t *) rd.d_top;
1089
1090   while (c128 < top) {
1091     p_i[0] = 0;
1092     p_i[1] = 0;
1093     a[0] = a128[0];
1094     a[1] = a128[1];
1095
1096     t_m = 0;
1097     i_r = 0;
1098
1099     /* Top 64 bits */
1100     for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
1101       i_m = (a[0] >> (i * g_m)) & mask_m;
1102       i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
1103       p_i[0] <<= g_m;
1104       p_i[0] ^= (p_i[1] >> (64-g_m));
1105       p_i[1] <<= g_m;
1106
1107       p_i[0] ^= gt->m_table[2 * i_m];
1108       p_i[1] ^= gt->m_table[(2 * i_m) + 1];
1109       t_m += g_m;
1110       if (t_m == g_r) {
1111         p_i[1] ^= gt->r_table[i_r];
1112         t_m = 0;
1113         i_r = 0;
1114       } else {
1115         i_r <<= g_m;
1116       }
1117     }
1118     for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
1119       i_m = (a[1] >> (i * g_m)) & mask_m;
1120       i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
1121       p_i[0] <<= g_m;
1122       p_i[0] ^= (p_i[1] >> (64-g_m));
1123       p_i[1] <<= g_m;
1124       p_i[0] ^= gt->m_table[2 * i_m];
1125       p_i[1] ^= gt->m_table[(2 * i_m) + 1];
1126       t_m += g_m;
1127       if (t_m == g_r) {
1128         p_i[1] ^= gt->r_table[i_r];
1129         t_m = 0;
1130         i_r = 0;
1131       } else {
1132         i_r <<= g_m;
1133       }
1134     }
1135
1136     if (xor) {
1137       c128[0] ^= p_i[0];
1138       c128[1] ^= p_i[1];
1139     } else {
1140       c128[0] = p_i[0];
1141       c128[1] = p_i[1];
1142     }
1143     a128 += 2;
1144     c128 += 2;
1145   }
1146 }
1147
1148 /* a^-1 -> b */
1149 void
1150 gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
1151 {
1152   uint64_t e_i[2], e_im1[2], e_ip1[2];
1153   uint64_t d_i, d_im1, d_ip1;
1154   uint64_t y_i[2], y_im1[2], y_ip1[2];
1155   uint64_t c_i[2];
1156   uint64_t *b;
1157   uint64_t one = 1;
1158
1159   /* This needs to return some sort of error (in b128?) */
1160   if (a128[0] == 0 && a128[1] == 0) return;
1161
1162   b = (uint64_t *) b128;
1163
1164   e_im1[0] = 0;
1165   e_im1[1] = ((gf_internal_t *) (gf->scratch))->prim_poly;
1166   e_i[0] = a128[0];
1167   e_i[1] = a128[1];
1168   d_im1 = 128;
1169
1170   //Allen: I think d_i starts at 63 here, and checks each bit of a, starting at MSB, looking for the first nonzero bit
1171   //so d_i should be 0 if this half of a is all 0s, otherwise it should be the position from right of the first-from-left zero bit of this half of a.
1172   //BUT if d_i is 0 at end we won't know yet if the rightmost bit of this half is 1 or not
1173
1174   for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[0]) == 0 && d_i > 0; d_i--) ;
1175
1176   //Allen: this is testing just the first half of the stop condition above, so if it holds we know we did not find a nonzero bit yet
1177
1178   if (!((one << d_i) & e_i[0])) {
1179
1180     //Allen: this is doing the same thing on the other half of a. In other words, we're still searching for a nonzero bit of a.
1181     // but not bothering to test if d_i hits zero, which is fine because we've already tested for a=0.
1182
1183     for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[1]) == 0; d_i--) ;
1184
1185   } else {
1186
1187     //Allen: if a 1 was found in more-significant half of a, make d_i the ACTUAL index of the first nonzero bit in the entire a.
1188
1189     d_i += 64;
1190   }
1191   y_i[0] = 0;
1192   y_i[1] = 1;
1193   y_im1[0] = 0;
1194   y_im1[1] = 0;
1195
1196   while (!(e_i[0] == 0 && e_i[1] == 1)) {
1197
1198     e_ip1[0] = e_im1[0];
1199     e_ip1[1] = e_im1[1];
1200     d_ip1 = d_im1;
1201     c_i[0] = 0;
1202     c_i[1] = 0;
1203
1204     while (d_ip1 >= d_i) {
1205       if ((d_ip1 - d_i) >= 64) {
1206         c_i[0] ^= (one << ((d_ip1 - d_i) - 64));
1207         e_ip1[0] ^= (e_i[1] << ((d_ip1 - d_i) - 64));
1208       } else {
1209         c_i[1] ^= (one << (d_ip1 - d_i));
1210         e_ip1[0] ^= (e_i[0] << (d_ip1 - d_i));
1211         if (d_ip1 - d_i > 0) e_ip1[0] ^= (e_i[1] >> (64 - (d_ip1 - d_i)));
1212         e_ip1[1] ^= (e_i[1] << (d_ip1 - d_i));
1213       }
1214       d_ip1--;
1215       if (e_ip1[0] == 0 && e_ip1[1] == 0) { b[0] = 0; b[1] = 0; return; }
1216       while (d_ip1 >= 64 && (e_ip1[0] & (one << (d_ip1 - 64))) == 0) d_ip1--;
1217       while (d_ip1 <  64 && (e_ip1[1] & (one << d_ip1)) == 0) d_ip1--;
1218     }
1219     gf->multiply.w128(gf, c_i, y_i, y_ip1);
1220     y_ip1[0] ^= y_im1[0];
1221     y_ip1[1] ^= y_im1[1];
1222
1223     y_im1[0] = y_i[0];
1224     y_im1[1] = y_i[1];
1225
1226     y_i[0] = y_ip1[0];
1227     y_i[1] = y_ip1[1];
1228
1229     e_im1[0] = e_i[0];
1230     e_im1[1] = e_i[1];
1231     d_im1 = d_i;
1232     e_i[0] = e_ip1[0];
1233     e_i[1] = e_ip1[1];
1234     d_i = d_ip1;
1235   }
1236
1237   b[0] = y_i[0];
1238   b[1] = y_i[1];
1239   return;
1240 }
1241
1242 void
1243 gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
1244 {
1245   uint64_t d[2];
1246   gf->inverse.w128(gf, b128, d);
1247   gf->multiply.w128(gf, a128, d, c128);
1248   return;
1249 }
1250
1251 void
1252 gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
1253 {
1254   uint64_t one128[2];
1255   one128[0] = 0;
1256   one128[1] = 1;
1257   gf->divide.w128(gf, one128, a128, b128);
1258   return;
1259 }
1260
1261
1262 static
1263 void
1264 gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv)
1265 {
1266   gf_internal_t *h = (gf_internal_t *) gf->scratch;
1267   gf_t *base_gf = h->base_gf;
1268   uint64_t a0 = a[1];
1269   uint64_t a1 = a[0];
1270   uint64_t c0, c1, d, tmp;
1271   uint64_t a0inv, a1inv;
1272
1273   if (a0 == 0) {
1274     a1inv = base_gf->inverse.w64(base_gf, a1);
1275     c0 = base_gf->multiply.w64(base_gf, a1inv, h->prim_poly);
1276     c1 = a1inv;
1277   } else if (a1 == 0) {
1278     c0 = base_gf->inverse.w64(base_gf, a0);
1279     c1 = 0;
1280   } else {
1281     a1inv = base_gf->inverse.w64(base_gf, a1);
1282     a0inv = base_gf->inverse.w64(base_gf, a0);
1283
1284     d = base_gf->multiply.w64(base_gf, a1, a0inv);
1285
1286     tmp = (base_gf->multiply.w64(base_gf, a1, a0inv) ^ base_gf->multiply.w64(base_gf, a0, a1inv) ^ h->prim_poly);
1287     tmp = base_gf->inverse.w64(base_gf, tmp);
1288
1289     d = base_gf->multiply.w64(base_gf, d, tmp);
1290
1291     c0 = base_gf->multiply.w64(base_gf, (d^1), a0inv);
1292     c1 = base_gf->multiply.w64(base_gf, d, a1inv);
1293   }
1294   inv[0] = c1;
1295   inv[1] = c0;
1296 }
1297
1298 static
1299   void
1300 gf_w128_composite_multiply(gf_t *gf, gf_val_128_t a, gf_val_128_t b, gf_val_128_t rv)
1301 {
1302   gf_internal_t *h = (gf_internal_t *) gf->scratch;
1303   gf_t *base_gf = h->base_gf;
1304   uint64_t b0 = b[1];
1305   uint64_t b1 = b[0];
1306   uint64_t a0 = a[1];
1307   uint64_t a1 = a[0];
1308   uint64_t a1b1;
1309
1310   a1b1 = base_gf->multiply.w64(base_gf, a1, b1);
1311
1312   rv[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
1313   rv[0] = base_gf->multiply.w64(base_gf, a1, b0) ^
1314     base_gf->multiply.w64(base_gf, a0, b1) ^
1315     base_gf->multiply.w64(base_gf, a1b1, h->prim_poly);
1316 }
1317
1318 static
1319   void
1320 gf_w128_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
1321 {
1322   gf_internal_t *h = (gf_internal_t *) gf->scratch;
1323   gf_t *base_gf = h->base_gf;
1324   uint64_t b0 = val[1];
1325   uint64_t b1 = val[0];
1326   uint64_t *s64, *d64;
1327   uint64_t *top;
1328   uint64_t a0, a1, a1b1;
1329   gf_region_data rd;
1330
1331   if (val[0] == 0 && val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
1332
1333   gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
1334
1335   s64 = rd.s_start;
1336   d64 = rd.d_start;
1337   top = rd.d_top;
1338
1339   if (xor) {
1340     while (d64 < top) {
1341       a1 = s64[0];
1342       a0 = s64[1];
1343       a1b1 = base_gf->multiply.w64(base_gf, a1, b1);
1344
1345       d64[1] ^= (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
1346       d64[0] ^= (base_gf->multiply.w64(base_gf, a1, b0) ^
1347           base_gf->multiply.w64(base_gf, a0, b1) ^
1348           base_gf->multiply.w64(base_gf, a1b1, h->prim_poly));
1349       s64 += 2;
1350       d64 += 2;
1351     }
1352   } else {
1353     while (d64 < top) {
1354       a1 = s64[0];
1355       a0 = s64[1];
1356       a1b1 = base_gf->multiply.w64(base_gf, a1, b1);
1357
1358       d64[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
1359       d64[0] = (base_gf->multiply.w64(base_gf, a1, b0) ^
1360           base_gf->multiply.w64(base_gf, a0, b1) ^
1361           base_gf->multiply.w64(base_gf, a1b1, h->prim_poly));
1362       s64 += 2;
1363       d64 += 2;
1364     }
1365   }
1366 }
1367
1368 static
1369 void
1370 gf_w128_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int
1371     xor)
1372 {
1373   gf_internal_t *h = (gf_internal_t *) gf->scratch;  gf_t *base_gf = h->base_gf;
1374   gf_val_64_t val0 = val[1];
1375   gf_val_64_t val1 = val[0];
1376   uint8_t *slow, *shigh;
1377   uint8_t *dlow, *dhigh, *top;
1378   int sub_reg_size;
1379   gf_region_data rd;
1380
1381   gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 64);
1382   gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);
1383
1384   slow = (uint8_t *) rd.s_start;
1385   dlow = (uint8_t *) rd.d_start;
1386   top = (uint8_t*) rd.d_top;
1387   sub_reg_size = (top - dlow)/2;
1388   shigh = slow + sub_reg_size;
1389   dhigh = dlow + sub_reg_size;
1390
1391   base_gf->multiply_region.w64(base_gf, slow, dlow, val0, sub_reg_size, xor);
1392   base_gf->multiply_region.w64(base_gf, shigh, dlow, val1, sub_reg_size, 1);
1393   base_gf->multiply_region.w64(base_gf, slow, dhigh, val1, sub_reg_size, xor);
1394   base_gf->multiply_region.w64(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
1395   base_gf->multiply_region.w64(base_gf, shigh, dhigh, base_gf->multiply.w64(base_gf, h->prim_poly, val1
1396         ), sub_reg_size, 1);
1397
1398   gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
1399 }
1400
1401
1402   static
1403 int gf_w128_composite_init(gf_t *gf)
1404 {
1405   gf_internal_t *h = (gf_internal_t *) gf->scratch;
1406
1407   if (h->region_type & GF_REGION_ALTMAP) {
1408     SET_FUNCTION(gf,multiply_region,w128,gf_w128_composite_multiply_region_alt)
1409   } else {
1410     SET_FUNCTION(gf,multiply_region,w128,gf_w128_composite_multiply_region)
1411   }
1412
1413   SET_FUNCTION(gf,multiply,w128,gf_w128_composite_multiply)
1414   SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse)
1415   SET_FUNCTION(gf,inverse,w128,gf_w128_composite_inverse)
1416
1417   return 1;
1418 }
1419
1420 static
1421 int gf_w128_cfm_init(gf_t *gf)
1422 {
1423 #if defined(INTEL_SSE4_PCLMUL)
1424   if (gf_cpu_supports_intel_pclmul) {
1425     SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
1426     SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
1427     SET_FUNCTION(gf,multiply_region,w128,gf_w128_clm_multiply_region_from_single)
1428     return 1;
1429   }
1430 #endif
1431
1432   return 0;
1433 }
1434
1435 static
1436 int gf_w128_shift_init(gf_t *gf)
1437 {
1438   SET_FUNCTION(gf,multiply,w128,gf_w128_shift_multiply)
1439   SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
1440   SET_FUNCTION(gf,multiply_region,w128,gf_w128_multiply_region_from_single)
1441   return 1;
1442 }
1443
1444   static
1445 int gf_w128_bytwo_init(gf_t *gf)
1446 {
1447   gf_internal_t *h;
1448   h = (gf_internal_t *) gf->scratch;
1449
1450   if (h->mult_type == GF_MULT_BYTWO_p) {
1451     SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply)
1452     /*SET_FUNCTION(gf,multiply,w128,gf_w128_sse_bytwo_p_multiply)*/
1453     /* John: the sse function is slower.*/
1454   } else {
1455     SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_b_multiply)
1456     /*SET_FUNCTION(gf,multiply,w128,gf_w128_sse_bytwo_b_multiply)
1457 Ben: This sse function is also slower. */
1458   }
1459   SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
1460   SET_FUNCTION(gf,multiply_region,w128,gf_w128_bytwo_b_multiply_region)
1461   return 1;
1462 }
1463
1464 /*
1465  * Because the prim poly is only 8 bits and we are limiting g_r to 16, I do not need the high 64
1466  * bits in all of these numbers.
1467  */
1468   static
1469 void gf_w128_group_r_init(gf_t *gf)
1470 {
1471   int i, j;
1472   int g_r;
1473   uint64_t pp;
1474   gf_internal_t *scratch;
1475   gf_group_tables_t *gt;
1476   scratch = (gf_internal_t *) gf->scratch;
1477   gt = scratch->private;
1478   g_r = scratch->arg2;
1479   pp = scratch->prim_poly;
1480
1481   gt->r_table[0] = 0;
1482   for (i = 1; i < (1 << g_r); i++) {
1483     gt->r_table[i] = 0;
1484     for (j = 0; j < g_r; j++) {
1485       if (i & (1 << j)) {
1486         gt->r_table[i] ^= (pp << j);
1487       }
1488     }
1489   }
1490   return;
1491 }
1492
1493 #if 0 // defined(INTEL_SSE4)
1494   static
1495 void gf_w128_group_r_sse_init(gf_t *gf)
1496 {
1497   int i, j;
1498   int g_r;
1499   uint64_t pp;
1500   gf_internal_t *scratch;
1501   gf_group_tables_t *gt;
1502   scratch = (gf_internal_t *) gf->scratch;
1503   gt = scratch->private;
1504   __m128i zero = _mm_setzero_si128();
1505   __m128i *table = (__m128i *)(gt->r_table);
1506   g_r = scratch->arg2;
1507   pp = scratch->prim_poly;
1508   table[0] = zero;
1509   for (i = 1; i < (1 << g_r); i++) {
1510     table[i] = zero;
1511     for (j = 0; j < g_r; j++) {
1512       if (i & (1 << j)) {
1513         table[i] = _mm_xor_si128(table[i], _mm_insert_epi64(zero, pp << j, 0));
1514       }
1515     }
1516   }
1517   return;
1518 }
1519 #endif
1520
1521   static
1522 int gf_w128_split_init(gf_t *gf)
1523 {
1524   struct gf_w128_split_4_128_data *sd4;
1525   struct gf_w128_split_8_128_data *sd8;
1526   gf_internal_t *h;
1527
1528   h = (gf_internal_t *) gf->scratch;
1529
1530   SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply)
1531 #if defined(INTEL_SSE4_PCLMUL)
1532   if (gf_cpu_supports_intel_pclmul && !(h->region_type & GF_REGION_NOSIMD)){
1533     SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
1534   }
1535 #endif
1536
1537   SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
1538
1539   if ((h->arg1 != 4 && h->arg2 != 4) || h->mult_type == GF_MULT_DEFAULT) {
1540     sd8 = (struct gf_w128_split_8_128_data *) h->private;
1541     sd8->last_value[0] = 0;
1542     sd8->last_value[1] = 0;
1543     SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_8_128_multiply_region)
1544   } else {
1545     sd4 = (struct gf_w128_split_4_128_data *) h->private;
1546     sd4->last_value[0] = 0;
1547     sd4->last_value[1] = 0;
1548     if((h->region_type & GF_REGION_ALTMAP))
1549     {
1550       #ifdef INTEL_SSE4
1551         if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
1552           SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_altmap_multiply_region)
1553         else
1554       #endif
1555           return 0;
1556     }
1557     else {
1558       #ifdef INTEL_SSE4
1559         if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
1560           SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_multiply_region)
1561         else
1562       #endif
1563         SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
1564     }
1565   }
1566   return 1;
1567 }
1568
1569
1570 static
1571 int gf_w128_group_init(gf_t *gf)
1572 {
1573   gf_internal_t *scratch;
1574   gf_group_tables_t *gt;
1575   int g_r, size_r;
1576
1577   scratch = (gf_internal_t *) gf->scratch;
1578   gt = scratch->private;
1579   g_r = scratch->arg2;
1580   size_r = (1 << g_r);
1581
1582   gt->r_table = (gf_val_128_t)((uint8_t *)scratch->private + (2 * sizeof(uint64_t *)));
1583   gt->m_table = gt->r_table + size_r;
1584   gt->m_table[2] = 0;
1585   gt->m_table[3] = 0;
1586
1587   SET_FUNCTION(gf,multiply,w128,gf_w128_group_multiply)
1588   SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
1589   SET_FUNCTION(gf,multiply_region,w128,gf_w128_group_multiply_region)
1590
1591   gf_w128_group_r_init(gf);
1592
1593   return 1;
1594 }
1595
1596 void gf_w128_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)
1597 {
1598   gf_val_128_t s;
1599
1600   s = (gf_val_128_t) start;
1601   s += (index * 2);
1602   memcpy(rv, s, 16);
1603 }
1604
1605 static void gf_w128_split_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)
1606 {
1607   int i, blocks;
1608   uint64_t *r64, tmp;
1609   uint8_t *r8;
1610   gf_region_data rd;
1611
1612   gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 256);
1613   r64 = (uint64_t *) start;
1614   if ((r64 + index*2 < (uint64_t *) rd.d_start) ||
1615       (r64 + index*2 >= (uint64_t *) rd.d_top)) {
1616     memcpy(rv, r64+(index*2), 16);
1617     return;
1618   }
1619
1620   index -= (((uint64_t *) rd.d_start) - r64)/2;
1621   r64 = (uint64_t *) rd.d_start;
1622
1623   blocks = index/16;
1624   r64 += (blocks*32);
1625   index %= 16;
1626   r8 = (uint8_t *) r64;
1627   r8 += index;
1628   rv[0] = 0;
1629   rv[1] = 0;
1630
1631   for (i = 0; i < 8; i++) {
1632     tmp = *r8;
1633     rv[1] |= (tmp << (i*8));
1634     r8 += 16;
1635   }
1636
1637   for (i = 0; i < 8; i++) {
1638     tmp = *r8;
1639     rv[0] |= (tmp << (i*8));
1640     r8 += 16;
1641   }
1642   return;
1643 }
1644
1645   static
1646 void gf_w128_composite_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)
1647 {
1648   int sub_size;
1649   gf_internal_t *h;
1650   uint8_t *r8, *top;
1651   uint64_t *r64;
1652   gf_region_data rd;
1653
1654   h = (gf_internal_t *) gf->scratch;
1655   gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64);
1656   r64 = (uint64_t *) start;
1657   if ((r64 + index*2 < (uint64_t *) rd.d_start) ||
1658       (r64 + index*2 >= (uint64_t *) rd.d_top)) {
1659     memcpy(rv, r64+(index*2), 16);
1660     return;
1661   }
1662   index -= (((uint64_t *) rd.d_start) - r64)/2;
1663   r8 = (uint8_t *) rd.d_start;
1664   top = (uint8_t *) rd.d_top;
1665   sub_size = (top-r8)/2;
1666
1667   rv[1] = h->base_gf->extract_word.w64(h->base_gf, r8, sub_size, index);
1668   rv[0] = h->base_gf->extract_word.w64(h->base_gf, r8+sub_size, sub_size, index);
1669
1670   return;
1671 }
1672
1673 int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
1674 {
1675   int size_m, size_r;
1676   if (divide_type==GF_DIVIDE_MATRIX) return 0;
1677
1678   switch(mult_type)
1679   {
1680     case GF_MULT_CARRY_FREE:
1681       return sizeof(gf_internal_t);
1682       break;
1683     case GF_MULT_SHIFT:
1684       return sizeof(gf_internal_t);
1685       break;
1686     case GF_MULT_BYTWO_p:
1687     case GF_MULT_BYTWO_b:
1688       return sizeof(gf_internal_t);
1689       break;
1690     case GF_MULT_DEFAULT:
1691     case GF_MULT_SPLIT_TABLE:
1692       if ((arg1 == 4 && arg2 == 128) || (arg1 == 128 && arg2 == 4)) {
1693         return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_4_128_data) + 64;
1694       } else if ((arg1 == 8 && arg2 == 128) || (arg1 == 128 && arg2 == 8) || mult_type == GF_MULT_DEFAULT) {
1695         return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_8_128_data) + 64;
1696       }
1697       return 0;
1698       break;
1699     case GF_MULT_GROUP:
1700       /* JSP We've already error checked the arguments. */
1701       size_m = (1 << arg1) * 2 * sizeof(uint64_t);
1702       size_r = (1 << arg2) * 2 * sizeof(uint64_t);
1703       /*
1704        * two pointers prepend the table data for structure
1705        * because the tables are of dynamic size
1706        */
1707       return sizeof(gf_internal_t) + size_m + size_r + 4 * sizeof(uint64_t *);
1708       break;
1709     case GF_MULT_COMPOSITE:
1710       if (arg1 == 2) {
1711         return sizeof(gf_internal_t) + 4;
1712       } else {
1713         return 0;
1714       }
1715       break;
1716
1717     default:
1718       return 0;
1719    }
1720 }
1721
1722 int gf_w128_init(gf_t *gf)
1723 {
1724   gf_internal_t *h;
1725
1726   h = (gf_internal_t *) gf->scratch;
1727
1728   /* Allen: set default primitive polynomial / irreducible polynomial if needed */
1729
1730   if (h->prim_poly == 0) {
1731     if (h->mult_type == GF_MULT_COMPOSITE) {
1732       h->prim_poly = gf_composite_get_default_poly(h->base_gf);
1733       if (h->prim_poly == 0) return 0; /* This shouldn't happen */
1734     } else {
1735       h->prim_poly = 0x87; /* Omitting the leftmost 1 as in w=32 */
1736     }
1737   }
1738
1739   SET_FUNCTION(gf,multiply,w128,NULL)
1740   SET_FUNCTION(gf,divide,w128,NULL)
1741   SET_FUNCTION(gf,inverse,w128,NULL)
1742   SET_FUNCTION(gf,multiply_region,w128,NULL)
1743   switch(h->mult_type) {
1744     case GF_MULT_BYTWO_p:
1745     case GF_MULT_BYTWO_b:      if (gf_w128_bytwo_init(gf) == 0) return 0; break;
1746     case GF_MULT_CARRY_FREE:   if (gf_w128_cfm_init(gf) == 0) return 0; break;
1747     case GF_MULT_SHIFT:        if (gf_w128_shift_init(gf) == 0) return 0; break;
1748     case GF_MULT_GROUP:        if (gf_w128_group_init(gf) == 0) return 0; break;
1749     case GF_MULT_DEFAULT:
1750     case GF_MULT_SPLIT_TABLE:  if (gf_w128_split_init(gf) == 0) return 0; break;
1751     case GF_MULT_COMPOSITE:    if (gf_w128_composite_init(gf) == 0) return 0; break;
1752     default: return 0;
1753   }
1754
1755   /* Ben: Used to be h->region_type == GF_REGION_ALTMAP, but failed since there
1756      are multiple flags in h->region_type */
1757   if (h->mult_type == GF_MULT_SPLIT_TABLE && (h->region_type & GF_REGION_ALTMAP)) {
1758     SET_FUNCTION(gf,extract_word,w128,gf_w128_split_extract_word)
1759   } else if (h->mult_type == GF_MULT_COMPOSITE && h->region_type == GF_REGION_ALTMAP) {
1760     SET_FUNCTION(gf,extract_word,w128,gf_w128_composite_extract_word)
1761   } else {
1762     SET_FUNCTION(gf,extract_word,w128,gf_w128_extract_word)
1763   }
1764
1765   if (h->divide_type == GF_DIVIDE_EUCLID) {
1766     SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse)
1767   }
1768
1769   if (gf->inverse.w128 != NULL && gf->divide.w128 == NULL) {
1770     SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse)
1771   }
1772   if (gf->inverse.w128 == NULL && gf->divide.w128 != NULL) {
1773     SET_FUNCTION(gf,inverse,w128,gf_w128_inverse_from_divide)
1774   }
1775   return 1;
1776 }