2 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
3 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
4 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
8 * Generic routines for Galois fields
/* Most recent library error code (one of the GF_E_* constants).  Set by
   gf_error_check() and the argument parser; presumably read by the
   error-reporting routine below -- its header is elided in this extraction. */
17 int _gf_errno = GF_E_DEFAULT;
/* Fragment of the error-reporting routine: maps each GF_E_* code (from
   _gf_errno) to a human-readable message and prints it to stderr.
   NOTE(review): the enclosing function header, the switch statement line,
   and the closing braces are elided in this extraction -- confirm against
   the full source before editing. */
24 case GF_E_DEFAULT: s = "No Error."; break;
25 case GF_E_TWOMULT: s = "Cannot specify two -m's."; break;
26 case GF_E_TWO_DIV: s = "Cannot specify two -d's."; break;
27 case GF_E_POLYSPC: s = "-p needs to be followed by a number in hex (0x optional)."; break;
28 case GF_E_GROUPAR: s = "Ran out of arguments in -m GROUP."; break;
29 case GF_E_GROUPNU: s = "In -m GROUP g_s g_r -- g_s and g_r need to be numbers."; break;
30 case GF_E_SPLITAR: s = "Ran out of arguments in -m SPLIT."; break;
31 case GF_E_SPLITNU: s = "In -m SPLIT w_a w_b -- w_a and w_b need to be numbers."; break;
32 case GF_E_FEWARGS: s = "Not enough arguments (Perhaps end with '-'?)"; break;
33 case GF_E_CFM___W: s = "-m CARRY_FREE, w must be 4, 8, 16, 32, 64 or 128."; break;
34 case GF_E_COMPXPP: s = "-m COMPOSITE, No poly specified, and we don't have a default for the given sub-field."; break;
35 case GF_E_BASE__W: s = "-m COMPOSITE and the base field is not for w/2."; break;
36 case GF_E_CFM4POL: s = "-m CARRY_FREE, w=4. (Prim-poly & 0xc) must equal 0."; break;
37 case GF_E_CFM8POL: s = "-m CARRY_FREE, w=8. (Prim-poly & 0x80) must equal 0."; break;
38 case GF_E_CF16POL: s = "-m CARRY_FREE, w=16. (Prim-poly & 0xe000) must equal 0."; break;
39 case GF_E_CF32POL: s = "-m CARRY_FREE, w=32. (Prim-poly & 0xfe000000) must equal 0."; break;
40 case GF_E_CF64POL: s = "-m CARRY_FREE, w=64. (Prim-poly & 0xfffe000000000000ULL) must equal 0."; break;
41 case GF_E_MDEFDIV: s = "If multiplication method == default, can't change division."; break;
42 case GF_E_MDEFREG: s = "If multiplication method == default, can't change region."; break;
43 case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break;
44 case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break;
45 case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break;
46 case GF_E_SIMD_NO: s = "Cannot specify -r SIMD and -r NOSIMD."; break;
47 case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break;
48 case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break;
49 case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break;
50 case GF_E_ARG1SET: s = "Only use arg1 with SPLIT, GROUP or COMPOSITE."; break;
51 case GF_E_ARG2SET: s = "Only use arg2 with SPLIT or GROUP."; break;
52 case GF_E_MATRIXW: s = "Cannot specify -d MATRIX with w > 32."; break;
53 case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break;
54 case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break;
55 case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break;
56 case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SIMD|NOSIMD."; break;
57 case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break;
58 case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break;
59 case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break;
60 case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SIMD|NOSIMD."; break;
61 case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break;
62 case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break;
63 case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break;
64 case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break;
65 case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SIMD|NOSIMD."; break;
66 case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break;
67 case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SIMD|NOSIMD."; break;
68 case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break;
69 case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break;
70 case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SIMD, but SSE2 is not supported."; break;
71 case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break;
72 case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SIMD|NOSIMD."; break;
73 case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break;
74 case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break;
75 case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break;
76 case GF_E_GR_ARGX: s = "With -m GROUP, arg1 and arg2 must be >= 0."; break;
77 case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break;
78 case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break;
79 case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break;
80 case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break;
81 case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break;
82 case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SIMD|NOSIMD."; break;
83 case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break;
84 case GF_E_TAB_SSE: s = "With -m TABLE, SIMD|NOSIMD only applies to w=4."; break;
85 case GF_E_TABSSE3: s = "With -m TABLE, -r SIMD, you need SSSE3 supported."; break;
86 case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break;
87 case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break;
88 case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SIMD requires -r ALTMAP."; break;
89 case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break;
90 case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break;
91 case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SIMD|NOSIMD only with arg1/arg2 = 4/128."; break;
92 case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break;
93 case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break;
94 case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break;
95 case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SIMD|NOSIMD only with arg1/arg2 = 4/16."; break;
96 case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break;
97 case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break;
98 case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break;
99 case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SIMD|NOSIMD only with arg1/arg2 = 4/32."; break;
100 case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break;
101 case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break;
102 case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break;
103 case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SIMD|NOSIMD only with arg1/arg2 = 4/64."; break;
104 case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break;
105 case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break;
106 case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SIMD."; break;
107 case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break;
108 case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SIMD and -r NOSIMD do not apply."; break;
109 case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break;
110 case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break;
111 case GF_E_UNKNOWN: s = "Unknown multiplication type."; break;
112 case GF_E_UNK_REG: s = "Unknown region type."; break;
113 case GF_E_UNK_DIV: s = "Unknown division type."; break;
114 default: s = "Undefined error.";
/* Emit the selected message on stderr. */
117 fprintf(stderr, "%s\n", s);
/* Returns the default primitive polynomial that the COMPOSITE multiplication
   implementation knows how to use over the given base field, or 0 when the
   base field's polynomial is not one of the recognized defaults (callers then
   fail with GF_E_COMPXPP).  Recurses when the base field is itself composite,
   matching the recursive result against the base field's recorded poly.
   NOTE(review): the w-dispatch lines separating the cases below are elided in
   this extraction; the grouping by base-field width is inferred -- confirm
   against the full source. */
120 uint64_t gf_composite_get_default_poly(gf_t *base)
125 h = (gf_internal_t *) base->scratch;
127 if (h->mult_type == GF_MULT_COMPOSITE) return 0;
128 if (h->prim_poly == 0x13) return 2;
132 if (h->mult_type == GF_MULT_COMPOSITE) return 0;
133 if (h->prim_poly == 0x11d) return 3;
137 if (h->mult_type == GF_MULT_COMPOSITE) {
138 rv = gf_composite_get_default_poly(h->base_gf);
139 if (rv != h->prim_poly) return 0;
140 if (rv == 3) return 0x105;
143 if (h->prim_poly == 0x1100b) return 2;
144 if (h->prim_poly == 0x1002d) return 7;
149 if (h->mult_type == GF_MULT_COMPOSITE) {
150 rv = gf_composite_get_default_poly(h->base_gf);
151 if (rv != h->prim_poly) return 0;
152 if (rv == 2) return 0x10005;
153 if (rv == 7) return 0x10008;
154 if (rv == 0x105) return 0x10002;
157 if (h->prim_poly == 0x400007) return 2;
158 if (h->prim_poly == 0xc5) return 3;
163 if (h->mult_type == GF_MULT_COMPOSITE) {
164 rv = gf_composite_get_default_poly(h->base_gf);
165 if (rv != h->prim_poly) return 0;
166 if (rv == 3) return 0x100000009ULL;
167 if (rv == 2) return 0x100000004ULL;
168 if (rv == 0x10005) return 0x100000003ULL;
169 if (rv == 0x10002) return 0x100000005ULL;
170 if (rv == 0x10008) return 0x100000006ULL; /* JSP: (0x100000003 works too,
171 but I want to differentiate cases). */
174 if (h->prim_poly == 0x1bULL) return 2;
/* Validates one complete combination of gf_init_hard() arguments: word size w,
   multiplication/region/division types, arg1/arg2, primitive polynomial, and
   (for COMPOSITE) the base field.  On any illegal combination it sets
   _gf_errno to the specific GF_E_* code and returns 0; a legal combination
   returns 1.
   NOTE(review): the declarations of sse2/sse3/pclmul and several braces,
   returns, and #ifdef guards are elided in this extraction -- the CPU-feature
   section below presumably sets those flags; confirm against full source. */
181 int gf_error_check(int w, int mult_type, int region_type, int divide_type,
182 int arg1, int arg2, uint64_t poly, gf_t *base)
187 int rdouble, rquad, rlazy, rsimd, rnosimd, raltmap, rcauchy, tmp;
/* Decompose region_type into one boolean per GF_REGION_* flag. */
190 rdouble = (region_type & GF_REGION_DOUBLE_TABLE);
191 rquad = (region_type & GF_REGION_QUAD_TABLE);
192 rlazy = (region_type & GF_REGION_LAZY);
193 rsimd = (region_type & GF_REGION_SIMD);
194 rnosimd = (region_type & GF_REGION_NOSIMD);
195 raltmap = (region_type & GF_REGION_ALTMAP);
196 rcauchy = (region_type & GF_REGION_CAUCHY);
/* Reject unknown division types outright. */
198 if (divide_type != GF_DIVIDE_DEFAULT &&
199 divide_type != GF_DIVIDE_MATRIX &&
200 divide_type != GF_DIVIDE_EUCLID) {
201 _gf_errno = GF_E_UNK_DIV;
/* Any region bits outside the known flag set are an error. */
205 tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY |
206 GF_REGION_SIMD | GF_REGION_NOSIMD | GF_REGION_ALTMAP |
208 if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }
/* CPU-feature probes (sse2/sse3/pclmul flags -- declarations elided here). */
211 if (gf_cpu_supports_intel_sse2) {
217 if (gf_cpu_supports_intel_ssse3) {
222 #ifdef INTEL_SSE4_PCLMUL
223 if (gf_cpu_supports_intel_pclmul) {
229 if (gf_cpu_supports_arm_neon) {
230 pclmul = (w == 4 || w == 8);
/* Legal word sizes: 1..32, 64, 128. */
236 if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; }
/* For w < 64, the polynomial must fit in w+1 bits (COMPOSITE checked later). */
238 if (mult_type != GF_MULT_COMPOSITE && w < 64) {
239 if ((poly >> (w+1)) != 0) { _gf_errno = GF_E_BADPOLY; return 0; }
/* GF_MULT_DEFAULT admits no other customization. */
242 if (mult_type == GF_MULT_DEFAULT) {
243 if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_MDEFDIV; return 0; }
244 if (region_type != GF_REGION_DEFAULT) { _gf_errno = GF_E_MDEFREG; return 0; }
245 if (arg1 != 0 || arg2 != 0) { _gf_errno = GF_E_MDEFARG; return 0; }
249 if (rsimd && rnosimd) { _gf_errno = GF_E_SIMD_NO; return 0; }
/* CAUCHY must stand alone, w <= 32, and not combine with COMPOSITE. */
250 if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; }
251 if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; }
252 if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; }
/* arg1/arg2 are only meaningful for SPLIT, GROUP, COMPOSITE (arg1). */
254 if (arg1 != 0 && mult_type != GF_MULT_COMPOSITE &&
255 mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
256 _gf_errno = GF_E_ARG1SET;
260 if (arg2 != 0 && mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
261 _gf_errno = GF_E_ARG2SET;
265 if (divide_type == GF_DIVIDE_MATRIX && w > 32) { _gf_errno = GF_E_MATRIXW; return 0; }
/* DOUBLE_TABLE constraints (TABLE only, w in {4,8}, no SIMD/ALTMAP). */
268 if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; }
269 if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; }
270 if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; }
271 if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
272 if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; }
/* QUAD_TABLE constraints (TABLE only, w == 4, no SIMD/ALTMAP). */
277 if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; }
278 if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; }
279 if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
/* LAZY is only legal together with DOUBLE or QUAD (checked above). */
283 if (rlazy) { _gf_errno = GF_E_LAZY__X; return 0; }
285 if (mult_type == GF_MULT_SHIFT) {
286 if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; }
287 if (rsimd || rnosimd) { _gf_errno = GF_E_SSESHIF; return 0; }
/* CARRY_FREE: restricted word sizes, top poly bits clear, PCLMUL required. */
291 if (mult_type == GF_MULT_CARRY_FREE) {
292 if (w != 4 && w != 8 && w != 16 &&
293 w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
294 if (w == 4 && (poly & 0xc)) { _gf_errno = GF_E_CFM4POL; return 0; }
295 if (w == 8 && (poly & 0x80)) { _gf_errno = GF_E_CFM8POL; return 0; }
296 if (w == 16 && (poly & 0xe000)) { _gf_errno = GF_E_CF16POL; return 0; }
297 if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; }
298 if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; }
299 if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
300 if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
301 if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
/* CARRY_FREE_GK: same restrictions minus the per-w poly-bit checks. */
305 if (mult_type == GF_MULT_CARRY_FREE_GK) {
306 if (w != 4 && w != 8 && w != 16 &&
307 w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
308 if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
309 if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
310 if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
314 if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
315 if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; }
316 if (rsimd && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; }
/* Log-table families: w <= 27, no region tricks; LOG_ZERO w in {8,16};
   LOG_ZERO_EXT w == 8 only. */
320 if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO
321 || mult_type == GF_MULT_LOG_ZERO_EXT ) {
322 if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; }
323 if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_LOG___J; return 0; }
325 if (mult_type == GF_MULT_LOG_TABLE) return 1;
327 if (w != 8 && w != 16) { _gf_errno = GF_E_ZERBADW; return 0; }
329 if (mult_type == GF_MULT_LOG_ZERO) return 1;
331 if (w != 8) { _gf_errno = GF_E_ZEXBADW; return 0; }
/* GROUP: positive args, constrained per w, capped at 27 and at w. */
335 if (mult_type == GF_MULT_GROUP) {
336 if (arg1 <= 0 || arg2 <= 0) { _gf_errno = GF_E_GR_ARGX; return 0; }
337 if (w == 4 || w == 8) { _gf_errno = GF_E_GR_W_48; return 0; }
338 if (w == 16 && (arg1 != 4 || arg2 != 4)) { _gf_errno = GF_E_GR_W_16; return 0; }
339 if (w == 128 && (arg1 != 4 ||
340 (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; }
341 if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; }
342 if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; }
343 if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_GR____J; return 0; }
/* TABLE: w < 15 or w == 16; SIMD flags only for w == 4; no ALTMAP. */
347 if (mult_type == GF_MULT_TABLE) {
348 if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; }
349 if (w != 4 && (rsimd || rnosimd)) { _gf_errno = GF_E_TAB_SSE; return 0; }
350 if (rsimd && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; }
351 if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; }
/* SPLIT_TABLE: each legal (w, arg1, arg2) combination has its own
   SIMD/ALTMAP constraints.  The w == 8 dispatch line is elided here. */
355 if (mult_type == GF_MULT_SPLIT_TABLE) {
362 if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; }
363 if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
364 if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; }
365 } else if (w == 16) {
366 if ((arg1 == 8 && arg2 == 8) ||
367 (arg1 == 8 && arg2 == 16)) {
368 if (rsimd || rnosimd) { _gf_errno = GF_E_SP_16_S; return 0; }
369 if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; }
370 } else if (arg1 == 4 && arg2 == 16) {
371 if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
372 } else { _gf_errno = GF_E_SP_16AR; return 0; }
373 } else if (w == 32) {
374 if ((arg1 == 8 && arg2 == 8) ||
375 (arg1 == 8 && arg2 == 32) ||
376 (arg1 == 16 && arg2 == 32)) {
377 if (rsimd || rnosimd) { _gf_errno = GF_E_SP_32_S; return 0; }
378 if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; }
379 } else if (arg1 == 4 && arg2 == 32) {
380 if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
381 if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; }
382 if (raltmap && rnosimd) { _gf_errno = GF_E_SP_32AS; return 0; }
383 } else { _gf_errno = GF_E_SP_32AR; return 0; }
384 } else if (w == 64) {
385 if ((arg1 == 8 && arg2 == 8) ||
386 (arg1 == 8 && arg2 == 64) ||
387 (arg1 == 16 && arg2 == 64)) {
388 if (rsimd || rnosimd) { _gf_errno = GF_E_SP_64_S; return 0; }
389 if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; }
390 } else if (arg1 == 4 && arg2 == 64) {
391 if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
392 if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; }
393 if (raltmap && rnosimd) { _gf_errno = GF_E_SP_64AS; return 0; }
394 } else { _gf_errno = GF_E_SP_64AR; return 0; }
395 } else if (w == 128) {
396 if (arg1 == 8 && arg2 == 128) {
397 if (rsimd || rnosimd) { _gf_errno = GF_E_SP128_S; return 0; }
398 if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; }
399 } else if (arg1 == 4 && arg2 == 128) {
400 if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
401 if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; }
402 if (raltmap && rnosimd) { _gf_errno = GF_E_SP128AS; return 0; }
403 } else { _gf_errno = GF_E_SP128AR; return 0; }
404 } else { _gf_errno = GF_E_SPLIT_W; return 0; }
/* COMPOSITE: arg1 == 2, base field must be for w/2, default poly must exist
   when the supplied poly is 0 (the poly == 0 guard line is elided here). */
408 if (mult_type == GF_MULT_COMPOSITE) {
409 if (w != 8 && w != 16 && w != 32
410 && w != 64 && w != 128) { _gf_errno = GF_E_COMP__W; return 0; }
411 if (w < 128 && (poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; }
412 if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; }
413 if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; }
414 if (rsimd || rnosimd) { _gf_errno = GF_E_COMP_SS; return 0; }
416 sub = (gf_internal_t *) base->scratch;
417 if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; }
419 if (gf_composite_get_default_poly(base) == 0) { _gf_errno = GF_E_COMPXPP; return 0; }
/* Fallthrough: mult_type matched none of the known implementations. */
425 _gf_errno = GF_E_UNKNOWN;
/* Returns the number of scratch bytes gf_init_hard() will need for this
   parameter combination, dispatching to the per-word-size sizing routine.
   Returns 0 when gf_error_check() rejects the combination (poly and base are
   not needed for sizing, so 0 and NULL are passed). */
429 int gf_scratch_size(int w,
436 if (gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, 0, NULL) == 0) return 0;
439 case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
440 case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
441 case 16: return gf_w16_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
442 case 32: return gf_w32_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
443 case 64: return gf_w64_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
444 case 128: return gf_w128_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
445 default: return gf_wgen_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
/* Total memory footprint of an initialized gf_t: its scratch allocation
   (recomputed from the recorded init parameters), plus -- for COMPOSITE --
   the size of the base field, recursively.
   NOTE(review): the declaration/initialization of s is elided in this
   extraction; presumably it starts at sizeof(gf_t) -- confirm. */
449 extern int gf_size(gf_t *gf)
455 h = (gf_internal_t *) gf->scratch;
456 s += gf_scratch_size(h->w, h->mult_type, h->region_type, h->divide_type, h->arg1, h->arg2);
457 if (h->mult_type == GF_MULT_COMPOSITE) s += gf_size(h->base_gf);
/* Convenience initializer: gf_init_hard() with every option defaulted
   (default mult/region/divide, no args, default poly, no base field,
   library-allocated scratch). */
462 int gf_init_easy(gf_t *gf, int w)
464 return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
465 0, 0, 0, NULL, NULL);
468 /* Allen: What's going on here is this function is putting info into the
469 scratch mem of gf, and then calling the relevant REAL init
470 func for the word size. Probably done this way to consolidate
471 those aspects of initialization that don't rely on word size,
472 and then take care of word-size-specific stuff. */
/* Full initializer: validates the argument combination, obtains scratch
   memory (malloc'ing it when scratch_memory is NULL), records all the init
   parameters in the gf_internal_t header at the front of scratch, and then
   dispatches to the word-size-specific init routine.  Returns 0 on any
   validation or allocation failure.
   NOTE(review): the malloc-failure check, the free_me assignment, and the
   h->w / h->arg1 / h->arg2 assignments are elided in this extraction. */
474 int gf_init_hard(gf_t *gf, int w, int mult_type,
480 void *scratch_memory)
487 if (gf_error_check(w, mult_type, region_type, divide_type,
488 arg1, arg2, prim_poly, base_gf) == 0) return 0;
490 sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
491 if (sz <= 0) return 0; /* This shouldn't happen, as all errors should get caught
492 in gf_error_check() */
494 if (scratch_memory == NULL) {
495 h = (gf_internal_t *) malloc(sz);
501 gf->scratch = (void *) h;
502 h->mult_type = mult_type;
503 h->region_type = region_type;
504 h->divide_type = divide_type;
506 h->prim_poly = prim_poly;
509 h->base_gf = base_gf;
/* The implementation's private area starts right after the header. */
510 h->private = (void *) gf->scratch;
511 h->private = (uint8_t *)h->private + (sizeof(gf_internal_t));
512 gf->extract_word.w32 = NULL;
515 case 4: return gf_w4_init(gf);
516 case 8: return gf_w8_init(gf);
517 case 16: return gf_w16_init(gf);
518 case 32: return gf_w32_init(gf);
519 case 64: return gf_w64_init(gf);
520 case 128: return gf_w128_init(gf);
521 default: return gf_wgen_init(gf);
/* Releases a gf_t's scratch memory.  Scratch is freed only when the library
   allocated it (h->free_me); caller-supplied scratch is left alone.  With
   recursive != 0, the base field of a COMPOSITE gf is freed first.
   NOTE(review): the free() of the recursively-freed base_gf struct itself,
   if any, is elided in this extraction. */
525 int gf_free(gf_t *gf, int recursive)
529 h = (gf_internal_t *) gf->scratch;
530 if (recursive && h->base_gf != NULL) {
531 gf_free(h->base_gf, 1);
534 if (h->free_me) free(h);
535 return 0; /* Making compiler happy */
/* Prints a uniform alignment-failure diagnostic for region operations:
   s names the failing routine, a is the required byte alignment.
   NOTE(review): whether this also exits/aborts is elided in this extraction. */
538 void gf_alignment_error(char *s, int a)
540 fprintf(stderr, "Alignment error in %s:\n", s);
541 fprintf(stderr, " The source and destination buffers must be aligned to each other,\n");
542 fprintf(stderr, " and they must be aligned to a %d-byte address.\n", a);
/* Inverts a bit matrix over GF(2) by Gaussian elimination.  Each of the
   `rows` words in mat is one row, bit i of row j being element (j,i).
   inv starts as the identity and receives the inverse; mat is destroyed.
   NOTE(review): cols initialization, the not-invertible bailout, and the
   row-XOR statements are partially elided in this extraction. */
547 void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) {
/* Start inv as the identity matrix. */
553 for (i = 0; i < rows; i++) inv[i] = (1 << i);
555 /* First -- convert into upper triangular */
557 for (i = 0; i < cols; i++) {
559 /* Swap rows if we have a zero i,i element. If we can't swap, then the
560 matrix was not invertible */
562 if ((mat[i] & (1 << i)) == 0) {
563 for (j = i+1; j < rows && (mat[j] & (1 << i)) == 0; j++) ;
565 fprintf(stderr, "galois_invert_matrix: Matrix not invertible!!\n");
568 tmp = mat[i]; mat[i] = mat[j]; mat[j] = tmp;
569 tmp = inv[i]; inv[i] = inv[j]; inv[j] = tmp;
572 /* Now for each j>i, add A_ji*Ai to Aj */
573 for (j = i+1; j != rows; j++) {
574 if ((mat[j] & (1 << i)) != 0) {
581 /* Now the matrix is upper triangular. Start at the top and multiply down */
583 for (i = rows-1; i >= 0; i--) {
584 for (j = 0; j < i; j++) {
585 if (mat[j] & (1 << i)) {
586 /* mat[j] ^= mat[i]; */
/* Computes the multiplicative inverse of y in GF(2^w) with polynomial pp by
   building y's w x w bit-matrix representation (each column is y * x^i,
   reduced mod pp) and inverting it with gf_invert_binary_matrix().
   NOTE(review): the mat[i] assignments inside the loop and the extraction of
   the result from inv are elided in this extraction. */
593 uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp)
595 uint32_t mat[32], inv[32], mask;
/* All-ones mask of width w (special-cased to avoid a 32-bit shift by 32). */
598 mask = (w == 32) ? 0xffffffff : ((uint32_t)1 << w) - 1;
599 for (i = 0; i < w; i++) {
/* Multiply the running value by x, reducing by pp on overflow of bit w-1. */
602 if (y & (1 << (w-1))) {
604 y = ((y ^ pp) & mask);
610 gf_invert_binary_matrix(mat, inv, w);
/* Region multiply using a 64K-entry table of 16-bit products (base): each
   64-bit word is processed as four 16-bit halfwords, combining the four
   table lookups into one 64-bit product word.
   NOTE(review): the loop headers, shift/accumulate statements between the
   lookups, and the xor/no-xor branch structure are elided in this
   extraction -- the two four-lookup sequences below are presumably the
   xor and non-xor variants of the inner loop. */
614 void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
618 uint64_t *s64, *d64, *top;
628 prod = base[a >> 48];
631 prod ^= base[a >> 48];
634 prod ^= base[a >> 48];
637 prod ^= base[a >> 48];
646 prod = base[a >> 48];
649 prod ^= base[a >> 48];
652 prod ^= base[a >> 48];
655 prod ^= base[a >> 48];
/* Word-at-a-time fallback region multiply: walks src..s_top in steps of one
   field element (wb bytes, from the field width), multiplying each word by
   rd->val via the scalar multiply entry point and optionally xoring into
   dest.  Used for the unaligned head/tail of a region operation.
   NOTE(review): the switch on h->w, wb computation, and several pointer
   declarations are elided in this extraction. */
663 static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, void *s_top)
677 while (src < s_top) {
/* w == 8 presumably: one byte per element. */
680 s8 = (uint8_t *) src;
681 d8 = (uint8_t *) dest;
682 *d8 = (rd->xor) ? (*d8 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s8)) :
683 rd->gf->multiply.w32(rd->gf, rd->val, *s8);
/* w == 4 presumably: two nibbles packed per byte, multiplied separately. */
686 s8 = (uint8_t *) src;
687 d8 = (uint8_t *) dest;
689 p = rd->gf->multiply.w32(rd->gf, rd->val, a&0xf);
690 p |= (rd->gf->multiply.w32(rd->gf, rd->val, a >> 4) << 4);
691 if (rd->xor) p ^= *d8;
695 s16 = (uint16_t *) src;
696 d16 = (uint16_t *) dest;
697 *d16 = (rd->xor) ? (*d16 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s16)) :
698 rd->gf->multiply.w32(rd->gf, rd->val, *s16);
701 s32 = (uint32_t *) src;
702 d32 = (uint32_t *) dest;
703 *d32 = (rd->xor) ? (*d32 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s32)) :
704 rd->gf->multiply.w32(rd->gf, rd->val, *s32);
707 s64 = (uint64_t *) src;
708 d64 = (uint64_t *) dest;
709 *d64 = (rd->xor) ? (*d64 ^ rd->gf->multiply.w64(rd->gf, rd->val, *s64)) :
710 rd->gf->multiply.w64(rd->gf, rd->val, *s64);
713 fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w);
/* Advance both cursors by one element. */
716 src = (uint8_t *)src + wb;
717 dest = (uint8_t *)dest + wb;
721 /* JSP - The purpose of this procedure is to error check alignment,
722 and to set up the region operation so that it can best leverage
725 It stores its information in rd.
727 Assuming you're not doing Cauchy coding, (see below for that),
728 then w will be 4, 8, 16, 32 or 64. It can't be 128 (probably
731 src and dest must then be aligned on ceil(w/8)-byte boundaries.
732 Moreover, bytes must be a multiple of ceil(w/8). If the variable
733 align is equal to ceil(w/8), then we will set s_start = src,
734 d_start = dest, s_top to (src+bytes) and d_top to (dest+bytes).
735 And we return -- the implementation will go ahead and do the
736 multiplication on individual words (e.g. using discrete logs).
738 If align is greater than ceil(w/8), then the implementation needs
739 to work on groups of "align" bytes. For example, suppose you are
740 implementing BYTWO, without SSE. Then you will be doing the region
741 multiplication in units of 8 bytes, so align = 8. Or, suppose you
742 are doing a Quad table in GF(2^4). You will be doing the region
743 multiplication in units of 2 bytes, so align = 2. Or, suppose you
744 are doing split multiplication with SSE operations in GF(2^8).
745 Then align = 16. Worse yet, suppose you are doing split
746 multiplication with SSE operations in GF(2^16), with or without
747 ALTMAP. Then, you will be doing the multiplication on 256 bits at
748 a time. So align = 32.
750 When align does not equal ceil(w/8), we split the region
751 multiplication into three parts. We are going to make s_start be
752 the first address greater than or equal to src that is a multiple
753 of align. s_top is going to be the largest address >= src+bytes
754 such that (s_top - s_start) is a multiple of align. We do the
755 same with d_start and d_top. When we say that "src and dest must
756 be aligned with respect to each other," we mean that s_start-src
757 must equal d_start-dest.
759 Now, the region multiplication is done in three parts -- the part
760 between src and s_start must be done using single words.
761 Similarly, the part between s_top and src+bytes must also be done
762 using single words. The part between s_start and s_top will be
763 done in chunks of "align" bytes.
765 One final thing -- if align > 16, then s_start and d_start will be
766 aligned on a 16 byte boundary. Perhaps we should have two
767 variables: align and chunksize. Then we'd have s_start & d_start
768 aligned to "align", and have s_top-s_start be a multiple of
769 chunksize. That may be less confusing, but it would be a big
772 Finally, if align = -1, then we are doing Cauchy multiplication,
773 using only XOR's. In this case, we're not going to care about
774 alignment because we are just doing XOR's. Instead, the only
775 thing we care about is that bytes must be a multiple of w.
777 This is not to say that alignment doesn't matter in performance
778 with XOR's. See that discussion in gf_multby_one().
780 After you call gf_set_region_data(), the procedure
781 gf_do_initial_region_alignment() calls gf->multiply.w32() on
782 everything between src and s_start. The procedure
783 gf_do_final_region_alignment() calls gf->multiply.w32() on
784 everything between s_top and src+bytes.
/* Fills in *rd for a region operation: error-checks pointer/size alignment
   and computes the aligned middle span [s_start, s_top) / [d_start, d_top)
   plus the unaligned head and tail handled by gf_do_initial/final_region_
   alignment().  See the long design comment above for the full contract.
   NOTE(review): the rd field assignments (gf/src/dest/bytes/val/xor), the
   wb computation, and the early return for align == wb are elided in this
   extraction. */
787 void gf_set_region_data(gf_region_data *rd,
796 gf_internal_t *h = NULL;
799 unsigned long uls, uld;
801 if (gf == NULL) { /* JSP - Can be NULL if you're just doing XOR's */
817 uls = (unsigned long) src;
818 uld = (unsigned long) dest;
/* Alignment constraint caps at 16 bytes even when chunks are larger. */
820 a = (align <= 16) ? align : 16;
822 if (align == -1) { /* JSP: This is cauchy. Error check bytes, then set up the pointers
823 so that there are no alignment regions. */
824 if (h != NULL && bytes % h->w != 0) {
825 fprintf(stderr, "Error in region multiply operation.\n");
826 fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w);
832 rd->s_top = (uint8_t *)src + bytes;
/* NOTE(review): d_top is computed from src, not dest -- looks suspicious;
   possibly harmless if d_top is unused on the cauchy path, but confirm
   against the full source before relying on it. */
833 rd->d_top = (uint8_t *)src + bytes;
/* src and dest must be mutually aligned modulo a. */
837 if (uls % a != uld % a) {
838 fprintf(stderr, "Error in region multiply operation.\n");
839 fprintf(stderr, "The source & destination pointers must be aligned with respect\n");
840 fprintf(stderr, "to each other along a %d byte boundary.\n", a);
841 fprintf(stderr, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src,
842 (unsigned long) dest);
847 fprintf(stderr, "Error in region multiply operation.\n");
848 fprintf(stderr, "The pointers must be aligned along a %d byte boundary.\n", wb);
849 fprintf(stderr, "Src = 0x%lx. Dest = 0x%lx\n", (unsigned long) src,
850 (unsigned long) dest);
854 if (bytes % wb != 0) {
855 fprintf(stderr, "Error in region multiply operation.\n");
856 fprintf(stderr, "The size must be a multiple of %d bytes.\n", wb);
/* Round s_start/d_start up to the next multiple of a (uls here holds the
   distance to that boundary -- its reduction mod a is elided above). */
861 if (uls != 0) uls = (a-uls);
862 rd->s_start = (uint8_t *)rd->src + uls;
863 rd->d_start = (uint8_t *)rd->dest + uls;
/* Trim the aligned span to a whole number of align-sized chunks. */
865 bytes -= (bytes % align);
866 rd->s_top = (uint8_t *)rd->s_start + bytes;
867 rd->d_top = (uint8_t *)rd->d_start + bytes;
/* Handles the unaligned head of a region operation: multiplies the words
   between src and s_start one at a time. */
871 void gf_do_initial_region_alignment(gf_region_data *rd)
873 gf_slow_multiply_region(rd, rd->src, rd->dest, rd->s_start);
/* Handles the unaligned tail of a region operation: multiplies the words
   between s_top and src+bytes one at a time. */
876 void gf_do_final_region_alignment(gf_region_data *rd)
878 gf_slow_multiply_region(rd, rd->s_top, rd->d_top, (uint8_t *)rd->src+rd->bytes);
/* Region multiply by the constant 0.
   NOTE(review): the entire body is elided in this extraction -- presumably
   bzero/memset when !xor and a no-op when xor; confirm against full source. */
881 void gf_multby_zero(void *dest, int bytes, int xor)
888 /* JSP - gf_multby_one tries to do this in the most efficient way
889 possible. If xor = 0, then simply call memcpy() since that
890 should be optimized by the system. Otherwise, try to do the xor
891 in the following order:
893 If src and dest are aligned with respect to each other on 16-byte
894 boundaries and you have SSE instructions, then use aligned SSE
897 If they aren't but you still have SSE instructions, use unaligned
900 If there are no SSE instructions, but they are aligned with
901 respect to each other on 8-byte boundaries, then do them with
904 Otherwise, call gf_unaligned_xor(), which does the following:
905 align a destination pointer along an 8-byte boundary, and then
906 memcpy 32 bytes at a time from the src pointer to an array of
907 doubles. I'm not sure if that's the best -- probably needs
908 testing, but this seems like it could be a black hole.
911 static void gf_unaligned_xor(void *src, void *dest, int bytes);
/* Region multiply by the constant 1: memcpy when !xor, otherwise dest ^= src
   using the fastest available path -- aligned SSE2, unaligned SSE2, NEON,
   64-bit words, or gf_unaligned_xor() as last resort (see the strategy
   comment above).
   NOTE(review): the early !xor return, the #ifdef INTEL_SSE2 guard, several
   byte-loop bodies, and some returns are elided in this extraction. */
913 void gf_multby_one(void *src, void *dest, int bytes, int xor)
915 unsigned long uls, uld;
917 uint64_t *s64, *d64, *dtop64;
/* Plain copy when no xor is requested. */
922 memcpy(dest, src, bytes);
925 uls = (unsigned long) src;
926 uld = (unsigned long) dest;
929 if (gf_cpu_supports_intel_sse2) {
932 s8 = (uint8_t *) src;
933 d8 = (uint8_t *) dest;
/* Mutually 16-byte-alignable: aligned SSE2 loads/stores over the middle,
   byte loops (bodies elided) over the head and tail. */
934 if (uls % 16 == uld % 16) {
935 gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
936 while (s8 != rd.s_start) {
941 while (s8 < (uint8_t *) rd.s_top) {
942 ms = _mm_load_si128 ((__m128i *)(s8));
943 md = _mm_load_si128 ((__m128i *)(d8));
944 md = _mm_xor_si128(md, ms);
945 _mm_store_si128((__m128i *)(d8), md);
949 while (s8 != (uint8_t *) src + bytes) {
/* Not mutually alignable: unaligned SSE2 over the largest 16-byte-multiple
   prefix, byte loop for the remainder. */
957 abytes = (bytes & 0xfffffff0);
959 while (d8 < (uint8_t *) dest + abytes) {
960 ms = _mm_loadu_si128 ((__m128i *)(s8));
961 md = _mm_loadu_si128 ((__m128i *)(d8));
962 md = _mm_xor_si128(md, ms);
963 _mm_storeu_si128((__m128i *)(d8), md);
967 while (d8 != (uint8_t *) dest+bytes) {
975 #if defined(ARM_NEON)
976 if (gf_cpu_supports_arm_neon) {
977 s8 = (uint8_t *) src;
978 d8 = (uint8_t *) dest;
980 if (uls % 16 == uld % 16) {
981 gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
982 while (s8 != rd.s_start) {
987 while (s8 < (uint8_t *) rd.s_top) {
988 uint8x16_t vs = vld1q_u8 (s8);
989 uint8x16_t vd = vld1q_u8 (d8);
990 uint8x16_t vr = veorq_u8 (vs, vd);
996 while (s8 + 15 < (uint8_t *) src + bytes) {
997 uint8x16_t vs = vld1q_u8 (s8);
998 uint8x16_t vd = vld1q_u8 (d8);
999 uint8x16_t vr = veorq_u8 (vs, vd);
1005 while (s8 < (uint8_t *) src + bytes) {
/* No SIMD: if not mutually 8-byte-alignable, punt to gf_unaligned_xor(). */
1013 if (uls % 8 != uld % 8) {
1014 gf_unaligned_xor(src, dest, bytes);
/* Otherwise xor 64-bit words over the aligned middle, bytes at the edges. */
1018 gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 8);
1019 s8 = (uint8_t *) src;
1020 d8 = (uint8_t *) dest;
1021 while (d8 != rd.d_start) {
1026 dtop64 = (uint64_t *) rd.d_top;
1028 d64 = (uint64_t *) rd.d_start;
1029 s64 = (uint64_t *) rd.s_start;
1031 while (d64 < dtop64) {
1037 s8 = (uint8_t *) rd.s_top;
1038 d8 = (uint8_t *) rd.d_top;
1040 while (d8 != (uint8_t *) dest+bytes) {
/* Number of 64-bit words staged per memcpy round in gf_unaligned_xor(). */
1048 #define UNALIGNED_BUFSIZE (8)
/* xor's src into dest when the two pointers cannot be mutually 8-byte
   aligned: aligns dest, then repeatedly memcpy's 8*UNALIGNED_BUFSIZE source
   bytes into an aligned staging buffer and xors them in as 64-bit words,
   finishing the edges byte by byte.
   NOTE(review): this definition is truncated at the end of this extraction
   (the final byte loop's body is cut off). */
1050 static void gf_unaligned_xor(void *src, void *dest, int bytes)
1052 uint64_t scopy[UNALIGNED_BUFSIZE], *d64;
1057 /* JSP - call gf_set_region_data(), but use dest in both places. This is
1058 because I only want to set up dest. If I used src, gf_set_region_data()
1059 would fail because src and dest are not aligned to each other wrt
1060 8-byte pointers. I know this will actually align d_start to 16 bytes.
1061 If I change gf_set_region_data() to split alignment & chunksize, then
1062 I could do this correctly. */
1064 gf_set_region_data(&rd, NULL, dest, dest, bytes, 1, 1, 8*UNALIGNED_BUFSIZE);
1065 s8 = (uint8_t *) src;
1066 d8 = (uint8_t *) dest;
/* Head: byte xors until dest reaches its aligned start. */
1068 while (d8 < (uint8_t *) rd.d_start) {
/* Middle: stage a chunk of src into scopy, then xor it in word-wise. */
1074 d64 = (uint64_t *) d8;
1075 while (d64 < (uint64_t *) rd.d_top) {
1076 memcpy(scopy, s8, 8*UNALIGNED_BUFSIZE);
1077 s8 += 8*UNALIGNED_BUFSIZE;
1078 for (i = 0; i < UNALIGNED_BUFSIZE; i++) {
/* Tail: remaining bytes one at a time. */
1084 d8 = (uint8_t *) d64;
1085 while (d8 < (uint8_t *) ((uint8_t *)dest+bytes)) {