2 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
3 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
4 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
6 * Copyright (c) 2014: Janne Grunau <j@jannau.net>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * - Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
15 * - Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
20 * - Neither the name of the University of Tennessee nor the names of its
21 * contributors may be used to endorse or promote products derived
22 * from this software without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
28 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
29 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
30 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
31 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
34 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
39 * Neon routines for 4-bit Galois fields
/*
 * Multiply a single pair of 4-bit Galois field elements using NEON
 * polynomial (carry-less) multiplication.
 *
 * gf      - field handle; gf->scratch is read as a gf_internal_t to obtain
 *           prim_poly.
 * a4, b4  - the two operands.
 *
 * rv ends up holding lane 0 of the reduced product.
 *
 * NOTE(review): the lines that declare a, b, v, w and rv, that load a and b,
 * and that return the result are not part of this view; a and b are
 * presumably a4 and b4 broadcast across the lanes - confirm.
 */
50 gf_w4_neon_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
53 poly8x8_t result, prim_poly;
56 gf_internal_t * h = gf->scratch;
/* Keep only the low five bits of the primitive polynomial and broadcast
 * them to every lane. */
61 prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1fULL));
63 /* Do the initial multiply */
64 result = vmul_p8 (a, b);
/* One reduction step: v is the part of each product lane above the low
 * nibble, w is v carry-less-multiplied by prim_poly, and XORing w into
 * result cancels those high bits.
 * NOTE(review): a single step leaves the high nibble clear only when
 * prim_poly has no x^2 or x^3 term (e.g. x^4 + x + 1); presumably this path
 * is only selected for such polynomials - confirm. */
65 v = vshr_n_u8 (vreinterpret_u8_p8(result), 4);
66 w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
67 result = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(result), vreinterpret_u8_p8(w)));
69 /* Extracts 32 bit value from result. */
70 rv = (gf_val_32_t)vget_lane_u8 (vreinterpret_u8_p8 (result), 0);
/*
 * Inner kernel for multiplying a region of packed 4-bit field elements by
 * one constant, using NEON polynomial multiplies on 8-byte vectors.
 *
 * gf     - field handle; gf->scratch is read as a gf_internal_t for prim_poly.
 * s8     - source bytes.
 * d8     - destination bytes.
 * val    - the constant multiplier.
 * d_end  - end marker for the destination.
 * xor    - flag passed by the caller as 1 or 0.
 *
 * NOTE(review): the declaration of prim_poly, the setup of a, the loop, the
 * load into b, any use of c / xor, and the store of v are not part of this
 * view. a is presumably val broadcast to all lanes and b a vector of source
 * bytes - confirm.
 */
76 neon_clm_multiply_region_from_single (gf_t *gf, uint8_t *s8, uint8_t *d8,
77 gf_val_32_t val, uint8_t *d_end, int xor)
79 gf_internal_t * h = gf->scratch;
81 poly8x8_t a, w, even, odd;
82 uint8x8_t b, c, v, mask;
/* mask selects the low nibble of every byte. */
85 mask = vdup_n_u8 (0xf);
/* Low five bits of the primitive polynomial, broadcast to every lane. */
86 prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0x1fULL));
/* Each byte of b packs two field elements: even takes the low nibbles,
 * odd takes the high nibbles shifted down. */
91 even = vreinterpret_p8_u8 (vand_u8 (b, mask));
92 odd = vreinterpret_p8_u8 (vshr_n_u8 (b, 4));
/* Carry-less multiply both nibble vectors by a. */
97 even = vmul_p8 (a, even);
98 odd = vmul_p8 (a, odd);
/* Reduction step for the even products: multiply the bits above the low
 * nibble by prim_poly and XOR the result back in. */
100 v = vshr_n_u8 (vreinterpret_u8_p8(even), 4);
101 w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
102 even = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(even), vreinterpret_u8_p8(w)));
/* Same reduction step for the odd products. */
104 v = vshr_n_u8 (vreinterpret_u8_p8(odd), 4);
105 w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
106 odd = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(odd), vreinterpret_u8_p8(w)));
/* Repack: odd results go back to the high nibble, even results stay in the
 * low nibble. */
108 v = veor_u8 (vreinterpret_u8_p8 (even), vshl_n_u8 (vreinterpret_u8_p8 (odd), 4));
/*
 * Region multiply entry point for the carry-less-multiply path: handles the
 * trivial multipliers, sets up region data, and hands the remaining work to
 * neon_clm_multiply_region_from_single.
 *
 * gf     - field handle.
 * src    - source region.
 * dest   - destination region.
 * val    - the constant multiplier.
 * bytes  - region length in bytes.
 * xor    - forwarded to the zero/one helpers and to gf_set_region_data.
 *
 * NOTE(review): the declarations of rd, s8 and d8 and the condition that
 * chooses between the two kernel calls are not part of this view.
 */
122 gf_w4_neon_clm_multiply_region_from_single (gf_t *gf, void *src, void *dest,
123 gf_val_32_t val, int bytes, int xor)
/* Multipliers 0 and 1 are delegated to dedicated helpers and return early. */
129 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
130 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
/* NOTE(review): the trailing 16 is presumably the alignment in bytes used to
 * split the region - confirm against gf_set_region_data. */
132 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
133 gf_do_initial_region_alignment(&rd);
/* The kernel runs from rd.s_start / rd.d_start up to rd.d_top. */
135 s8 = (uint8_t *) rd.s_start;
136 d8 = (uint8_t *) rd.d_start;
/* The kernel is invoked with a literal 1 or 0 as its xor argument;
 * presumably the choice follows this function's xor parameter - confirm. */
139 neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 1);
141 neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 0);
143 gf_do_final_region_alignment(&rd);
/*
 * 16-lane table lookup built from two vtbl2_u8 lookups: the low and high
 * halves of v are each looked up in tbl and the two 8-lane results are
 * recombined with vcombine_u8.
 * v is expanded twice, so do not pass an expression with side effects.
 * NOTE(review): presumably only defined where the native vqtbl1q_u8
 * intrinsic is unavailable; the guarding conditional is not part of this
 * view - confirm.
 */
147 #define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
148 vtbl2_u8(tbl, vget_high_u8(v)))
/*
 * Inner kernel for multiplying a region of packed 4-bit field elements by
 * one constant using table lookups on 16-byte NEON vectors.
 *
 * gf     - field handle; the multiplication table is reached through
 *          gf->scratch->private (struct gf_single_table_data).
 * src    - source bytes.
 * dst    - destination bytes; the loop runs while dst < d_end.
 * d_end  - end marker for the destination.
 * val    - the constant multiplier, used to select the table row.
 * xor    - flag passed by the caller as 1 or 0.
 *
 * NOTE(review): the declarations of base, tl and th, the load into va, the
 * store of r, the pointer increments, any use of xor and the end of the loop
 * are not part of this view.
 */
154 w4_single_table_multiply_region_neon(gf_t *gf, uint8_t *src, uint8_t *dst,
155 uint8_t * d_end, gf_val_32_t val, int xor)
157 struct gf_single_table_data *std;
159 uint8x16_t r, va, vh, vl, loset;
167 std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
/* base is advanced to the row of std->mult that belongs to val.
 * NOTE(review): GF_FIELD_WIDTH is presumably 4, giving 16-byte rows -
 * confirm. */
168 base = (uint8_t *) std->mult;
169 base += (val << GF_FIELD_WIDTH);
/* Two forms of the same table setup follow: tl holds the 16 table bytes at
 * base, and th holds the same bytes shifted left by four so that lookups
 * land in the high nibble. The first form uses one 16-byte register, the
 * second a pair of 8-byte registers.
 * NOTE(review): presumably a preprocessor conditional selects one form; it
 * is not part of this view. */
172 tl = vld1q_u8 (base);
173 th = vshlq_n_u8 (tl, 4);
175 tl.val[0] = vld1_u8 (base);
176 tl.val[1] = vld1_u8 (base + 8);
177 th.val[0] = vshl_n_u8 (tl.val[0], 4);
178 th.val[1] = vshl_n_u8 (tl.val[1], 4);
/* loset selects the low nibble of every byte. */
181 loset = vdupq_n_u8(0xf);
183 while (dst < d_end) {
/* Split each byte of va into its high nibble (vh) and low nibble (vl). */
186 vh = vshrq_n_u8 (va, 4);
187 vl = vandq_u8 (va, loset);
/* Look each nibble up in its table; th already places its results in the
 * high nibble. */
192 vh = vqtbl1q_u8 (th, vh);
193 vl = vqtbl1q_u8 (tl, vl);
/* Merge the two nibble results into one byte per lane. */
195 r = veorq_u8 (vh, vl);
/* NOTE(review): presumably the xor path, with va reloaded from dst just
 * before this line - the surrounding lines are not part of this view. */
198 r = veorq_u8 (va, r);
/*
 * Region multiply entry point for the single-table path: handles the trivial
 * multipliers, sets up region data, and hands the remaining work to
 * w4_single_table_multiply_region_neon.
 *
 * gf     - field handle.
 * src    - source region.
 * dest   - destination region.
 * val    - the constant multiplier.
 * bytes  - region length in bytes.
 * xor    - forwarded to the zero/one helpers and to gf_set_region_data.
 *
 * NOTE(review): the declaration of rd, the assignments to sptr, dptr and
 * top, and the condition that chooses between the two kernel calls are not
 * part of this view.
 */
209 gf_w4_single_table_multiply_region_neon(gf_t *gf, void *src, void *dest,
210 gf_val_32_t val, int bytes, int xor)
213 uint8_t *sptr, *dptr, *top;
/* Multipliers 0 and 1 are delegated to dedicated helpers and return early. */
215 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
216 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
/* NOTE(review): the trailing 16 is presumably the alignment in bytes used to
 * split the region - confirm against gf_set_region_data. */
218 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
219 gf_do_initial_region_alignment(&rd);
/* The kernel is invoked with a literal 1 or 0 as its xor argument;
 * presumably the choice follows this function's xor parameter - confirm. */
226 w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 1);
228 w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 0);
230 gf_do_final_region_alignment(&rd);
/*
 * Installs the NEON carry-less-multiply routines on gf: the w32 multiply
 * slot gets gf_w4_neon_clm_multiply and the w32 multiply_region slot gets
 * gf_w4_neon_clm_multiply_region_from_single.
 *
 * NOTE(review): the returned int and any condition guarding these
 * assignments are not part of this view.
 */
235 int gf_w4_neon_cfm_init(gf_t *gf)
237 // single clm multiplication probably pointless
238 SET_FUNCTION(gf,multiply,w32,gf_w4_neon_clm_multiply)
239 SET_FUNCTION(gf,multiply_region,w32,gf_w4_neon_clm_multiply_region_from_single)
/*
 * Installs the NEON single-table region routine on gf: the w32
 * multiply_region slot gets gf_w4_single_table_multiply_region_neon.
 * No other slot is set in the visible lines.
 */
244 void gf_w4_neon_single_table_init(gf_t *gf)
246 SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region_neon)