/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * Copyright (c) 2014: Janne Grunau <j@jannau.net>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 *  - Neither the name of the University of Tennessee nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * gf_w64_neon.c
 *
 * Neon routines for 64-bit Galois fields
 */

#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#include <arm_neon.h>
#include "gf_w64.h"
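/*
 * These routines implement SPLIT(4,64) multiplication: a 64-bit word is
 * treated as 16 four-bit nibbles, and for a fixed multiplier val we
 * lazily build 16 lookup tables, where table i maps a nibble n to the
 * 64-bit product val * (n << (4*i)) in GF(2^64).  Since addition in a
 * characteristic-2 Galois field is XOR, the full product is the XOR of
 * 16 table lookups:
 *
 *   val * a = tables[0][a_0] ^ tables[1][a_1] ^ ... ^ tables[15][a_15]
 *
 * where a_i is nibble i of a.  NEON's vtbl/vqtbl instructions perform 16
 * independent 4-bit lookups at once, producing one byte per lane, so
 * each 64-bit table is additionally sliced into 8 byte planes (the
 * tables[16][8] arrays below).
 */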
/* AArch32 has no vqtbl1q_u8(); emulate it with two 8-lane vtbl2_u8()
   lookups on the low and high halves of the index vector. */
#ifndef ARCH_AARCH64
#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
                                       vtbl2_u8(tbl, vget_high_u8(v)))
#endif
static
inline
void
neon_w64_split_4_lazy_altmap_multiply_region(gf_t *gf, uint64_t *src,
                                             uint64_t *dst, uint64_t *d_end,
                                             uint64_t val, int xor)
{
  unsigned i, j, k;
  uint8_t btable[16];
#ifdef ARCH_AARCH64
  uint8x16_t tables[16][8];
#else
  uint8x8x2_t tables[16][8];
#endif
  uint8x16_t p[8], mask1, si;

  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;
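  /* Slice the 16 lazily built 64-bit tables into byte planes: after the
     loop below, lane n of tables[i][j] holds byte j of the product
     val * (n << (4*i)), so one vtbl lookup yields output byte j for 16
     nibbles in parallel. */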
  for (i = 0; i < 16; i++) {
    for (j = 0; j < 8; j++) {
      for (k = 0; k < 16; k++) {
        btable[k] = (uint8_t) ld->tables[i][k];
        ld->tables[i][k] >>= 8;
      }
#ifdef ARCH_AARCH64
      tables[i][j] = vld1q_u8(btable);
#else
      tables[i][j].val[0] = vld1_u8(btable);
      tables[i][j].val[1] = vld1_u8(btable + 8);
#endif
    }
  }

  mask1 = vdupq_n_u8(0xf);
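  /* Each iteration consumes 128 bytes (16 64-bit words).  In the ALTMAP
     layout the region is already stored transposed, byte plane by byte
     plane, so the 8 vectors loaded from src are byte planes and no
     runtime shuffling is needed. */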
  while (dst < d_end) {
    if (xor) {
      for (i = 0; i < 8; i++)
        p[i] = vld1q_u8((uint8_t *) (dst + i * 2));
    } else {
      for (i = 0; i < 8; i++)
        p[i] = vdupq_n_u8(0);
    }

    for (k = 0; k < 8; k++) {
      uint8x16_t v0 = vld1q_u8((uint8_t *) src);
      src += 2;

      /* Byte plane k carries nibbles 2k (low) and 2k+1 (high), so the
         tables must be indexed by nibble position, not by a stale loop
         counter. */
      si = vandq_u8(v0, mask1);
      for (j = 0; j < 8; j++) {
        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[k*2][j], si));
      }

      si = vshrq_n_u8(v0, 4);
      for (j = 0; j < 8; j++) {
        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[k*2+1][j], si));
      }
    }

    for (i = 0; i < 8; i++) {
      vst1q_u8((uint8_t *) dst, p[i]);
      dst += 2;
    }
  }
}
static
inline
void
neon_w64_split_4_lazy_multiply_region(gf_t *gf, uint64_t *src, uint64_t *dst,
                                      uint64_t *d_end, uint64_t val, int xor)
{
  unsigned i, j, k;
  uint8_t btable[16];
#ifdef ARCH_AARCH64
  uint8x16_t tables[16][8];
#else
  uint8x8x2_t tables[16][8];
#endif
  uint8x16_t p[8], mask1, si;
  uint64x2_t st[8];
  uint32x4x2_t s32[4];
  uint16x8x2_t s16[4];
  uint8x16x2_t s8[4];

  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;
  for (i = 0; i < 16; i++) {
    for (j = 0; j < 8; j++) {
      for (k = 0; k < 16; k++) {
        btable[k] = (uint8_t) ld->tables[i][k];
        ld->tables[i][k] >>= 8;
      }
#ifdef ARCH_AARCH64
      tables[i][j] = vld1q_u8(btable);
#else
      tables[i][j].val[0] = vld1_u8(btable);
      tables[i][j].val[1] = vld1_u8(btable + 8);
#endif
    }
  }

  mask1 = vdupq_n_u8(0xf);
  while (dst < d_end) {
    /* Load 16 64-bit source words (128 bytes) and clear the accumulators. */
    for (k = 0; k < 8; k++) {
      st[k] = vld1q_u64(src);
      src += 2;
      p[k] = vdupq_n_u8(0);
    }
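    /* Transpose the words into byte planes with three rounds of vuzp:
       64 -> 32 -> 16 -> 8 bit unzips leave s8[k >> 1].val[k & 1] holding
       byte k of all 16 source words. */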
    s32[0] = vuzpq_u32(vreinterpretq_u32_u64(st[0]),
                       vreinterpretq_u32_u64(st[1]));
    s32[1] = vuzpq_u32(vreinterpretq_u32_u64(st[2]),
                       vreinterpretq_u32_u64(st[3]));
    s32[2] = vuzpq_u32(vreinterpretq_u32_u64(st[4]),
                       vreinterpretq_u32_u64(st[5]));
    s32[3] = vuzpq_u32(vreinterpretq_u32_u64(st[6]),
                       vreinterpretq_u32_u64(st[7]));

    s16[0] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[0]),
                       vreinterpretq_u16_u32(s32[1].val[0]));
    s16[1] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[0]),
                       vreinterpretq_u16_u32(s32[3].val[0]));
    s16[2] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[1]),
                       vreinterpretq_u16_u32(s32[1].val[1]));
    s16[3] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[1]),
                       vreinterpretq_u16_u32(s32[3].val[1]));

    s8[0] = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[0]),
                     vreinterpretq_u8_u16(s16[1].val[0]));
    s8[1] = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[1]),
                     vreinterpretq_u8_u16(s16[1].val[1]));
    s8[2] = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[0]),
                     vreinterpretq_u8_u16(s16[3].val[0]));
    s8[3] = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[1]),
                     vreinterpretq_u8_u16(s16[3].val[1]));
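    /* Table lookups: byte plane k supplies nibbles 2k (low) and 2k+1
       (high); XOR the per-plane products into the 8 output byte planes. */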
    for (k = 0; k < 8; k++) {
      si = vandq_u8(s8[k >> 1].val[k & 1], mask1);
      for (j = 0; j < 8; j++) {
        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[k*2][j], si));
      }

      si = vshrq_n_u8(s8[k >> 1].val[k & 1], 4);
      for (j = 0; j < 8; j++) {
        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[k*2+1][j], si));
      }
    }
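    /* Transpose back: three rounds of vzip reassemble the 8 result byte
       planes into 16 contiguous 64-bit product words. */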
    s8[0] = vzipq_u8(p[0], p[1]);
    s8[1] = vzipq_u8(p[2], p[3]);
    s8[2] = vzipq_u8(p[4], p[5]);
    s8[3] = vzipq_u8(p[6], p[7]);

    s16[0] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[0]),
                       vreinterpretq_u16_u8(s8[1].val[0]));
    s16[1] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[0]),
                       vreinterpretq_u16_u8(s8[3].val[0]));
    s16[2] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[1]),
                       vreinterpretq_u16_u8(s8[1].val[1]));
    s16[3] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[1]),
                       vreinterpretq_u16_u8(s8[3].val[1]));

    s32[0] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[0]),
                       vreinterpretq_u32_u16(s16[1].val[0]));
    s32[1] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[1]),
                       vreinterpretq_u32_u16(s16[1].val[1]));
    s32[2] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[0]),
                       vreinterpretq_u32_u16(s16[3].val[0]));
    s32[3] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[1]),
                       vreinterpretq_u32_u16(s16[3].val[1]));
    for (k = 0; k < 8; k++) {
      st[k] = vreinterpretq_u64_u32(s32[k >> 1].val[k & 1]);
    }

    if (xor) {
      for (i = 0; i < 8; i++) {
        uint64x2_t t1 = vld1q_u64(dst);
        vst1q_u64(dst, veorq_u64(st[i], t1));
        dst += 2;
      }
    } else {
      for (i = 0; i < 8; i++) {
        vst1q_u64(dst, st[i]);
        dst += 2;
      }
    }
  }
}
static
void
gf_w64_neon_split_4_lazy_multiply_region(gf_t *gf, void *src, void *dest,
                                         uint64_t val, int bytes, int xor,
                                         int altmap)
{
  gf_internal_t *h;
  int i, j, k;
  uint64_t pp, v, *s64, *d64, *top;
  struct gf_split_4_64_lazy_data *ld;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
  gf_do_initial_region_alignment(&rd);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;
  ld = (struct gf_split_4_64_lazy_data *) h->private;

  v = val;
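  /* Lazily build the 16 nibble tables for this val.  For each nibble
     position i, entries are filled in doubling steps: v walks through
     val * x^(4*i + log2(j)), and entry (k^j) is v XOR entry k, so
     tables[i][n] ends up as val * (n << (4*i)) in GF(2^64). */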
  for (i = 0; i < 16; i++) {
    ld->tables[i][0] = 0;
    for (j = 1; j < 16; j <<= 1) {
      for (k = 0; k < j; k++) {
        ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
      }
      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
    }
  }

  if (altmap) {
    if (xor)
      neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 1);
    else
      neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 0);
  } else {
    if (xor)
      neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 1);
    else
      neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 0);
  }

  gf_do_final_region_alignment(&rd);
}
static
void
gf_w64_split_4_64_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
                                            uint64_t val, int bytes, int xor)
{
  gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
}
static
void
gf_w64_split_4_64_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
                                                   void *dest, uint64_t val,
                                                   int bytes, int xor)
{
  gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
}
void gf_w64_neon_split_init(gf_t *gf)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  if (h->region_type & GF_REGION_ALTMAP)
    SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_altmap_multiply_region_neon)
  else
    SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region_neon)
}
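/* Usage sketch (illustrative, not part of this file): callers reach these
 * routines through the generic gf-complete dispatch.  Assuming the
 * gf_init_hard() prototype from gf_complete.h, a SPLIT(4,64) field with
 * the ALTMAP layout could be set up roughly as:
 *
 *   gf_t gf;
 *   gf_init_hard(&gf, 64, GF_MULT_SPLIT_TABLE, GF_REGION_ALTMAP,
 *                GF_DIVIDE_DEFAULT, 0, 64, 4, NULL, NULL);
 *   gf.multiply_region.w64(&gf, src, dst, val, bytes, 1);  // XOR into dst
 *
 * The exact argument values here are an assumption; consult the
 * GF-Complete manual before relying on them.
 */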