/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * Copyright (c) 2014: Janne Grunau <j@jannau.net>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 *  - Neither the name of the University of Tennessee nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * gf_w16_neon.c
 *
 * Neon routines for 16-bit Galois fields
 */

#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#include "gf_w16.h"

/* AArch32 NEON has no vqtbl1q_u8; emulate it with two vtbl2_u8 lookups
   on the low and high halves of the index vector. */
#ifndef ARCH_AARCH64
#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
                                       vtbl2_u8(tbl, vget_high_u8(v)))
#endif
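
/*
 * neon_w16_split_4_multiply_region: multiply a region of 16-bit words
 * by val using the split 4,16 method.  tbl holds eight 16-entry lookup
 * tables: for each of the four 4-bit nibbles of a source word, one
 * table of low product bytes (tbl[0..63]) and one of high product
 * bytes (tbl[64..127]).  Each iteration deinterleaves 16 words with
 * vld2q_u8, looks up all four nibbles with vqtbl1q_u8, and XORs the
 * partial products together.
 */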
static
inline
void
neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
                                 uint16_t *d_end, uint8_t *tbl,
                                 gf_val_32_t val, int xor)
{
  unsigned i;
  uint8_t *high = tbl + 4 * 16;
  uint8x16_t loset, rl, rh;
  uint8x16x2_t va, vb;

  /* Load the eight 16-byte tables into registers once, outside the loop. */
#ifdef ARCH_AARCH64
  uint8x16_t tbl_h[4], tbl_l[4];
  for (i = 0; i < 4; i++) {
      tbl_l[i] = vld1q_u8(tbl + i*16);
      tbl_h[i] = vld1q_u8(high + i*16);
  }
#else
  uint8x8x2_t tbl_h[4], tbl_l[4];
  for (i = 0; i < 4; i++) {
      tbl_l[i].val[0] = vld1_u8(tbl + i*16);
      tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
      tbl_h[i].val[0] = vld1_u8(high + i*16);
      tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
  }
#endif

  loset = vdupq_n_u8(0xf);
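
  /* Two copies of the inner loop: one XORs the 16 products into the
     existing destination words, the other overwrites them. */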
  if (xor) {
    while (dst < d_end) {
      va = vld2q_u8((uint8_t*)src);
      vb = vld2q_u8((uint8_t*)dst);

      rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
      rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));

      va.val[0] = vshrq_n_u8(va.val[0], 4);
      va.val[1] = vshrq_n_u8(va.val[1], 4);

      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
      va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
      va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));

      va.val[0] = veorq_u8(va.val[0], vb.val[0]);
      va.val[1] = veorq_u8(va.val[1], vb.val[1]);
      vst2q_u8((uint8_t*)dst, va);

      src += 16;
      dst += 16;
    }
  } else {
    while (dst < d_end) {
      va = vld2q_u8((uint8_t*)src);

      rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
      rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));

      va.val[0] = vshrq_n_u8(va.val[0], 4);
      va.val[1] = vshrq_n_u8(va.val[1], 4);

      rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
      rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
      va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
      va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));

      vst2q_u8((uint8_t*)dst, va);

      src += 16;
      dst += 16;
    }
  }
}
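
/*
 * ALTMAP variant: the region is laid out in 32-byte chunks holding the
 * high bytes of 16 words followed by their low bytes, so the two
 * halves load with plain vld1q_u8 and no vld2q_u8 deinterleave is
 * needed.
 */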
static
inline
void
neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
                                        uint8_t *dst, uint8_t *d_end,
                                        uint8_t *tbl, gf_val_32_t val,
                                        int xor)
{
  unsigned i;
  uint8_t *high = tbl + 4 * 16;
  uint8x16_t vh, vl, rh, rl;
  uint8x16_t loset;

#ifdef ARCH_AARCH64
  uint8x16_t tbl_h[4], tbl_l[4];
#else
  uint8x8x2_t tbl_h[4], tbl_l[4];
#endif

  for (i = 0; i < 4; i++) {
#ifdef ARCH_AARCH64
      tbl_l[i] = vld1q_u8(tbl + i*16);
      tbl_h[i] = vld1q_u8(high + i*16);
#else
      tbl_l[i].val[0] = vld1_u8(tbl + i*16);
      tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
      tbl_h[i].val[0] = vld1_u8(high + i*16);
      tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
#endif
  }

  loset = vdupq_n_u8(0xf);

  while (dst < d_end) {
    vh = vld1q_u8(src);
    vl = vld1q_u8(src + 16);

    rl = vqtbl1q_u8(tbl_l[0], vandq_u8(vl, loset));
    rh = vqtbl1q_u8(tbl_h[0], vandq_u8(vl, loset));
    rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(vh, loset)));
    rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(vh, loset)));

    vl = vshrq_n_u8(vl, 4);
    vh = vshrq_n_u8(vh, 4);

    rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], vl));
    rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], vl));
    rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], vh));
    rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], vh));

    if (xor) {
      vh = vld1q_u8(dst);
      vl = vld1q_u8(dst + 16);
      rh = veorq_u8(rh, vh);
      rl = veorq_u8(rl, vl);
    }

    vst1q_u8(dst, rh);
    vst1q_u8(dst + 16, rl);

    src += 32;
    dst += 32;
  }
}
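
/*
 * Lazy driver: rebuilds the eight 16-byte nibble tables from the
 * field's scalar multiplier on every call, then runs one of the two
 * region kernels above over the aligned middle of the region.
 */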
static
inline
void
neon_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest,
                                         gf_val_32_t val, int bytes, int xor,
                                         int altmap)
{
  gf_region_data rd;
  unsigned i, j;
  gf_val_32_t c, prod;
  uint8_t tbl[2 * 4 * 16];
  uint8_t *high = tbl + 4 * 16;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  /* Build the split tables: entry j of table i holds the product of
     val and the nibble j placed in bit position 4*i. */
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 16; j++) {
      c = (j << (i*4));
      prod = gf->multiply.w32(gf, c, val);
      tbl[i*16 + j]  = prod & 0xff;
      high[i*16 + j] = prod >> 8;
    }
  }
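
  /* As elsewhere in GF-Complete, the alignment helpers peel off any
     head and tail of the region that is not a multiple of the 32-byte
     chunk size and handle them separately, so the NEON kernels only
     ever see whole chunks. */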
  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
  gf_do_initial_region_alignment(&rd);

  if (altmap) {
    uint8_t *s8   = rd.s_start;
    uint8_t *d8   = rd.d_start;
    uint8_t *end8 = rd.d_top;
    if (xor)
      neon_w16_split_4_altmap_multiply_region(gf, s8, d8, end8, tbl, val, 1);
    else
      neon_w16_split_4_altmap_multiply_region(gf, s8, d8, end8, tbl, val, 0);
  } else {
    uint16_t *s16   = rd.s_start;
    uint16_t *d16   = rd.d_start;
    uint16_t *end16 = rd.d_top;
    if (xor)
      neon_w16_split_4_multiply_region(gf, s16, d16, end16, tbl, val, 1);
    else
      neon_w16_split_4_multiply_region(gf, s16, d16, end16, tbl, val, 0);
  }

  gf_do_final_region_alignment(&rd);
}
static
void
gf_w16_split_4_16_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
                                            gf_val_32_t val, int bytes, int xor)
{
  neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
}

static
void
gf_w16_split_4_16_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
                                                   void *dest,
                                                   gf_val_32_t val, int bytes,
                                                   int xor)
{
  neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
}
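
/*
 * Installed from the w=16 split initialization when NEON is available;
 * picks the ALTMAP kernel when the region type requests the alternate
 * mapping.
 */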
void gf_w16_neon_split_init(gf_t *gf)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  if (h->region_type & GF_REGION_ALTMAP)
    SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_altmap_multiply_region_neon)
  else
    SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region_neon)
}