IDA_new/gf-complete/src/neon/gf_w16_neon.c

   1 /*
   2  * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
   3  * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
   4  * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
   5  *
   6  * Copyright (c) 2014: Janne Grunau <j@jannau.net>
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  *
  12  *  - Redistributions of source code must retain the above copyright
  13  *     notice, this list of conditions and the following disclaimer.
  14  *
  15  *  - Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in
  17  *    the documentation and/or other materials provided with the
  18  *    distribution.
  19  *
  20  *  - Neither the name of the University of Tennessee nor the names of its
  21  *    contributors may be used to endorse or promote products derived
  22  *    from this software without specific prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  28  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  29  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  31  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  32  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  34  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  35  * POSSIBILITY OF SUCH DAMAGE.
  36  *
  37  *
  38  * gf_w16_neon.c
  39  *
  40  * Neon routines for 16-bit Galois fields
  41  *
  42  */
  43
  44 #include "gf_int.h"
  45 #include <stdio.h>
  46 #include <stdlib.h>
  47 #include "gf_w16.h"
  48
  49 #ifndef ARCH_AARCH64
  50 #define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
  51                                        vtbl2_u8(tbl, vget_high_u8(v)))
  52 #endif
  53
  54 static
  55 inline
  56 void
  57 neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
  58                                  uint16_t *d_end, uint8_t *tbl,
  59                                  gf_val_32_t val, int xor)
  60 {
  61   unsigned i;
  62   uint8_t *high = tbl + 4 * 16;
  63   uint8x16_t loset, rl, rh;
  64   uint8x16x2_t va;
  65
  66 #ifdef ARCH_AARCH64
  67   uint8x16_t tbl_h[4], tbl_l[4];
  68   for (i = 0; i < 4; i++) {
  69       tbl_l[i] = vld1q_u8(tbl + i*16);
  70       tbl_h[i] = vld1q_u8(high + i*16);
  71   }
  72 #else
  73   uint8x8x2_t tbl_h[4], tbl_l[4];
  74   for (i = 0; i < 4; i++) {
  75       tbl_l[i].val[0] = vld1_u8(tbl + i*16);
  76       tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
  77       tbl_h[i].val[0] = vld1_u8(high + i*16);
  78       tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
  79   }
  80 #endif
  81
  82   loset = vdupq_n_u8(0xf);
  83
  84   if (xor) {
  85     uint8x16x2_t vb;
  86     while (dst < d_end) {
  87       va = vld2q_u8((uint8_t*)src);
  88       vb = vld2q_u8((uint8_t*)dst);
  89
  90       rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
  91       rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
  92       rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
  93       rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));
  94
  95       va.val[0] = vshrq_n_u8(va.val[0], 4);
  96       va.val[1] = vshrq_n_u8(va.val[1], 4);
  97
  98       rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
  99       rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
 100       va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
 101       va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
 102
 103       va.val[0] = veorq_u8(va.val[0], vb.val[0]);
 104       va.val[1] = veorq_u8(va.val[1], vb.val[1]);
 105       vst2q_u8((uint8_t*)dst, va);
 106
 107       src += 16;
 108       dst += 16;
 109     }
 110   } else {
 111     while (dst < d_end) {
 112       va = vld2q_u8((uint8_t*)src);
 113
 114       rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
 115       rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
 116       rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
 117       rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));
 118
 119       va.val[0] = vshrq_n_u8(va.val[0], 4);
 120       va.val[1] = vshrq_n_u8(va.val[1], 4);
 121
 122       rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
 123       rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
 124       va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
 125       va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
 126
 127       vst2q_u8((uint8_t*)dst, va);
 128
 129       src += 16;
 130       dst += 16;
 131     }
 132   }
 133 }
 134
 135 static
 136 inline
 137 void
 138 neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
 139                                         uint8_t *dst, uint8_t *d_end,
 140                                         uint8_t *tbl, gf_val_32_t val,
 141                                         int xor)
 142 {
 143   unsigned i;
 144   uint8_t *high = tbl + 4 * 16;
 145   uint8x16_t vh, vl, rh, rl;
 146   uint8x16_t loset;
 147
 148 #ifdef ARCH_AARCH64
 149   uint8x16_t tbl_h[4], tbl_l[4];
 150 #else
 151   uint8x8x2_t tbl_h[4], tbl_l[4];
 152 #endif
 153   for (i = 0; i < 4; i++) {
 154 #ifdef ARCH_AARCH64
 155       tbl_l[i] = vld1q_u8(tbl + i*16);
 156       tbl_h[i] = vld1q_u8(high + i*16);
 157 #else
 158       tbl_l[i].val[0] = vld1_u8(tbl + i*16);
 159       tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
 160       tbl_h[i].val[0] = vld1_u8(high + i*16);
 161       tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
 162 #endif
 163   }
 164
 165   loset = vdupq_n_u8(0xf);
 166
 167   while (dst < d_end) {
 168       vh = vld1q_u8(src);
 169       vl = vld1q_u8(src + 16);
 170
 171       rl = vqtbl1q_u8(tbl_l[0], vandq_u8(vl, loset));
 172       rh = vqtbl1q_u8(tbl_h[0], vandq_u8(vl, loset));
 173       rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(vh, loset)));
 174       rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(vh, loset)));
 175
 176       vl = vshrq_n_u8(vl, 4);
 177       vh = vshrq_n_u8(vh, 4);
 178
 179       rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], vl));
 180       rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], vl));
 181       rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], vh));
 182       rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], vh));
 183
 184       if (xor) {
 185           vh = vld1q_u8(dst);
 186           vl = vld1q_u8(dst + 16);
 187           rh = veorq_u8(rh, vh);
 188           rl = veorq_u8(rl, vl);
 189       }
 190       vst1q_u8(dst, rh);
 191       vst1q_u8(dst + 16, rl);
 192
 193       src += 32;
 194       dst += 32;
 195   }
 196 }
 197
 198
 199
 200 static
 201 inline
 202 void
 203 neon_w16_split_4_16_lazy_multiply_region(gf_t *gf, void *src, void *dest,
 204                                          gf_val_32_t val, int bytes, int xor,
 205                                          int altmap)
 206 {
 207   gf_region_data rd;
 208   unsigned i, j;
 209   uint64_t c, prod;
 210   uint8_t tbl[2 * 4 * 16];
 211   uint8_t *high = tbl + 4 * 16;
 212
 213   if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
 214   if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
 215
 216   for (i = 0; i < 4; i++) {
 217     for (j = 0; j < 16; j++) {
 218       c = (j << (i*4));
 219       prod = gf->multiply.w32(gf, c, val);
 220       tbl[i*16 + j]  = prod & 0xff;
 221       high[i*16 + j] = prod >> 8;
 222     }
 223   }
 224
 225   gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
 226   gf_do_initial_region_alignment(&rd);
 227
 228   if (altmap) {
 229     uint8_t *s8   = rd.s_start;
 230     uint8_t *d8   = rd.d_start;
 231     uint8_t *end8 = rd.d_top;
 232     if (xor)
 233       neon_w16_split_4_altmap_multiply_region(gf, s8, d8, end8, tbl, val, 1);
 234     else
 235       neon_w16_split_4_altmap_multiply_region(gf, s8, d8, end8, tbl, val, 0);
 236   } else {
 237     uint16_t *s16   = rd.s_start;
 238     uint16_t *d16   = rd.d_start;
 239     uint16_t *end16 = rd.d_top;
 240     if (xor)
 241       neon_w16_split_4_multiply_region(gf, s16, d16, end16, tbl, val, 1);
 242     else
 243       neon_w16_split_4_multiply_region(gf, s16, d16, end16, tbl, val, 0);
 244   }
 245
 246   gf_do_final_region_alignment(&rd);
 247 }
 248
 249 static
 250 void
 251 gf_w16_split_4_16_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
 252                                             gf_val_32_t val, int bytes, int xor)
 253 {
 254   neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
 255 }
 256
 257 static
 258 void
 259 gf_w16_split_4_16_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
 260                                                    void *dest,
 261                                                    gf_val_32_t val, int bytes,
 262                                                    int xor)
 263 {
 264   neon_w16_split_4_16_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
 265 }
 266
 267
 268 void gf_w16_neon_split_init(gf_t *gf)
 269 {
 270   gf_internal_t *h = (gf_internal_t *) gf->scratch;
 271
 272   if (h->region_type & GF_REGION_ALTMAP)
 273     SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_altmap_multiply_region_neon)
 274   else
 275     SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region_neon)
 276 }