IDA_new/gf-complete/src/neon/gf_w4_neon.c

   1 /*
   2  * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
   3  * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
   4  * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
   5  *
   6  * Copyright (c) 2014: Janne Grunau <j@jannau.net>
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  *
  12  *  - Redistributions of source code must retain the above copyright
  13  *     notice, this list of conditions and the following disclaimer.
  14  *
  15  *  - Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in
  17  *    the documentation and/or other materials provided with the
  18  *    distribution.
  19  *
  20  *  - Neither the name of the University of Tennessee nor the names of its
  21  *    contributors may be used to endorse or promote products derived
  22  *    from this software without specific prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  28  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  29  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  31  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  32  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  34  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  35  * POSSIBILITY OF SUCH DAMAGE.
  36  *
  37  * gf_w4_neon.c
  38  *
  39  * Neon routines for 4-bit Galois fields
  40  *
  41  */
  42
  43 #include "gf_int.h"
  44 #include <stdio.h>
  45 #include <stdlib.h>
  46 #include "gf_w4.h"
  47
  48 static
  49 gf_val_32_t
  50 gf_w4_neon_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
  51 {
  52   gf_val_32_t rv = 0;
  53   poly8x8_t       result, prim_poly;
  54   poly8x8_t       a, b, w;
  55   uint8x8_t       v;
  56   gf_internal_t * h = gf->scratch;
  57
  58   a =  vdup_n_p8 (a4);
  59   b =  vdup_n_p8 (b4);
  60
  61   prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1fULL));
  62
  63   /* Do the initial multiply */
  64   result = vmul_p8 (a, b);
  65   v = vshr_n_u8 (vreinterpret_u8_p8(result), 4);
  66   w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
  67   result = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(result), vreinterpret_u8_p8(w)));
  68
  69   /* Extracts 32 bit value from result. */
  70   rv = (gf_val_32_t)vget_lane_u8 (vreinterpret_u8_p8 (result), 0);
  71
  72   return rv;
  73 }
  74
  75 static inline void
  76 neon_clm_multiply_region_from_single (gf_t *gf, uint8_t *s8, uint8_t *d8,
  77                                       gf_val_32_t val, uint8_t *d_end, int xor)
  78 {
  79   gf_internal_t * h = gf->scratch;
  80   poly8x8_t       prim_poly;
  81   poly8x8_t       a, w, even, odd;
  82   uint8x8_t       b, c, v, mask;
  83
  84   a         = vdup_n_p8 (val);
  85   mask      = vdup_n_u8 (0xf);
  86   prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0x1fULL));
  87
  88   while (d8 < d_end) {
  89     b = vld1_u8 (s8);
  90
  91     even = vreinterpret_p8_u8 (vand_u8 (b, mask));
  92     odd  = vreinterpret_p8_u8 (vshr_n_u8 (b, 4));
  93
  94     if (xor)
  95         c = vld1_u8 (d8);
  96
  97     even = vmul_p8 (a, even);
  98     odd  = vmul_p8 (a, odd);
  99
 100     v = vshr_n_u8 (vreinterpret_u8_p8(even), 4);
 101     w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
 102     even = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(even), vreinterpret_u8_p8(w)));
 103
 104     v = vshr_n_u8 (vreinterpret_u8_p8(odd), 4);
 105     w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
 106     odd = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(odd), vreinterpret_u8_p8(w)));
 107
 108     v = veor_u8 (vreinterpret_u8_p8 (even), vshl_n_u8 (vreinterpret_u8_p8 (odd), 4));
 109
 110     if (xor)
 111       v = veor_u8 (c, v);
 112
 113     vst1_u8 (d8, v);
 114
 115     d8 += 8;
 116     s8 += 8;
 117   }
 118 }
 119
 120
 121 static void
 122 gf_w4_neon_clm_multiply_region_from_single (gf_t *gf, void *src, void *dest,
 123                                             gf_val_32_t val, int bytes, int xor)
 124 {
 125   gf_region_data rd;
 126   uint8_t *s8;
 127   uint8_t *d8;
 128
 129   if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
 130   if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
 131
 132   gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
 133   gf_do_initial_region_alignment(&rd);
 134
 135   s8 = (uint8_t *) rd.s_start;
 136   d8 = (uint8_t *) rd.d_start;
 137
 138   if (xor)
 139     neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 1);
 140   else
 141     neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 0);
 142
 143   gf_do_final_region_alignment(&rd);
 144 }
 145
 146 #ifndef ARCH_AARCH64
 147 #define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
 148                                        vtbl2_u8(tbl, vget_high_u8(v)))
 149 #endif
 150
 151 static
 152 inline
 153 void
 154 w4_single_table_multiply_region_neon(gf_t *gf, uint8_t *src, uint8_t *dst,
 155                                      uint8_t * d_end, gf_val_32_t val, int xor)
 156 {
 157   struct gf_single_table_data *std;
 158   uint8_t *base;
 159   uint8x16_t r, va, vh, vl, loset;
 160
 161 #ifdef ARCH_AARCH64
 162   uint8x16_t th, tl;
 163 #else
 164   uint8x8x2_t th, tl;
 165 #endif
 166
 167   std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
 168   base = (uint8_t *) std->mult;
 169   base += (val << GF_FIELD_WIDTH);
 170
 171 #ifdef ARCH_AARCH64
 172   tl = vld1q_u8 (base);
 173   th = vshlq_n_u8 (tl, 4);
 174 #else
 175   tl.val[0] = vld1_u8 (base);
 176   tl.val[1] = vld1_u8 (base + 8);
 177   th.val[0] =  vshl_n_u8 (tl.val[0], 4);
 178   th.val[1] =  vshl_n_u8 (tl.val[1], 4);
 179 #endif
 180
 181   loset = vdupq_n_u8(0xf);
 182
 183   while (dst < d_end) {
 184       va = vld1q_u8 (src);
 185
 186       vh = vshrq_n_u8 (va, 4);
 187       vl = vandq_u8 (va, loset);
 188
 189       if (xor)
 190         va = vld1q_u8 (dst);
 191
 192       vh = vqtbl1q_u8 (th, vh);
 193       vl = vqtbl1q_u8 (tl, vl);
 194
 195       r = veorq_u8 (vh, vl);
 196
 197       if (xor)
 198         r = veorq_u8 (va, r);
 199
 200       vst1q_u8 (dst, r);
 201
 202     dst += 16;
 203     src += 16;
 204   }
 205 }
 206
 207 static
 208 void
 209 gf_w4_single_table_multiply_region_neon(gf_t *gf, void *src, void *dest,
 210                                         gf_val_32_t val, int bytes, int xor)
 211 {
 212   gf_region_data rd;
 213   uint8_t *sptr, *dptr, *top;
 214
 215   if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
 216   if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
 217
 218   gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
 219   gf_do_initial_region_alignment(&rd);
 220
 221   sptr = rd.s_start;
 222   dptr = rd.d_start;
 223   top  = rd.d_top;
 224
 225   if (xor)
 226       w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 1);
 227   else
 228       w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 0);
 229
 230   gf_do_final_region_alignment(&rd);
 231
 232 }
 233
 234
 235 int gf_w4_neon_cfm_init(gf_t *gf)
 236 {
 237   // single clm multiplication probably pointless
 238   SET_FUNCTION(gf,multiply,w32,gf_w4_neon_clm_multiply)
 239   SET_FUNCTION(gf,multiply_region,w32,gf_w4_neon_clm_multiply_region_from_single)
 240
 241   return 1;
 242 }
 243
 244 void gf_w4_neon_single_table_init(gf_t *gf)
 245 {
 246   SET_FUNCTION(gf,multiply_region,w32,gf_w4_single_table_multiply_region_neon)
 247 }