/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * Copyright (c) 2014: Janne Grunau <j@jannau.net>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 *  - Neither the name of the University of Tennessee nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * gf_w8_neon.c
 *
 * Neon optimized routines for 8-bit Galois fields
 *
 */

#include "gf_int.h"
#include "gf_w8.h"
#include <stdio.h>
#include <stdlib.h>

/* ARM NEON reducing macro for the carry-free multiplication.
 *   vmull_p8 is the carryless multiply operation. Here vshrn_n_u16 shifts
 *   the result to the right by 1 byte. This allows us to multiply
 *   the prim_poly by the leading bits of the result. We then xor the result
 *   of that operation back with the result. */
#define NEON_CFM_REDUCE(v, w, result, prim_poly, initial)               \
  do {                                                                  \
    if (initial)                                                        \
      v = vshrn_n_u16 (vreinterpretq_u16_p16(result), 8);               \
    else                                                                \
      v = veor_u8 (v, vshrn_n_u16 (vreinterpretq_u16_p16(result), 8));  \
    w = vmull_p8 (prim_poly, vreinterpret_p8_u8(v));                    \
    result = vreinterpretq_p16_u16 (veorq_u16 (vreinterpretq_u16_p16(result), \
                                               vreinterpretq_u16_p16(w))); \
  } while (0)
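
/* For reference: the NEON reduction above is the vector analogue of the
 * scalar shift-and-xor multiply sketched below. This helper is purely
 * illustrative and not part of the GF-Complete API; it assumes a 9-bit
 * primitive polynomial (e.g. 0x11d for GF(2^8)) passed in full. */
static inline uint8_t
gf_w8_scalar_mult_sketch (uint8_t a, uint8_t b, uint32_t prim_poly)
{
  uint32_t product = 0;
  int i;

  /* Carryless multiply: xor in a shifted copy of a per set bit of b. */
  for (i = 0; i < 8; i++)
    if (b & (1u << i))
      product ^= (uint32_t) a << i;

  /* Reduce: fold each bit above x^7 back in by xoring a shifted copy of
   * the primitive polynomial; NEON_CFM_REDUCE does the same thing 8 bits
   * at a time via vmull_p8. */
  for (i = 14; i >= 8; i--)
    if (product & (1u << i))
      product ^= prim_poly << (i - 8);

  return (uint8_t) product;
}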

static
inline
gf_val_32_t
gf_w8_neon_clm_multiply_x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8, int x)
{
  gf_val_32_t rv = 0;
  poly8x8_t a, b, prim_poly;
  uint8x8_t v;
  poly16x8_t result, w;
  gf_internal_t * h = gf->scratch;

  a = vdup_n_p8 (a8);
  b = vdup_n_p8 (b8);
  prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1ffULL));

  /* Do the initial multiply */
  result = vmull_p8 (a, b);

  /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only
     have to do the reduction at most twice, because (w-2)/z == 2, where
     z is equal to the number of zeros after the leading 1. */
  NEON_CFM_REDUCE (v, w, result, prim_poly, 1);
  NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
  /* Polynomials with fewer zeros after the leading 1 need extra passes. */
  if (x >= 3)
    NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
  if (x >= 4)
    NEON_CFM_REDUCE (v, w, result, prim_poly, 0);

  /* Extract the 8-bit product from lane 0 of the narrowed result. */
  rv = (gf_val_32_t)vget_lane_u8 (vmovn_u16 (vreinterpretq_u16_p16 (result)), 0);

  return rv;
}
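
/* Illustrative note (not library code): each NEON_CFM_REDUCE pass shrinks
 * the bits overhanging x^7 by at least z positions, where z is the number
 * of zeros after the leading 1 of the primitive polynomial. For the common
 * GF(2^8) polynomial 0x11d = 1 0001 1101, z = 3 and (8-2)/3 = 2 passes
 * suffice, so x == 2. Polynomials with z == 2 or z == 1 need 3 or 4 passes,
 * which the x >= 3 and x >= 4 branches above supply and which
 * gf_w8_neon_cfm_init selects below from the polynomial's low bits. */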

#define CLM_MULTIPLY(x) \
static gf_val_32_t gf_w8_neon_clm_multiply_ ## x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) \
{\
    return gf_w8_neon_clm_multiply_x (gf, a8, b8, x);\
}

CLM_MULTIPLY(2)
CLM_MULTIPLY(3)
CLM_MULTIPLY(4)

static inline void
neon_clm_multiply_region_from_single_x(gf_t *gf, uint8_t *s8, uint8_t *d8,
                                       gf_val_32_t val, uint8_t *d_end,
                                       int xor, int x)
{
  gf_internal_t * h = gf->scratch;
  poly8x8_t a, b, prim_poly;
  uint8x8_t c, v;
  poly16x8_t result, w;

  a         = vdup_n_p8 (val);
  prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0xffULL));

  while (d8 < d_end) {
    b = vld1_p8 ((poly8_t *) s8);

    if (xor)
      c = vld1_u8 (d8);

    result = vmull_p8 (a, b);

    NEON_CFM_REDUCE (v, w, result, prim_poly, 1);
    NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
    if (x >= 3)
      NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
    if (x >= 4)
      NEON_CFM_REDUCE (v, w, result, prim_poly, 0);

    v = vmovn_u16 (vreinterpretq_u16_p16 (result));
    if (xor)
      v = veor_u8 (c, v);
    vst1_u8 (d8, v);

    d8 += 8;
    s8 += 8;
  }
}

#define CLM_MULT_REGION(x)                                              \
static void                                                             \
gf_w8_neon_clm_multiply_region_from_single_ ## x (gf_t *gf, void *src,  \
                                                  void *dest,           \
                                                  gf_val_32_t val, int bytes, \
                                                  int xor)              \
{                                                                       \
  gf_region_data rd;                                                    \
  uint8_t *s8, *d8;                                                     \
                                                                        \
  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }           \
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }       \
                                                                        \
  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);          \
  gf_do_initial_region_alignment(&rd);                                  \
  s8 = (uint8_t *) rd.s_start;                                          \
  d8 = (uint8_t *) rd.d_start;                                          \
                                                                        \
  if (xor)                                                              \
    neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 1, x); \
  else                                                                  \
    neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 0, x); \
  gf_do_final_region_alignment(&rd);                                    \
}

CLM_MULT_REGION(2)
CLM_MULT_REGION(3)
CLM_MULT_REGION(4)
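
/* Note: gf_set_region_data and gf_do_{initial,final}_region_alignment (from
 * gf_int.h) fall back to scalar multiplies for the unaligned head and tail
 * of the buffer, so the NEON loop above only ever sees a 16-byte-aligned
 * region whose length is a multiple of 16. */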

int gf_w8_neon_cfm_init(gf_t *gf)
{
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;

  /* Choose the reduction count from the number of zero bits after the
     leading 1 of the primitive polynomial (see the note above). */
  if ((0xe0 & h->prim_poly) == 0) {
    SET_FUNCTION(gf,multiply,w32,gf_w8_neon_clm_multiply_2)
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_neon_clm_multiply_region_from_single_2)
  } else if ((0xc0 & h->prim_poly) == 0) {
    SET_FUNCTION(gf,multiply,w32,gf_w8_neon_clm_multiply_3)
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_neon_clm_multiply_region_from_single_3)
  } else if ((0x80 & h->prim_poly) == 0) {
    SET_FUNCTION(gf,multiply,w32,gf_w8_neon_clm_multiply_4)
    SET_FUNCTION(gf,multiply_region,w32,gf_w8_neon_clm_multiply_region_from_single_4)
  } else {
    return 0;
  }
  return 1;
}
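
/* AArch32 NEON has no 16-byte table-lookup instruction: vtbl2_u8 indexes a
 * pair of 64-bit table registers and returns 8 bytes. The AArch64 intrinsic
 * vqtbl1q_u8 is therefore emulated below from two such lookups when building
 * for 32-bit ARM (ARCH_AARCH64 is GF-Complete's architecture guard). */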
#ifndef ARCH_AARCH64
#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
                                       vtbl2_u8(tbl, vget_high_u8(v)))
#endif

static
void
gf_w8_split_multiply_region_neon(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint8_t *bh, *bl, *sptr, *dptr;
  uint8x16_t r, va, vh, vl, loset;
#ifdef ARCH_AARCH64
  uint8x16_t mth, mtl;
#else
  uint8x8x2_t mth, mtl;
#endif
  struct gf_w8_half_table_data *htd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) (gf->scratch))->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  /* Each value's half tables are 16 bytes wide; point at val's row. */
  bh = (uint8_t *) htd->high;
  bh += (val << 4);
  bl = (uint8_t *) htd->low;
  bl += (val << 4);

  sptr = rd.s_start;
  dptr = rd.d_start;

#ifdef ARCH_AARCH64
  mth = vld1q_u8 (bh);
  mtl = vld1q_u8 (bl);
#else
  mth.val[0] = vld1_u8 (bh);
  mtl.val[0] = vld1_u8 (bl);
  mth.val[1] = vld1_u8 (bh + 8);
  mtl.val[1] = vld1_u8 (bl + 8);
#endif

  loset = vdupq_n_u8(0xf);

  if (xor) {
    while (sptr < (uint8_t *) rd.s_top) {
      va = vld1q_u8 (sptr);

      vh = vshrq_n_u8 (va, 4);
      vl = vandq_u8 (va, loset);
      va = vld1q_u8 (dptr);

      vh = vqtbl1q_u8 (mth, vh);
      vl = vqtbl1q_u8 (mtl, vl);

      r = veorq_u8 (vh, vl);

      vst1q_u8 (dptr, veorq_u8 (va, r));

      dptr += 16;
      sptr += 16;
    }
  } else {
    while (sptr < (uint8_t *) rd.s_top) {
      va = vld1q_u8 (sptr);

      vh = vshrq_n_u8 (va, 4);
      vl = vandq_u8 (va, loset);

      vh = vqtbl1q_u8 (mth, vh);
      vl = vqtbl1q_u8 (mtl, vl);

      r = veorq_u8 (vh, vl);

      vst1q_u8 (dptr, r);

      dptr += 16;
      sptr += 16;
    }
  }

  gf_do_final_region_alignment(&rd);
}
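
/* Illustrative scalar equivalent of the nibble-table loops above. This is a
 * sketch, not library code; it assumes the gf_w8_half_table_data layout from
 * gf_w8.h, where low[val][n] holds val*n and high[val][n] holds val*(n<<4)
 * in the field. Because multiplication distributes over xor,
 * val*a == val*(a & 0xf0) ^ val*(a & 0x0f), one 16-entry lookup each. */
static inline uint8_t
gf_w8_half_table_mult_sketch (struct gf_w8_half_table_data *htd,
                              uint8_t val, uint8_t a)
{
  return htd->high[val][a >> 4] ^ htd->low[val][a & 0x0f];
}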

void gf_w8_neon_split_init(gf_t *gf)
{
  SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_neon)
}
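
/* Minimal usage sketch (illustrative only; the guard macro
 * GF_W8_NEON_USAGE_EXAMPLE is hypothetical and nothing defines it). Both
 * init routines above are reached through gf_w8.c, so applications simply
 * go through the generic GF-Complete API. */
#ifdef GF_W8_NEON_USAGE_EXAMPLE
int main(void)
{
  gf_t gf;
  /* Keep both buffers 16-byte aligned so the region call stays on the
     SIMD fast path. */
  uint8_t src[64] __attribute__((aligned(16)));
  uint8_t dest[64] __attribute__((aligned(16)));
  int i;

  if (!gf_init_easy(&gf, 8))          /* default GF(2^8) field */
    return 1;

  for (i = 0; i < 64; i++) src[i] = (uint8_t) i;

  printf("5 * 7 = %u\n", gf.multiply.w32(&gf, 5, 7));

  /* dest[i] = 5 * src[i]; the final 0 means overwrite rather than xor. */
  gf.multiply_region.w32(&gf, src, dest, 5, sizeof(src), 0);

  gf_free(&gf, 0);
  return 0;
}
#endif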