stc/exp/ml_stc_linux_make_v1.0/ml_stc_src/sse_mathfun.h

   1 /* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
   2
   3    Inspired by Intel Approximate Math library, and based on the
   4    corresponding algorithms of the cephes math library
   5
   6    The default is to use the SSE1 version. If you define USE_SSE2 the
   7    the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
   8    not expect any significant performance improvement with SSE2.
   9 */
  10
  11 /* Copyright (C) 2007  Julien Pommier
  12
  13   This software is provided 'as-is', without any express or implied
  14   warranty.  In no event will the authors be held liable for any damages
  15   arising from the use of this software.
  16
  17   Permission is granted to anyone to use this software for any purpose,
  18   including commercial applications, and to alter it and redistribute it
  19   freely, subject to the following restrictions:
  20
  21   1. The origin of this software must not be misrepresented; you must not
  22      claim that you wrote the original software. If you use this software
  23      in a product, an acknowledgment in the product documentation would be
  24      appreciated but is not required.
  25   2. Altered source versions must be plainly marked as such, and must not be
  26      misrepresented as being the original software.
  27   3. This notice may not be removed or altered from any source distribution.
  28
  29   (this is the zlib license)
  30 */
  31
  32 #include <xmmintrin.h>
  33
  34 /* yes I know, the top of this file is quite ugly */
  35
  36 #define USE_SSE2 // use SSE2 version
  37
  38 #ifdef _MSC_VER /* visual c++ */
  39 # define ALIGN16_BEG __declspec(align(16))
  40 # define ALIGN16_END
  41 #else /* gcc or icc */
  42 # define ALIGN16_BEG
  43 # define ALIGN16_END __attribute__((aligned(16)))
  44 #endif
  45
  46 /* __m128 is ugly to write */
  47 typedef __m128 v4sf;  // vector of 4 float (sse1)
  48
  49 #ifdef USE_SSE2
  50 # include <emmintrin.h>
  51 typedef __m128i v4si; // vector of 4 int (sse2)
  52 #else
  53 typedef __m64 v2si;   // vector of 2 int (mmx)
  54 #endif
  55
  56 /* declare some SSE constants -- why can't I figure a better way to do that? */
  57 #define _PS_CONST(Name, Val)                                            \
  58   static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
  59 #define _PI32_CONST(Name, Val)                                            \
  60   static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
  61 #define _PS_CONST_TYPE(Name, Type, Val)                                 \
  62   static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
  63
  64 _PS_CONST(1  , 1.0f);
  65 _PS_CONST(0p5, 0.5f);
  66 /* the smallest non denormalized float number */
  67 _PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
  68 _PS_CONST_TYPE(mant_mask, int, 0x7f800000);
  69 _PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
  70
  71 _PS_CONST_TYPE(sign_mask, int, 0x80000000);
  72 _PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
  73
  74 _PI32_CONST(1, 1);
  75 _PI32_CONST(inv1, ~1);
  76 _PI32_CONST(2, 2);
  77 _PI32_CONST(4, 4);
  78 _PI32_CONST(0x7f, 0x7f);
  79
  80 _PS_CONST(cephes_SQRTHF, 0.707106781186547524);
  81 _PS_CONST(cephes_log_p0, 7.0376836292E-2);
  82 _PS_CONST(cephes_log_p1, - 1.1514610310E-1);
  83 _PS_CONST(cephes_log_p2, 1.1676998740E-1);
  84 _PS_CONST(cephes_log_p3, - 1.2420140846E-1);
  85 _PS_CONST(cephes_log_p4, + 1.4249322787E-1);
  86 _PS_CONST(cephes_log_p5, - 1.6668057665E-1);
  87 _PS_CONST(cephes_log_p6, + 2.0000714765E-1);
  88 _PS_CONST(cephes_log_p7, - 2.4999993993E-1);
  89 _PS_CONST(cephes_log_p8, + 3.3333331174E-1);
  90 _PS_CONST(cephes_log_q1, -2.12194440e-4);
  91 _PS_CONST(cephes_log_q2, 0.693359375);
  92
  93 #if defined (__MINGW32__)
  94
  95 /* the ugly part below: many versions of gcc used to be completely buggy with respect to some intrinsics
  96    The movehl_ps is fixed in mingw 3.4.5, but I found out that all the _mm_cmp* intrinsics were completely
  97    broken on my mingw gcc 3.4.5 ...
  98
  99    Note that the bug on _mm_cmp* does occur only at -O0 optimization level
 100 */
 101
 102 inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
 103         asm (
 104                         "movhlps %2,%0\n\t"
 105                         : "=x" (a)
 106                         : "0" (a), "x"(b)
 107             );
 108         return a;                                 }
 109 #warning "redefined _mm_movehl_ps (see gcc bug 21179)"
 110 #define _mm_movehl_ps my_movehl_ps
 111
 112 inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
 113         asm (
 114                         "cmpltps %2,%0\n\t"
 115                         : "=x" (a)
 116                         : "0" (a), "x"(b)
 117             );
 118         return a;
 119                   }
 120 inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
 121         asm (
 122                         "cmpnleps %2,%0\n\t"
 123                         : "=x" (a)
 124                         : "0" (a), "x"(b)
 125             );
 126         return a;
 127 }
 128 inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
 129         asm (
 130                         "cmpeqps %2,%0\n\t"
 131                         : "=x" (a)
 132                         : "0" (a), "x"(b)
 133             );
 134         return a;
 135 }
 136 #warning "redefined _mm_cmpxx_ps functions..."
 137 #define _mm_cmplt_ps my_cmplt_ps
 138 #define _mm_cmpgt_ps my_cmpgt_ps
 139 #define _mm_cmpeq_ps my_cmpeq_ps
 140 #endif
 141
 142 #ifndef USE_SSE2
 143 typedef union xmm_mm_union {
 144   __m128 xmm;
 145   __m64 mm[2];
 146 } xmm_mm_union;
 147
 148 #define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
 149     xmm_mm_union u; u.xmm = xmm_;                   \
 150     mm0_ = u.mm[0];                                 \
 151     mm1_ = u.mm[1];                                 \
 152 }
 153
 154 #define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
 155     xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
 156   }
 157
 158 #endif // USE_SSE2
 159
 160 /* natural logarithm computed for 4 simultaneous float
 161    return NaN for x <= 0
 162 */
 163 v4sf log_ps(v4sf x) {
 164 #ifdef USE_SSE2
 165   v4si emm0;
 166 #else
 167   v2si mm0, mm1;
 168 #endif
 169   v4sf one = *(v4sf*)_ps_1;
 170
 171   v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
 172
 173   x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */
 174
 175 #ifndef USE_SSE2
 176   /* part 1: x = frexpf(x, &e); */
 177   COPY_XMM_TO_MM(x, mm0, mm1);
 178   mm0 = _mm_srli_pi32(mm0, 23);
 179   mm1 = _mm_srli_pi32(mm1, 23);
 180 #else
 181   emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
 182 #endif
 183   /* keep only the fractional part */
 184   x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
 185   x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
 186
 187 #ifndef USE_SSE2
 188   /* now e=mm0:mm1 contain the really base-2 exponent */
 189   mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
 190   mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
 191   v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
 192   _mm_empty(); /* bye bye mmx */
 193 #else
 194   emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
 195   v4sf e = _mm_cvtepi32_ps(emm0);
 196 #endif
 197
 198   e = _mm_add_ps(e, one);
 199
 200   /* part2:
 201      if( x < SQRTHF ) {
 202        e -= 1;
 203        x = x + x - 1.0;
 204      } else { x = x - 1.0; }
 205   */
 206   v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
 207   v4sf tmp = _mm_and_ps(x, mask);
 208   x = _mm_sub_ps(x, one);
 209   e = _mm_sub_ps(e, _mm_and_ps(one, mask));
 210   x = _mm_add_ps(x, tmp);
 211
 212
 213   v4sf z = _mm_mul_ps(x,x);
 214
 215   v4sf y = *(v4sf*)_ps_cephes_log_p0;
 216   y = _mm_mul_ps(y, x);
 217   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
 218   y = _mm_mul_ps(y, x);
 219   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
 220   y = _mm_mul_ps(y, x);
 221   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
 222   y = _mm_mul_ps(y, x);
 223   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
 224   y = _mm_mul_ps(y, x);
 225   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
 226   y = _mm_mul_ps(y, x);
 227   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
 228   y = _mm_mul_ps(y, x);
 229   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
 230   y = _mm_mul_ps(y, x);
 231   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
 232   y = _mm_mul_ps(y, x);
 233
 234   y = _mm_mul_ps(y, z);
 235
 236
 237   tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
 238   y = _mm_add_ps(y, tmp);
 239
 240
 241   tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
 242   y = _mm_sub_ps(y, tmp);
 243
 244   tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
 245   x = _mm_add_ps(x, y);
 246   x = _mm_add_ps(x, tmp);
 247   x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
 248   return x;
 249 }
 250
 251 _PS_CONST(exp_hi,       88.3762626647949f);
 252 _PS_CONST(exp_lo,       -88.3762626647949f);
 253
 254 _PS_CONST(cephes_LOG2EF, 1.44269504088896341);
 255 _PS_CONST(cephes_exp_C1, 0.693359375);
 256 _PS_CONST(cephes_exp_C2, -2.12194440e-4);
 257
 258 _PS_CONST(cephes_exp_p0, 1.9875691500E-4);
 259 _PS_CONST(cephes_exp_p1, 1.3981999507E-3);
 260 _PS_CONST(cephes_exp_p2, 8.3334519073E-3);
 261 _PS_CONST(cephes_exp_p3, 4.1665795894E-2);
 262 _PS_CONST(cephes_exp_p4, 1.6666665459E-1);
 263 _PS_CONST(cephes_exp_p5, 5.0000001201E-1);
 264
 265 v4sf exp_ps(v4sf x) {
 266   v4sf tmp = _mm_setzero_ps(), fx;
 267 #ifdef USE_SSE2
 268   v4si emm0;
 269 #else
 270   v2si mm0, mm1;
 271 #endif
 272   v4sf one = *(v4sf*)_ps_1;
 273
 274   x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
 275   x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
 276
 277   /* express exp(x) as exp(g + n*log(2)) */
 278   fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
 279   fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
 280
 281   /* how to perform a floorf with SSE: just below */
 282 #ifndef USE_SSE2
 283   /* step 1 : cast to int */
 284   tmp = _mm_movehl_ps(tmp, fx);
 285   mm0 = _mm_cvttps_pi32(fx);
 286   mm1 = _mm_cvttps_pi32(tmp);
 287   /* step 2 : cast back to float */
 288   tmp = _mm_cvtpi32x2_ps(mm0, mm1);
 289 #else
 290   emm0 = _mm_cvttps_epi32(fx);
 291   tmp  = _mm_cvtepi32_ps(emm0);
 292 #endif
 293   /* if greater, substract 1 */
 294   v4sf mask = _mm_cmpgt_ps(tmp, fx);
 295   mask = _mm_and_ps(mask, one);
 296   fx = _mm_sub_ps(tmp, mask);
 297
 298   tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
 299   v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
 300   x = _mm_sub_ps(x, tmp);
 301   x = _mm_sub_ps(x, z);
 302
 303   z = _mm_mul_ps(x,x);
 304
 305   v4sf y = *(v4sf*)_ps_cephes_exp_p0;
 306   y = _mm_mul_ps(y, x);
 307   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
 308   y = _mm_mul_ps(y, x);
 309   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
 310   y = _mm_mul_ps(y, x);
 311   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
 312   y = _mm_mul_ps(y, x);
 313   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
 314   y = _mm_mul_ps(y, x);
 315   y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
 316   y = _mm_mul_ps(y, z);
 317   y = _mm_add_ps(y, x);
 318   y = _mm_add_ps(y, one);
 319
 320   /* build 2^n */
 321 #ifndef USE_SSE2
 322   z = _mm_movehl_ps(z, fx);
 323   mm0 = _mm_cvttps_pi32(fx);
 324   mm1 = _mm_cvttps_pi32(z);
 325   mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
 326   mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
 327   mm0 = _mm_slli_pi32(mm0, 23);
 328   mm1 = _mm_slli_pi32(mm1, 23);
 329
 330   v4sf pow2n;
 331   COPY_MM_TO_XMM(mm0, mm1, pow2n);
 332   _mm_empty();
 333 #else
 334   emm0 = _mm_cvttps_epi32(fx);
 335   emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
 336   emm0 = _mm_slli_epi32(emm0, 23);
 337   v4sf pow2n = _mm_castsi128_ps(emm0);
 338 #endif
 339   y = _mm_mul_ps(y, pow2n);
 340   return y;
 341 }
 342
 343 _PS_CONST(minus_cephes_DP1, -0.78515625);
 344 _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
 345 _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
 346 _PS_CONST(sincof_p0, -1.9515295891E-4);
 347 _PS_CONST(sincof_p1,  8.3321608736E-3);
 348 _PS_CONST(sincof_p2, -1.6666654611E-1);
 349 _PS_CONST(coscof_p0,  2.443315711809948E-005);
 350 _PS_CONST(coscof_p1, -1.388731625493765E-003);
 351 _PS_CONST(coscof_p2,  4.166664568298827E-002);
 352 _PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
 353
 354
 355 /* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
 356    it runs also on old athlons XPs and the pentium III of your grand
 357    mother.
 358
 359    The code is the exact rewriting of the cephes sinf function.
 360    Precision is excellent as long as x < 8192 (I did not bother to
 361    take into account the special handling they have for greater values
 362    -- it does not return garbage for arguments over 8192, though, but
 363    the extra precision is missing).
 364
 365    Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
 366    surprising but correct result.
 367
 368    Performance is also surprisingly good, 1.33 times faster than the
 369    macos vsinf SSE2 function, and 1.5 times faster than the
 370    __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
 371    too bad for an SSE1 function (with no special tuning) !
 372    However the latter libraries probably have a much better handling of NaN,
 373    Inf, denormalized and other special arguments..
 374
 375    On my core 1 duo, the execution of this function takes approximately 95 cycles.
 376
 377    From what I have observed on the experiments with Intel AMath lib, switching to an
 378    SSE2 version would improve the perf by only 10%.
 379
 380    Since it is based on SSE intrinsics, it has to be compiled at -O2 to
 381    deliver full speed.
 382 */
 383 v4sf sin_ps(v4sf x) { // any x
 384   v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
 385
 386 #ifdef USE_SSE2
 387   v4si emm0, emm2;
 388 #else
 389   v2si mm0, mm1, mm2, mm3;
 390 #endif
 391   sign_bit = x;
 392   /* take the absolute value */
 393   x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
 394   /* extract the sign bit (upper one) */
 395   sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
 396
 397   /* scale by 4/Pi */
 398   y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
 399
 400   //printf("plop:"); print4(y);
 401 #ifdef USE_SSE2
 402   /* store the integer part of y in mm0 */
 403   emm2 = _mm_cvttps_epi32(y);
 404   /* j=(j+1) & (~1) (see the cephes sources) */
 405   emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
 406   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
 407   y = _mm_cvtepi32_ps(emm2);
 408   /* get the swap sign flag */
 409   emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
 410   emm0 = _mm_slli_epi32(emm0, 29);
 411   /* get the polynom selection mask
 412      there is one polynom for 0 <= x <= Pi/4
 413      and another one for Pi/4<x<=Pi/2
 414
 415      Both branches will be computed.
 416   */
 417   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
 418   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
 419
 420   v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
 421   v4sf poly_mask = _mm_castsi128_ps(emm2);
 422   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
 423 #else
 424   /* store the integer part of y in mm0:mm1 */
 425   xmm2 = _mm_movehl_ps(xmm2, y);
 426   mm2 = _mm_cvttps_pi32(y);
 427   mm3 = _mm_cvttps_pi32(xmm2);
 428   /* j=(j+1) & (~1) (see the cephes sources) */
 429   mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
 430   mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
 431   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
 432   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
 433   y = _mm_cvtpi32x2_ps(mm2, mm3);
 434   /* get the swap sign flag */
 435   mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
 436   mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
 437   mm0 = _mm_slli_pi32(mm0, 29);
 438   mm1 = _mm_slli_pi32(mm1, 29);
 439   /* get the polynom selection mask */
 440   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
 441   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
 442   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
 443   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
 444   v4sf swap_sign_bit, poly_mask;
 445   COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
 446   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
 447   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
 448   _mm_empty(); /* good-bye mmx */
 449 #endif
 450
 451   /* The magic pass: "Extended precision modular arithmetic"
 452      x = ((x - y * DP1) - y * DP2) - y * DP3; */
 453   xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
 454   xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
 455   xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
 456   xmm1 = _mm_mul_ps(y, xmm1);
 457   xmm2 = _mm_mul_ps(y, xmm2);
 458   xmm3 = _mm_mul_ps(y, xmm3);
 459   x = _mm_add_ps(x, xmm1);
 460   x = _mm_add_ps(x, xmm2);
 461   x = _mm_add_ps(x, xmm3);
 462
 463   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
 464   y = *(v4sf*)_ps_coscof_p0;
 465   v4sf z = _mm_mul_ps(x,x);
 466
 467   y = _mm_mul_ps(y, z);
 468   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
 469   y = _mm_mul_ps(y, z);
 470   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
 471   y = _mm_mul_ps(y, z);
 472   y = _mm_mul_ps(y, z);
 473   v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
 474   y = _mm_sub_ps(y, tmp);
 475   y = _mm_add_ps(y, *(v4sf*)_ps_1);
 476
 477   /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
 478
 479   v4sf y2 = *(v4sf*)_ps_sincof_p0;
 480   y2 = _mm_mul_ps(y2, z);
 481   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
 482   y2 = _mm_mul_ps(y2, z);
 483   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
 484   y2 = _mm_mul_ps(y2, z);
 485   y2 = _mm_mul_ps(y2, x);
 486   y2 = _mm_add_ps(y2, x);
 487
 488   /* select the correct result from the two polynoms */
 489   xmm3 = poly_mask;
 490   y2 = _mm_and_ps(xmm3, y2); //, xmm3);
 491   y = _mm_andnot_ps(xmm3, y);
 492   y = _mm_add_ps(y,y2);
 493   /* update the sign */
 494   y = _mm_xor_ps(y, sign_bit);
 495
 496   return y;
 497 }
 498
 499 /* almost the same as sin_ps */
 500 v4sf cos_ps(v4sf x) { // any x
 501   v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
 502 #ifdef USE_SSE2
 503   v4si emm0, emm2;
 504 #else
 505   v2si mm0, mm1, mm2, mm3;
 506 #endif
 507   /* take the absolute value */
 508   x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
 509
 510   /* scale by 4/Pi */
 511   y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
 512
 513 #ifdef USE_SSE2
 514   /* store the integer part of y in mm0 */
 515   emm2 = _mm_cvttps_epi32(y);
 516   /* j=(j+1) & (~1) (see the cephes sources) */
 517   emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
 518   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
 519   y = _mm_cvtepi32_ps(emm2);
 520
 521   emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
 522
 523   /* get the swap sign flag */
 524   emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
 525   emm0 = _mm_slli_epi32(emm0, 29);
 526   /* get the polynom selection mask */
 527   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
 528   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
 529
 530   v4sf sign_bit = _mm_castsi128_ps(emm0);
 531   v4sf poly_mask = _mm_castsi128_ps(emm2);
 532 #else
 533   /* store the integer part of y in mm0:mm1 */
 534   xmm2 = _mm_movehl_ps(xmm2, y);
 535   mm2 = _mm_cvttps_pi32(y);
 536   mm3 = _mm_cvttps_pi32(xmm2);
 537
 538   /* j=(j+1) & (~1) (see the cephes sources) */
 539   mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
 540   mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
 541   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
 542   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
 543
 544   y = _mm_cvtpi32x2_ps(mm2, mm3);
 545
 546
 547   mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
 548   mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
 549
 550   /* get the swap sign flag in mm0:mm1 and the
 551      polynom selection mask in mm2:mm3 */
 552
 553   mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
 554   mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
 555   mm0 = _mm_slli_pi32(mm0, 29);
 556   mm1 = _mm_slli_pi32(mm1, 29);
 557
 558   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
 559   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
 560
 561   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
 562   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
 563
 564   v4sf sign_bit, poly_mask;
 565   COPY_MM_TO_XMM(mm0, mm1, sign_bit);
 566   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
 567   _mm_empty(); /* good-bye mmx */
 568 #endif
 569   /* The magic pass: "Extended precision modular arithmetic"
 570      x = ((x - y * DP1) - y * DP2) - y * DP3; */
 571   xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
 572   xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
 573   xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
 574   xmm1 = _mm_mul_ps(y, xmm1);
 575   xmm2 = _mm_mul_ps(y, xmm2);
 576   xmm3 = _mm_mul_ps(y, xmm3);
 577   x = _mm_add_ps(x, xmm1);
 578   x = _mm_add_ps(x, xmm2);
 579   x = _mm_add_ps(x, xmm3);
 580
 581   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
 582   y = *(v4sf*)_ps_coscof_p0;
 583   v4sf z = _mm_mul_ps(x,x);
 584
 585   y = _mm_mul_ps(y, z);
 586   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
 587   y = _mm_mul_ps(y, z);
 588   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
 589   y = _mm_mul_ps(y, z);
 590   y = _mm_mul_ps(y, z);
 591   v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
 592   y = _mm_sub_ps(y, tmp);
 593   y = _mm_add_ps(y, *(v4sf*)_ps_1);
 594
 595   /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
 596
 597   v4sf y2 = *(v4sf*)_ps_sincof_p0;
 598   y2 = _mm_mul_ps(y2, z);
 599   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
 600   y2 = _mm_mul_ps(y2, z);
 601   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
 602   y2 = _mm_mul_ps(y2, z);
 603   y2 = _mm_mul_ps(y2, x);
 604   y2 = _mm_add_ps(y2, x);
 605
 606   /* select the correct result from the two polynoms */
 607   xmm3 = poly_mask;
 608   y2 = _mm_and_ps(xmm3, y2); //, xmm3);
 609   y = _mm_andnot_ps(xmm3, y);
 610   y = _mm_add_ps(y,y2);
 611   /* update the sign */
 612   y = _mm_xor_ps(y, sign_bit);
 613
 614   return y;
 615 }
 616
 617 /* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
 618    it is almost as fast, and gives you a free cosine with your sine */
 619 void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
 620   v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
 621 #ifdef USE_SSE2
 622   v4si emm0, emm2, emm4;
 623 #else
 624   v2si mm0, mm1, mm2, mm3, mm4, mm5;
 625 #endif
 626   sign_bit_sin = x;
 627   /* take the absolute value */
 628   x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
 629   /* extract the sign bit (upper one) */
 630   sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
 631
 632   /* scale by 4/Pi */
 633   y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
 634
 635 #ifdef USE_SSE2
 636   /* store the integer part of y in emm2 */
 637   emm2 = _mm_cvttps_epi32(y);
 638
 639   /* j=(j+1) & (~1) (see the cephes sources) */
 640   emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
 641   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
 642   y = _mm_cvtepi32_ps(emm2);
 643
 644   emm4 = emm2;
 645
 646   /* get the swap sign flag for the sine */
 647   emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
 648   emm0 = _mm_slli_epi32(emm0, 29);
 649   v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);
 650
 651   /* get the polynom selection mask for the sine*/
 652   emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
 653   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
 654   v4sf poly_mask = _mm_castsi128_ps(emm2);
 655 #else
 656   /* store the integer part of y in mm2:mm3 */
 657   xmm3 = _mm_movehl_ps(xmm3, y);
 658   mm2 = _mm_cvttps_pi32(y);
 659   mm3 = _mm_cvttps_pi32(xmm3);
 660
 661   /* j=(j+1) & (~1) (see the cephes sources) */
 662   mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
 663   mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
 664   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
 665   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
 666
 667   y = _mm_cvtpi32x2_ps(mm2, mm3);
 668
 669   mm4 = mm2;
 670   mm5 = mm3;
 671
 672   /* get the swap sign flag for the sine */
 673   mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
 674   mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
 675   mm0 = _mm_slli_pi32(mm0, 29);
 676   mm1 = _mm_slli_pi32(mm1, 29);
 677   v4sf swap_sign_bit_sin;
 678   COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
 679
 680   /* get the polynom selection mask for the sine */
 681
 682   mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
 683   mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
 684   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
 685   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
 686   v4sf poly_mask;
 687   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
 688 #endif
 689
 690   /* The magic pass: "Extended precision modular arithmetic"
 691      x = ((x - y * DP1) - y * DP2) - y * DP3; */
 692   xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
 693   xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
 694   xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
 695   xmm1 = _mm_mul_ps(y, xmm1);
 696   xmm2 = _mm_mul_ps(y, xmm2);
 697   xmm3 = _mm_mul_ps(y, xmm3);
 698   x = _mm_add_ps(x, xmm1);
 699   x = _mm_add_ps(x, xmm2);
 700   x = _mm_add_ps(x, xmm3);
 701
 702 #ifdef USE_SSE2
 703   emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
 704   emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
 705   emm4 = _mm_slli_epi32(emm4, 29);
 706   v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
 707 #else
 708   /* get the sign flag for the cosine */
 709   mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
 710   mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
 711   mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
 712   mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
 713   mm4 = _mm_slli_pi32(mm4, 29);
 714   mm5 = _mm_slli_pi32(mm5, 29);
 715   v4sf sign_bit_cos;
 716   COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
 717   _mm_empty(); /* good-bye mmx */
 718 #endif
 719
 720   sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
 721
 722
 723   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
 724   v4sf z = _mm_mul_ps(x,x);
 725   y = *(v4sf*)_ps_coscof_p0;
 726
 727   y = _mm_mul_ps(y, z);
 728   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
 729   y = _mm_mul_ps(y, z);
 730   y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
 731   y = _mm_mul_ps(y, z);
 732   y = _mm_mul_ps(y, z);
 733   v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
 734   y = _mm_sub_ps(y, tmp);
 735   y = _mm_add_ps(y, *(v4sf*)_ps_1);
 736
 737   /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
 738
 739   v4sf y2 = *(v4sf*)_ps_sincof_p0;
 740   y2 = _mm_mul_ps(y2, z);
 741   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
 742   y2 = _mm_mul_ps(y2, z);
 743   y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
 744   y2 = _mm_mul_ps(y2, z);
 745   y2 = _mm_mul_ps(y2, x);
 746   y2 = _mm_add_ps(y2, x);
 747
 748   /* select the correct result from the two polynoms */
 749   xmm3 = poly_mask;
 750   v4sf ysin2 = _mm_and_ps(xmm3, y2);
 751   v4sf ysin1 = _mm_andnot_ps(xmm3, y);
 752   y2 = _mm_sub_ps(y2,ysin2);
 753   y = _mm_sub_ps(y, ysin1);
 754
 755   xmm1 = _mm_add_ps(ysin1,ysin2);
 756   xmm2 = _mm_add_ps(y,y2);
 757
 758   /* update the sign */
 759   *s = _mm_xor_ps(xmm1, sign_bit_sin);
 760   *c = _mm_xor_ps(xmm2, sign_bit_cos);
 761 }
 762