/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log

   Inspired by the Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library.

   The default is to use the SSE1 version. If you define USE_SSE2, the
   SSE2 intrinsics will be used in place of the MMX intrinsics. Do
   not expect any significant performance improvement with SSE2.
*/
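/* As an illustration only (not part of the original build instructions), the two
   variants could be compiled with gcc like this:

     gcc -O2 -msse  -c my_file.c              // SSE1 + MMX code paths
     gcc -O2 -msse2 -DUSE_SSE2 -c my_file.c   // SSE2 code paths

   where my_file.c is a hypothetical translation unit that includes this source.
   Note that USE_SSE2 is also hard-coded below, so the -D flag is redundant here. */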
/* Copyright (C) 2007  Julien Pommier

   This software is provided 'as-is', without any express or implied
   warranty.  In no event will the authors be held liable for any damages
   arising from the use of this software.

   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   (this is the zlib license)
*/
#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#define USE_SSE2 // use the SSE2 version

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf;  // vector of 4 floats (sse1)

#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 ints (sse2)
#else
typedef __m64 v2si;   // vector of 2 ints (mmx)
#endif
/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val)                                            \
  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val)                                          \
  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val)                                 \
  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
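/* For reference, a sketch of what these macros expand to (assuming gcc, where
   ALIGN16_END is the attribute form):

     _PS_CONST(0p5, 0.5f);
     // expands to:
     // static const float _ps_0p5[4] __attribute__((aligned(16))) = { 0.5f, 0.5f, 0.5f, 0.5f };

   Such a constant is then loaded into a register below with *(v4sf*)_ps_0p5
   (all four lanes hold the same value). */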
_PS_CONST(1, 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, 0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);
_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, -1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, -1.2420140846E-1);
_PS_CONST(cephes_log_p4, +1.4249322787E-1);
_PS_CONST(cephes_log_p5, -1.6668057665E-1);
_PS_CONST(cephes_log_p6, +2.0000714765E-1);
_PS_CONST(cephes_log_p7, -2.4999993993E-1);
_PS_CONST(cephes_log_p8, +3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);
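/* Summary of how the constants above are used by log_ps below (a restatement of the
   code, for reference): the input is decomposed as x = m * 2^e, m is shifted into
   [sqrt(1/2), sqrt(2)) and reduced by 1, and then

     log(1+m) ~= m - 0.5*m^2 + m^3 * (p0 + p1*m + ... + p8*m^8)

   with the exponent folded back in as e*q1 + e*q2 (q1 and q2 sum to log(2), split so
   the low-order part is added separately for accuracy). */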
#if defined (__MINGW32__)

/* the ugly part below: many versions of gcc used to be completely buggy with respect to some intrinsics.
   The movehl_ps is fixed in mingw 3.4.5, but I found out that all the _mm_cmp* intrinsics were completely
   broken on my mingw gcc 3.4.5 ...
   Note that the bug on _mm_cmp* occurs only at the -O0 optimization level. */

inline __m128 my_movehl_ps(__m128 a, const __m128 b) {
  asm ("movhlps %2,%0\n\t" : "=x" (a) : "0" (a), "x" (b));
  return a;
}
#warning "redefined _mm_movehl_ps (see gcc bug 21179)"
#define _mm_movehl_ps my_movehl_ps

inline __m128 my_cmplt_ps(__m128 a, const __m128 b) {
  asm ("cmpltps %2,%0\n\t" : "=x" (a) : "0" (a), "x" (b));
  return a;
}
inline __m128 my_cmpgt_ps(__m128 a, const __m128 b) {
  asm ("cmpnleps %2,%0\n\t" : "=x" (a) : "0" (a), "x" (b));
  return a;
}
inline __m128 my_cmpeq_ps(__m128 a, const __m128 b) {
  asm ("cmpeqps %2,%0\n\t" : "=x" (a) : "0" (a), "x" (b));
  return a;
}
#warning "redefined _mm_cmpxx_ps functions..."
#define _mm_cmplt_ps my_cmplt_ps
#define _mm_cmpgt_ps my_cmpgt_ps
#define _mm_cmpeq_ps my_cmpeq_ps
#endif

typedef union xmm_mm_union {
  __m128 xmm;
  __m64 mm[2];
} xmm_mm_union;

#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
    xmm_mm_union u; u.xmm = xmm_;          \
    mm0_ = u.mm[0];                        \
    mm1_ = u.mm[1];                        \
}

#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                    \
    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
}
/* natural logarithm computed for 4 simultaneous floats
   return NaN for x <= 0
*/
v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
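  /* Explanatory note: for a normalized float, the biased exponent sits in bits 23..30
     of its representation, so shifting the raw bits right by 23 (done above) and
     subtracting the bias 0x7f (done just below) extracts the exponent, while clearing
     the exponent field and OR-ing in 0.5 keeps only the mantissa, now in [0.5, 1).
     Together with the e+1 adjustment further down, this is a branch-free equivalent
     of m = frexpf(x, &e). */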
#ifndef USE_SSE2
  /* now e=mm0:mm1 contain the real base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
  v4sf e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
  v4sf tmp = _mm_and_ps(x, mask);
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
  x = _mm_add_ps(x, tmp);

  v4sf z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  y = _mm_mul_ps(y, z);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);
  x = _mm_or_ps(x, invalid_mask); // a negative arg will yield NaN
  return x;
}
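/* A minimal usage sketch for log_ps (illustrative only; the array names are made up
   for this example):

     float in[4] = { 1.0f, 2.718282f, 10.0f, 0.5f };
     float out[4];
     _mm_storeu_ps(out, log_ps(_mm_loadu_ps(in)));
     // out is now approximately { 0, 1, 2.302585, -0.693147 }
*/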
_PS_CONST(exp_hi,  88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
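/* Summary of the scheme used by exp_ps below (a restatement of the cephes expf
   algorithm, for reference): write exp(x) = 2^n * exp(g) with n = floor(x*log2(e) + 0.5)
   and g = x - n*log(2), where log(2) is subtracted as C1 + C2 (a split constant) to
   keep g accurate; exp(g) is then a degree-5 polynomial in g, and 2^n is built directly
   in the exponent field of a float. The exp_hi/exp_lo clamps keep n in the
   representable range. */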
v4sf exp_ps(v4sf x) {
  v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
  v4si emm0;
#else
  v2si mm0, mm1;
#endif
  v4sf one = *(v4sf*)_ps_1;

  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);

  /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
  /* step 1 : cast to int */
  tmp = _mm_movehl_ps(tmp, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(tmp);
  /* step 2 : cast back to float */
  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
  emm0 = _mm_cvttps_epi32(fx);
  tmp = _mm_cvtepi32_ps(emm0);
#endif
  /* if greater, subtract 1 */
  v4sf mask = _mm_cmpgt_ps(tmp, fx);
  mask = _mm_and_ps(mask, one);
  fx = _mm_sub_ps(tmp, mask);
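  /* Explanatory note: the floor emulation above works by truncating fx toward zero
     through a float->int->float round trip, then subtracting 1 wherever the truncated
     value came out greater than fx (which happens for negative non-integer values).
     The comparison mask is all-ones per lane, so AND-ing it with 1.0 turns it into
     the 0/1 correction to subtract. */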
  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
  x = _mm_sub_ps(x, tmp);
  x = _mm_sub_ps(x, z);

  z = _mm_mul_ps(x,x);

  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, one);

  /* build 2^n */
#ifndef USE_SSE2
  z = _mm_movehl_ps(z, fx);
  mm0 = _mm_cvttps_pi32(fx);
  mm1 = _mm_cvttps_pi32(z);
  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
  mm0 = _mm_slli_pi32(mm0, 23);
  mm1 = _mm_slli_pi32(mm1, 23);

  v4sf pow2n;
  COPY_MM_TO_XMM(mm0, mm1, pow2n);
  _mm_empty(); /* bye bye mmx */
#else
  emm0 = _mm_cvttps_epi32(fx);
  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
  emm0 = _mm_slli_epi32(emm0, 23);
  v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
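  /* Explanatory note: the block above builds pow2n = 2^n with integer operations only,
     by adding the IEEE-754 exponent bias 0x7f to n and shifting the result into the
     exponent field (bits 23..30) of a float. */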
  y = _mm_mul_ps(y, pow2n);
  return y;
}
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1,  8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0,  2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2,  4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics, so
   it also runs on old Athlon XPs and the Pentium III of your
   grandmother :-)

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for larger values
   -- it does not return garbage for arguments over 8192, but the
   extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is a
   surprising but correct result.

   Performance is also surprisingly good: 1.33 times faster than the
   macos vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of amd's ACML (which is only available in 64-bit). Not
   too bad for an SSE1 function (with no special tuning)!
   However, the latter libraries probably have much better handling of NaN,
   Inf, denormalized and other special arguments.

   On my Core 1 Duo, the execution of this function takes approximately 95 cycles.

   From what I have observed in experiments with the Intel AMath lib, switching
   to an SSE2 version would improve the performance by only 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   get good performance.
*/
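/* A minimal usage sketch for sin_ps (illustrative only; names made up for the example):

     float angles[4] = { 0.0f, 0.5235988f, 1.5707964f, 3.1415927f }; // 0, pi/6, pi/2, pi
     float s[4];
     _mm_storeu_ps(s, sin_ps(_mm_loadu_ps(angles)));
     // s is now approximately { 0, 0.5, 1, 8.74e-8 } (see the note on M_PI above)
*/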
v4sf sin_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  sign_bit = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (the upper one) */
  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  /* get the swap sign flag */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask:
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed.
  */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  /* get the swap sign flag */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  /* get the polynomial selection mask */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf swap_sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);
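  /* Explanatory note: -DP1, -DP2 and -DP3 sum to -Pi/4, split so that each piece has
     only a few significant mantissa bits. Subtracting y*DP1, y*DP2 and y*DP3 one after
     another therefore removes y*Pi/4 from x with much less rounding error than a
     single multiply and subtract, which is what keeps the reduced argument accurate
     for fairly large x. */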
  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
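  /* Explanatory note: poly_mask (copied into xmm3 above) is all-ones or all-zeros in
     each lane, so the and/andnot/add sequence is a branchless per-lane select: the
     sine polynomial result y2 is taken where the mask is set, the cosine polynomial
     result y elsewhere. */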
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}
/* almost the same as sin_ps */
v4sf cos_ps(v4sf x) { // any x
  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  v4si emm0, emm2;
#else
  v2si mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynomial selection mask */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

  v4sf sign_bit = _mm_castsi128_ps(emm0);
  v4sf poly_mask = _mm_castsi128_ps(emm2);
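  /* Explanatory note (my reading of the code, following cephes cosf): compared with
     sin_ps, the quadrant index is decremented by 2 before the sign and polynomial
     masks are derived, and the sign flag uses andnot instead of and; together this
     accounts for the Pi/2 phase shift between cosine and sine while the polynomial
     evaluation below stays exactly the same. */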
#else
  /* store the integer part of y in mm2:mm3 */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);

  /* get the swap sign flag in mm0:mm1 and the
     polynomial selection mask in mm2:mm3 */
  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);

  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);

  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

  v4sf sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);
  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v4sf*)_ps_coscof_p0;
  v4sf z = _mm_mul_ps(x,x);

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y, y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);
  return y;
}
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them.
   It is almost as fast, and gives you a free cosine with your sine. */
void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  v4si emm0, emm2, emm4;
#else
  v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  /* extract the sign bit (the upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  emm4 = emm2;

  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynomial selection mask for the sine */
  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  v4sf swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynomial selection mask for the sine */
  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  v4sf poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif
  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);
#ifdef USE_SSE2
  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  v4sf sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  v4sf z = _mm_mul_ps(x,x);
  y = *(v4sf*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(v4sf*)_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */
  v4sf y2 = *(v4sf*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);
  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  v4sf ysin2 = _mm_and_ps(xmm3, y2);
  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
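/* A minimal usage sketch for sincos_ps (illustrative only; names made up for the
   example):

     float angles[4] = { 0.1f, 0.2f, 0.3f, 0.4f };
     float sf[4], cf[4];
     v4sf vs, vc;
     sincos_ps(_mm_loadu_ps(angles), &vs, &vc);
     _mm_storeu_ps(sf, vs);
     _mm_storeu_ps(cf, vc);
     // sf[i] ~ sinf(angles[i]) and cf[i] ~ cosf(angles[i]) in each lane
*/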