/* Vectorized routines for Intel/AMD, using Streaming SIMD Extensions (SSE). * * Table of contents * 1. SIMD logf(), expf() * 2. Utilities for ps vectors (4 floats in a __m128) * 3. Utilities for epu8 vectors (16 uchars in a __m128i) * 3. Benchmark * 4. Unit tests * 5. Test driver * 6. Example * 7. Copyright and license * * SRE, Sun Dec 16 09:14:51 2007 [Janelia] * SVN $Id: esl_sse.c 524 2010-02-22 01:47:15Z eddys $ ***************************************************************** * Credits: * * The logf() and expf() routines are derivatives of routines by * Julien Pommier [http://gruntthepeon.free.fr/ssemath/]. Those * routines were in turn based on serial implementations in the Cephes * math library by Stephen Moshier [Moshier89; * http://www.moshier.net/#Cephes]. Thanks and credit to both Moshier * and Pommier for their clear code. Additional copyright and license * information is appended at the end of the file. */ #include "esl_config.h" #ifdef HAVE_SSE2 #include #include #include #include #include /* SSE */ #include /* SSE2 */ #include "easel.h" #include "esl_sse.h" /***************************************************************** * 1. SSE SIMD logf(), expf() *****************************************************************/ /* As of Dec 2007, I am unaware of any plans for Intel/AMD to release * SSE intrinsics for logf(), expf(), or other special functions. * * I need them, and the code below should suffice. If you know of * better ways to compute these functions, please let me know. */ /* Function: esl_sse_logf() * Synopsis: * Incept: SRE, Fri Dec 14 11:32:54 2007 [Janelia] * * Purpose: Given a vector containing four floats, returns a * vector in which each element . * * Valid in the domain $x_z > 0$ for normalized IEEE754 * $x_z$. * * For $< 0$, including -0, returns . For $== * 0$ or subnormal , returns <-inf>. For , * returns . For , returns . For * subnormal , returns <-inf>. * * Xref: J2/71. * * Note: Derived from an SSE1 implementation by Julian * Pommier. Converted to SSE2 and added handling * of IEEE754 specials. */ __m128 esl_sse_logf(__m128 x) { static float cephes_p[9] = { 7.0376836292E-2f, -1.1514610310E-1f, 1.1676998740E-1f, -1.2420140846E-1f, 1.4249322787E-1f, -1.6668057665E-1f, 2.0000714765E-1f, -2.4999993993E-1f, 3.3333331174E-1f }; __m128 onev = _mm_set1_ps(1.0f); /* all elem = 1.0 */ __m128 v0p5 = _mm_set1_ps(0.5f); /* all elem = 0.5 */ __m128i vneg = _mm_set1_epi32(0x80000000); /* all elem have IEEE sign bit up */ __m128i vexp = _mm_set1_epi32(0x7f800000); /* all elem have IEEE exponent bits up */ __m128i ei; __m128 e; __m128 invalid_mask, zero_mask, inf_mask; /* masks used to handle special IEEE754 inputs */ __m128 mask; __m128 origx; __m128 tmp; __m128 y; __m128 z; /* first, split x apart: x = frexpf(x, &e); */ ei = _mm_srli_epi32( _mm_castps_si128(x), 23); /* shift right 23: IEEE754 floats: ei = biased exponents */ invalid_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vneg), vneg)); /* mask any elem that's negative; these become NaN */ zero_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32(ei, _mm_setzero_si128())); /* mask any elem zero or subnormal; these become -inf */ inf_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vexp), vexp)); /* mask any elem inf or NaN; log(inf)=inf, log(NaN)=NaN */ origx = x; /* store original x, used for log(inf) = inf, log(NaN) = NaN */ x = _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(~0x7f800000))); /* x now the stored 23 bits of the 24-bit significand */ x = _mm_or_ps (x, v0p5); /* sets hidden bit b[0] */ ei = _mm_sub_epi32(ei, _mm_set1_epi32(126)); /* -127 (ei now signed base-2 exponent); then +1 */ e = _mm_cvtepi32_ps(ei); /* now, calculate the log */ mask = _mm_cmplt_ps(x, _mm_set1_ps(0.707106781186547524f)); /* avoid conditional branches. */ tmp = _mm_and_ps(x, mask); /* tmp contains x values < 0.707, else 0 */ x = _mm_sub_ps(x, onev); e = _mm_sub_ps(e, _mm_and_ps(onev, mask)); x = _mm_add_ps(x, tmp); z = _mm_mul_ps(x,x); y = _mm_set1_ps(cephes_p[0]); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[6])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[7])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[8])); y = _mm_mul_ps(y, x); y = _mm_mul_ps(y, z); tmp = _mm_mul_ps(e, _mm_set1_ps(-2.12194440e-4f)); y = _mm_add_ps(y, tmp); tmp = _mm_mul_ps(z, v0p5); y = _mm_sub_ps(y, tmp); tmp = _mm_mul_ps(e, _mm_set1_ps(0.693359375f)); x = _mm_add_ps(x, y); x = _mm_add_ps(x, tmp); /* IEEE754 cleanup: */ x = esl_sse_select_ps(x, origx, inf_mask); /* log(inf)=inf; log(NaN) = NaN */ x = _mm_or_ps(x, invalid_mask); /* log(x<0, including -0,-inf) = NaN */ x = esl_sse_select_ps(x, _mm_set1_ps(-eslINFINITY), zero_mask); /* x zero or subnormal = -inf */ return x; } /* Function: esl_sse_expf() * Synopsis: * Incept: SRE, Fri Dec 14 14:46:27 2007 [Janelia] * * Purpose: Given a vector containing four floats, returns a * vector in which each element . * * Valid for all IEEE754 floats $x_z$. * * Xref: J2/71 * * Note: Derived from an SSE1 implementation by Julian * Pommier. Converted to SSE2. */ __m128 esl_sse_expf(__m128 x) { static float cephes_p[6] = { 1.9875691500E-4f, 1.3981999507E-3f, 8.3334519073E-3f, 4.1665795894E-2f, 1.6666665459E-1f, 5.0000001201E-1f }; static float cephes_c[2] = { 0.693359375f, -2.12194440e-4f }; static float maxlogf = 88.72283905206835; /* log(2^128) */ static float minlogf = -103.27892990343185; /* log(2^-149) */ __m128i k; __m128 mask, tmp, fx, z, y, minmask, maxmask; /* handle out-of-range and special conditions */ maxmask = _mm_cmpgt_ps(x, _mm_set1_ps(maxlogf)); minmask = _mm_cmple_ps(x, _mm_set1_ps(minlogf)); /* range reduction: exp(x) = 2^k e^f = exp(f + k log 2); k = floorf(0.5 + x / log2): */ fx = _mm_mul_ps(x, _mm_set1_ps(eslCONST_LOG2R)); fx = _mm_add_ps(fx, _mm_set1_ps(0.5f)); /* floorf() with SSE: */ k = _mm_cvttps_epi32(fx); /* cast to int with truncation */ tmp = _mm_cvtepi32_ps(k); /* cast back to float */ mask = _mm_cmpgt_ps(tmp, fx); /* if it increased (i.e. if it was negative...) */ mask = _mm_and_ps(mask, _mm_set1_ps(1.0f)); /* ...without a conditional branch... */ fx = _mm_sub_ps(tmp, mask); /* then subtract one. */ k = _mm_cvttps_epi32(fx); /* k is now ready for the 2^k part. */ /* polynomial approx for e^f for f in range [-0.5, 0.5] */ tmp = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[0])); z = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[1])); x = _mm_sub_ps(x, tmp); x = _mm_sub_ps(x, z); z = _mm_mul_ps(x, x); y = _mm_set1_ps(cephes_p[0]); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5])); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(1.0f)); /* build 2^k by hand, by creating a IEEE754 float */ k = _mm_add_epi32(k, _mm_set1_epi32(127)); k = _mm_slli_epi32(k, 23); fx = _mm_castsi128_ps(k); /* put 2^k e^f together (fx = 2^k, y = e^f) and we're done */ y = _mm_mul_ps(y, fx); /* special/range cleanup */ y = esl_sse_select_ps(y, _mm_set1_ps(eslINFINITY), maxmask); /* exp(x) = inf for x > log(2^128) */ y = esl_sse_select_ps(y, _mm_set1_ps(0.0f), minmask); /* exp(x) = 0 for x < log(2^-149) */ return y; } /***************************************************************** * 2. Utilities for ps vectors (4 floats in a __m128) *****************************************************************/ void esl_sse_dump_ps(FILE *fp, __m128 v) { float *p = (float *)&v; fprintf(fp, "[%13.8g, %13.8g, %13.8g, %13.8g]", p[0], p[1], p[2], p[3]); } /***************************************************************** * 3. Benchmark *****************************************************************/ #ifdef eslSSE_BENCHMARK /* gcc -msse2 -O3 -o benchmark-sse -I ~/src/hmmer/easel -L ~/src/hmmer/easel -DeslSSE_BENCHMARK -DHAVE_SSE2 esl_sse.c -leasel -lm */ #include "esl_config.h" #include #include #include "easel.h" #include "esl_getopts.h" #include "esl_stopwatch.h" static ESL_OPTIONS options[] = { /* name type default env range toggles reqs incomp help docgroup*/ { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 }, { "-N", eslARG_INT,"10000000", NULL, NULL, NULL, NULL, NULL, "number of trials", 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, }; static char usage[] = "[-options]"; static char banner[] = "benchmark driver for sse module"; int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage); ESL_STOPWATCH *w = esl_stopwatch_Create(); int N = esl_opt_GetInteger(go, "-N"); float origx = 2.0; float x = origx; __m128 xv = _mm_set1_ps(x); int i; /* First, serial time. */ esl_stopwatch_Start(w); for (i = 0; i < N; i++) { x = logf(x); x = expf(x); } esl_stopwatch_Stop(w); esl_stopwatch_Display(stdout, w, "# serial CPU time: "); /* Vector time */ esl_stopwatch_Start(w); for (i = 0; i < N; i++) { xv = esl_sse_logf(xv); xv = esl_sse_expf(xv); } esl_stopwatch_Stop(w); esl_stopwatch_Display(stdout, w, "# vector CPU time: "); /* If you don't do something with x and xv, the compiler may optimize them away */ printf("%g => many scalar logf,expf cycles => %g\n", origx, N, x); printf("%g => many vector logf,expf cycles => ", origx, N); esl_sse_dump_ps(stdout, xv); printf("\n"); esl_stopwatch_Destroy(w); esl_getopts_Destroy(go); return 0; } #endif /*eslSSE_BENCHMARK*/ /***************************************************************** * 4. Unit tests *****************************************************************/ #ifdef eslSSE_TESTDRIVE #include "esl_getopts.h" #include "esl_random.h" /* utest_logf(): Test range/domain of logf */ static void utest_logf(ESL_GETOPTS *go) { __m128 x; /* test input */ union { __m128 v; float x[4]; } r; /* test output */ /* Test IEEE754 specials: * log(-inf) = NaN log(x<0) = NaN log(-0) = NaN * log(0) = -inf log(inf) = inf log(NaN) = NaN */ x = _mm_set_ps(0.0, -0.0, -1.0, -eslINFINITY); /* set_ps() is in order 3 2 1 0 */ r.v = esl_sse_logf(x); if (esl_opt_GetBoolean(go, "-v")) { printf("logf"); esl_sse_dump_ps(stdout, x); printf(" ==> "); esl_sse_dump_ps(stdout, r.v); printf("\n"); } if (! isnan(r.x[0])) esl_fatal("logf(-inf) should be NaN"); if (! isnan(r.x[1])) esl_fatal("logf(-1) should be NaN"); if (! isnan(r.x[2])) esl_fatal("logf(-0) should be NaN"); if (! (r.x[3] < 0 && isinf(r.x[3]))) esl_fatal("logf(0) should be -inf"); x = _mm_set_ps(FLT_MAX, FLT_MIN, eslNaN, eslINFINITY); r.v = esl_sse_logf(x); if (esl_opt_GetBoolean(go, "-v")) { printf("logf"); esl_sse_dump_ps(stdout, x); printf(" ==> "); esl_sse_dump_ps(stdout, r.v); printf("\n"); } if (! isinf(r.x[0])) esl_fatal("logf(inf) should be inf"); if (! isnan(r.x[1])) esl_fatal("logf(NaN) should be NaN"); } /* utest_expf(): Test range/domain of expf */ static void utest_expf(ESL_GETOPTS *go) { __m128 x; /* test input */ union { __m128 v; float x[4]; } r; /* test output */ /* exp(-inf) = 0 exp(-0) = 1 exp(0) = 1 exp(inf) = inf exp(NaN) = NaN */ x = _mm_set_ps(eslINFINITY, 0.0, -0.0, -eslINFINITY); /* set_ps() is in order 3 2 1 0 */ r.v = esl_sse_expf(x); if (esl_opt_GetBoolean(go, "-v")) { printf("expf"); esl_sse_dump_ps(stdout, x); printf(" ==> "); esl_sse_dump_ps(stdout, r.v); printf("\n"); } if (r.x[0] != 0.0f) esl_fatal("expf(-inf) should be 0"); if (r.x[1] != 1.0f) esl_fatal("logf(-0) should be 1"); if (r.x[2] != 1.0f) esl_fatal("logf(0) should be 1"); if (! isinf(r.x[3])) esl_fatal("logf(inf) should be inf"); /* exp(NaN) = NaN exp(large) = inf exp(-large) = 0 exp(1) = exp(1) */ x = _mm_set_ps(1.0f, -666.0f, 666.0f, eslNaN); /* set_ps() is in order 3 2 1 0 */ r.v = esl_sse_expf(x); if (esl_opt_GetBoolean(go, "-v")) { printf("expf"); esl_sse_dump_ps(stdout, x); printf(" ==> "); esl_sse_dump_ps(stdout, r.v); printf("\n"); } if (! isnan(r.x[0])) esl_fatal("expf(NaN) should be NaN"); if (! isinf(r.x[1])) esl_fatal("expf(large x) should be inf"); if (r.x[2] != 0.0f) esl_fatal("expf(-large x) should be 0"); } /* utest_odds(): test accuracy of logf, expf on odds ratios, * our main intended use. */ static void utest_odds(ESL_GETOPTS *go, ESL_RANDOMNESS *r) { int N = esl_opt_GetInteger(go, "-N"); int verbose = esl_opt_GetBoolean(go, "-v"); int very_verbose = esl_opt_GetBoolean(go, "--vv"); int i; float p1, p2, odds; union { __m128 v; float x[4]; } r1; union { __m128 v; float x[4]; } r2; float scalar_r1, scalar_r2; double err1, maxerr1 = 0.0, avgerr1 = 0.0; /* errors on logf() */ double err2, maxerr2 = 0.0, avgerr2 = 0.0; /* errors on expf() */ for (i = 0; i < N; i++) { p1 = esl_rnd_UniformPositive(r); p2 = esl_rnd_UniformPositive(r); odds = p1 / p2; if (odds == 0.0) esl_fatal("whoa, odds ratio can't be 0!\n"); r1.v = esl_sse_logf(_mm_set1_ps(odds)); /* r1.x[z] = log(p1/p2) */ scalar_r1 = logf(odds); err1 = (r1.x[0] == 0. && scalar_r1 == 0.) ? 0.0 : 2 * fabs(r1.x[0] - scalar_r1) / fabs(r1.x[0] + scalar_r1); if (err1 > maxerr1) maxerr1 = err1; avgerr1 += err1 / (float) N; if (isnan(avgerr1)) esl_fatal("whoa, what?\n"); r2.v = esl_sse_expf(r1.v); /* and back to odds */ scalar_r2 = expf(r1.x[0]); err2 = (r2.x[0] == 0. && scalar_r2 == 0.) ? 0.0 : 2 * fabs(r2.x[0] - scalar_r2) / fabs(r2.x[0] + scalar_r2); if (err2 > maxerr2) maxerr2 = err2; avgerr2 += err2 / (float) N; if (very_verbose) printf("%13.7g %13.7g %13.7g %13.7g %13.7g %13.7g %13.7g\n", odds, scalar_r1, r1.x[0], scalar_r2, r2.x[0], err1, err2); } if (avgerr1 > 1e-8) esl_fatal("average error on logf() is intolerable\n"); if (maxerr1 > 1e-6) esl_fatal("maximum error on logf() is intolerable\n"); if (avgerr2 > 1e-8) esl_fatal("average error on expf() is intolerable\n"); if (maxerr2 > 1e-6) esl_fatal("maximum error on expf() is intolerable\n"); if (verbose) { printf("Average [max] logf() relative error in %d odds trials: %13.8g [%13.8g]\n", N, avgerr1, maxerr1); printf("Average [max] expf() relative error in %d odds trials: %13.8g [%13.8g]\n", N, avgerr2, maxerr2); printf("(random seed : %" PRIu32 ")\n", esl_randomness_GetSeed(r)); } } #endif /*eslSSE_TESTDRIVE*/ /***************************************************************** * 5. Test driver *****************************************************************/ #ifdef eslSSE_TESTDRIVE /* gcc -msse2 -g -Wall -o test -I. -L. -DeslSSE_TESTDRIVE esl_sse.c -leasel -lm */ #include "esl_config.h" #include #include #include "easel.h" #include "esl_getopts.h" #include "esl_random.h" #include "esl_sse.h" static ESL_OPTIONS options[] = { /* name type default env range toggles reqs incomp help docgroup*/ { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 }, { "-N", eslARG_INT, "10000", NULL, NULL, NULL, NULL, NULL, "number of random test points", 0 }, { "-s", eslARG_INT, "42", NULL, NULL, NULL, NULL, NULL, "set random number seed to ", 0 }, { "-v", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "be verbose: show test report", 0 }, { "--vv", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "be very verbose: show individual test samples", 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, }; static char usage[] = "[-options]"; static char banner[] = "test driver for sse module"; int main(int argc, char **argv) { ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage); ESL_RANDOMNESS *r = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));; utest_logf(go); utest_expf(go); utest_odds(go, r); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; } #endif /* eslSSE_TESTDRIVE*/ /***************************************************************** * 6. Example *****************************************************************/ #ifdef eslSSE_EXAMPLE /*::cexcerpt::sse_example::begin::*/ /* gcc -msse2 -g -Wall -o example -I. -L. -DeslSSE_EXAMPLE esl_sse.c -leasel -lm */ #include "esl_config.h" #include #include #include "easel.h" #include "esl_sse.h" int main(int argc, char **argv) { float x; /* scalar input */ __m128 xv; /* input vector */ union { __m128 v; float x[4]; } rv; /* result vector*/ x = 2.0; xv = _mm_set1_ps(x); rv.v = esl_sse_logf(xv); printf("logf(%f) = %f\n", x, rv.x[0]); rv.v = esl_sse_expf(xv); printf("expf(%f) = %f\n", x, rv.x[0]); return 0; } /*::cexcerpt::sse_example::end::*/ #endif /*eslSSE_EXAMPLE*/ #else /* ! HAVE_SSE2*/ /* The remainder of the file is just bookkeeping, for what to do when * we aren't compiling with SSE instructions. */ /* Throw a bone to compilers that would warn about empty compilation unit: */ #include "easel.h" /* Provide a successful unit test on platforms where we don't have SSE instructions: */ #ifdef eslSSE_TESTDRIVE int main(void) { return 0; } #endif #endif /* HAVE_SSE2 or not*/ /***************************************************************** * Easel - a library of C functions for biological sequence analysis * Version h3.0; March 2010 * Copyright (C) 2010 Howard Hughes Medical Institute. * Other copyrights also apply. See the COPYRIGHT file for a full list. * * Easel is distributed under the Janelia Farm Software License, a BSD * license. See the LICENSE file for more details. *****************************************************************/ /* Additionally, esl_sse_logf() and esl_sse_expf() are * Copyright (C) 2007 Julien Pommier * Copyright (C) 1992 Stephen Moshier * * These functions derived from zlib-licensed routines by * Julien Pommier, http://gruntthepeon.free.fr/ssemath/. The * zlib license: */ /* Copyright (C) 2007 Julien Pommier This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. */ /* In turn, Pommier had derived the logf() and expf() functions from * serial versions in the Cephes math library. According to its * readme, Cephes is "copyrighted by the author" and "may be used * freely but it comes with no support or guarantee." Cephes is * available in NETLIB [http://www.netlib.org/cephes/]. NETLIB is * widely considered to be a free scientific code repository, though * the copyright and license status of many parts, including Cephes, * is ill-defined. We have attached Moshier's copyright, * to credit his original contribution. Thanks to both Pommier and * Moshier for their clear code. */