350 lines
13 KiB
D
350 lines
13 KiB
D
/**
|
|
* Transcendental bonus functions.
|
|
*
|
|
* Copyright: Copyright Guillaumr Piolat 2016-2020.
|
|
* Copyright (C) 2007 Julien Pommier
|
|
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
|
|
*/
|
|
module inteli.math;
|
|
|
|
/* Copyright (C) 2007 Julien Pommier
|
|
|
|
This software is provided 'as-is', without any express or implied
|
|
warranty. In no event will the authors be held liable for any damages
|
|
arising from the use of this software.
|
|
|
|
Permission is granted to anyone to use this software for any purpose,
|
|
including commercial applications, and to alter it and redistribute it
|
|
freely, subject to the following restrictions:
|
|
|
|
1. The origin of this software must not be misrepresented; you must not
|
|
claim that you wrote the original software. If you use this software
|
|
in a product, an acknowledgment in the product documentation would be
|
|
appreciated but is not required.
|
|
2. Altered source versions must be plainly marked as such, and must not be
|
|
misrepresented as being the original software.
|
|
3. This notice may not be removed or altered from any source distribution.
|
|
|
|
(this is the zlib license)
|
|
*/
|
|
import inteli.emmintrin;
|
|
import inteli.internals;
|
|
|
|
nothrow @nogc:
|
|
|
|
/// Natural `log` computed for a single 32-bit float.
|
|
/// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50
|
|
/// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
|
|
// #BONUS
|
|
float _mm_log_ss(float v) pure @safe
|
|
{
|
|
__m128 r = _mm_log_ps(_mm_set1_ps(v));
|
|
return r.array[0];
|
|
}
|
|
|
|
/// Natural logarithm computed for 4 simultaneous float.
|
|
/// This is an approximation, valid up to approximately -119dB of accuracy, on the range -inf..50
|
|
/// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
|
|
// #BONUS
|
|
__m128 _mm_log_ps(__m128 x) pure @safe
|
|
{
|
|
static immutable __m128i _psi_inv_mant_mask = [~0x7f800000, ~0x7f800000, ~0x7f800000, ~0x7f800000];
|
|
static immutable __m128 _ps_cephes_SQRTHF = [0.707106781186547524, 0.707106781186547524, 0.707106781186547524, 0.707106781186547524];
|
|
static immutable __m128 _ps_cephes_log_p0 = [7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2, 7.0376836292E-2];
|
|
static immutable __m128 _ps_cephes_log_p1 = [- 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1, - 1.1514610310E-1];
|
|
static immutable __m128 _ps_cephes_log_p2 = [1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1, 1.1676998740E-1];
|
|
static immutable __m128 _ps_cephes_log_p3 = [- 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1, - 1.2420140846E-1];
|
|
static immutable __m128 _ps_cephes_log_p4 = [+ 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1, + 1.4249322787E-1];
|
|
static immutable __m128 _ps_cephes_log_p5 = [- 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1, - 1.6668057665E-1];
|
|
static immutable __m128 _ps_cephes_log_p6 = [+ 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1, + 2.0000714765E-1];
|
|
static immutable __m128 _ps_cephes_log_p7 = [- 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1, - 2.4999993993E-1];
|
|
static immutable __m128 _ps_cephes_log_p8 = [+ 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1, + 3.3333331174E-1];
|
|
static immutable __m128 _ps_cephes_log_q1 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4];
|
|
static immutable __m128 _ps_cephes_log_q2 = [0.693359375, 0.693359375, 0.693359375, 0.693359375];
|
|
|
|
/* the smallest non denormalized float number */
|
|
static immutable __m128i _psi_min_norm_pos = [0x00800000, 0x00800000, 0x00800000, 0x00800000];
|
|
|
|
__m128i emm0;
|
|
__m128 one = _ps_1;
|
|
__m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
|
|
x = _mm_max_ps(x, cast(__m128)_psi_min_norm_pos); /* cut off denormalized stuff */
|
|
emm0 = _mm_srli_epi32(cast(__m128i)x, 23);
|
|
|
|
/* keep only the fractional part */
|
|
x = _mm_and_ps(x, cast(__m128)_psi_inv_mant_mask);
|
|
x = _mm_or_ps(x, _ps_0p5);
|
|
|
|
emm0 = _mm_sub_epi32(emm0, _pi32_0x7f);
|
|
__m128 e = _mm_cvtepi32_ps(emm0);
|
|
e += one;
|
|
__m128 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF);
|
|
__m128 tmp = _mm_and_ps(x, mask);
|
|
x -= one;
|
|
e -= _mm_and_ps(one, mask);
|
|
x += tmp;
|
|
__m128 z = x * x;
|
|
__m128 y = _ps_cephes_log_p0;
|
|
y *= x;
|
|
y += _ps_cephes_log_p1;
|
|
y *= x;
|
|
y += _ps_cephes_log_p2;
|
|
y *= x;
|
|
y += _ps_cephes_log_p3;
|
|
y *= x;
|
|
y += _ps_cephes_log_p4;
|
|
y *= x;
|
|
y += _ps_cephes_log_p5;
|
|
y *= x;
|
|
y += _ps_cephes_log_p6;
|
|
y *= x;
|
|
y += _ps_cephes_log_p7;
|
|
y *= x;
|
|
y += _ps_cephes_log_p8;
|
|
y *= x;
|
|
|
|
y = y * z;
|
|
tmp = e * _ps_cephes_log_q1;
|
|
y += tmp;
|
|
tmp = z * _ps_0p5;
|
|
y = y - tmp;
|
|
tmp = e * _ps_cephes_log_q2;
|
|
x += y;
|
|
x += tmp;
|
|
x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
|
|
return x;
|
|
}
|
|
|
|
/// Natural `exp` computed for a single float.
|
|
/// This is an approximation, valid up to approximately -109dB of accuracy
|
|
/// IMPORTANT: NaN input not supported.
|
|
// #BONUS
|
|
float _mm_exp_ss(float v) pure @safe
|
|
{
|
|
__m128 r = _mm_exp_ps(_mm_set1_ps(v));
|
|
return r.array[0];
|
|
}
|
|
|
|
/// Natural `exp` computed for 4 simultaneous float in `x`.
|
|
/// This is an approximation, valid up to approximately -109dB of accuracy
|
|
/// IMPORTANT: NaN input not supported.
|
|
// #BONUS
|
|
__m128 _mm_exp_ps(__m128 x) pure @safe
|
|
{
|
|
static immutable __m128 _ps_exp_hi = [88.3762626647949f, 88.3762626647949f, 88.3762626647949f, 88.3762626647949f];
|
|
static immutable __m128 _ps_exp_lo = [-88.3762626647949f, -88.3762626647949f, -88.3762626647949f, -88.3762626647949f];
|
|
static immutable __m128 _ps_cephes_LOG2EF = [1.44269504088896341, 1.44269504088896341, 1.44269504088896341, 1.44269504088896341];
|
|
static immutable __m128 _ps_cephes_exp_C1 = [0.693359375, 0.693359375, 0.693359375, 0.693359375];
|
|
static immutable __m128 _ps_cephes_exp_C2 = [-2.12194440e-4, -2.12194440e-4, -2.12194440e-4, -2.12194440e-4];
|
|
static immutable __m128 _ps_cephes_exp_p0 = [1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4, 1.9875691500E-4];
|
|
static immutable __m128 _ps_cephes_exp_p1 = [1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3, 1.3981999507E-3];
|
|
static immutable __m128 _ps_cephes_exp_p2 = [8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3, 8.3334519073E-3];
|
|
static immutable __m128 _ps_cephes_exp_p3 = [4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2, 4.1665795894E-2];
|
|
static immutable __m128 _ps_cephes_exp_p4 = [1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1, 1.6666665459E-1];
|
|
static immutable __m128 _ps_cephes_exp_p5 = [5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1, 5.0000001201E-1];
|
|
|
|
__m128 tmp = _mm_setzero_ps(), fx;
|
|
__m128i emm0;
|
|
__m128 one = _ps_1;
|
|
|
|
x = _mm_min_ps(x, _ps_exp_hi);
|
|
x = _mm_max_ps(x, _ps_exp_lo);
|
|
|
|
/* express exp(x) as exp(g + n*log(2)) */
|
|
fx = x * _ps_cephes_LOG2EF;
|
|
fx += _ps_0p5;
|
|
|
|
/* how to perform a floorf with SSE: just below */
|
|
emm0 = _mm_cvttps_epi32(fx);
|
|
tmp = _mm_cvtepi32_ps(emm0);
|
|
|
|
/* if greater, substract 1 */
|
|
__m128 mask = _mm_cmpgt_ps(tmp, fx);
|
|
mask = _mm_and_ps(mask, one);
|
|
fx = tmp - mask;
|
|
|
|
tmp = fx * _ps_cephes_exp_C1;
|
|
__m128 z = fx * _ps_cephes_exp_C2;
|
|
x -= tmp;
|
|
x -= z;
|
|
|
|
z = x * x;
|
|
|
|
__m128 y = _ps_cephes_exp_p0;
|
|
y *= x;
|
|
y += _ps_cephes_exp_p1;
|
|
y *= x;
|
|
y += _ps_cephes_exp_p2;
|
|
y *= x;
|
|
y += _ps_cephes_exp_p3;
|
|
y *= x;
|
|
y += _ps_cephes_exp_p4;
|
|
y *= x;
|
|
y += _ps_cephes_exp_p5;
|
|
y *= z;
|
|
y += x;
|
|
y += one;
|
|
|
|
/* build 2^n */
|
|
emm0 = _mm_cvttps_epi32(fx);
|
|
|
|
emm0 = _mm_add_epi32(emm0, _pi32_0x7f);
|
|
emm0 = _mm_slli_epi32(emm0, 23);
|
|
__m128 pow2n = cast(__m128)emm0;
|
|
y *= pow2n;
|
|
return y;
|
|
}
|
|
|
|
/// Computes `base^exponent` for a single 32-bit float.
|
|
/// This is an approximation, valid up to approximately -100dB of accuracy
|
|
/// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
|
|
// #BONUS
|
|
float _mm_pow_ss(float base, float exponent) pure @safe
|
|
{
|
|
__m128 r = _mm_pow_ps(_mm_set1_ps(base), _mm_set1_ps(exponent));
|
|
return r.array[0];
|
|
}
|
|
|
|
/// Computes `base^exponent`, for 4 floats at once.
|
|
/// This is an approximation, valid up to approximately -100dB of accuracy
|
|
/// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
|
|
// #BONUS
|
|
__m128 _mm_pow_ps(__m128 base, __m128 exponents) pure @safe
|
|
{
|
|
return _mm_exp_ps(exponents * _mm_log_ps(base));
|
|
}
|
|
|
|
/// Computes `base^exponent`, for 4 floats at once.
|
|
/// This is an approximation, valid up to approximately -100dB of accuracy
|
|
/// IMPORTANT: NaN, zero, or infinity input not supported properly. x must be > 0 and finite.
|
|
// #BONUS
|
|
__m128 _mm_pow_ps(__m128 base, float exponent) pure @safe
|
|
{
|
|
return _mm_exp_ps(_mm_set1_ps(exponent) * _mm_log_ps(base));
|
|
}
|
|
|
|
unittest
|
|
{
|
|
import std.math;
|
|
|
|
bool approxEquals(double groundTruth, double approx, double epsilon) pure @trusted @nogc nothrow
|
|
{
|
|
if (!isFinite(groundTruth))
|
|
return true; // no need to approximate where this is NaN or infinite
|
|
|
|
if (groundTruth == 0) // the approximaton should produce zero too if needed
|
|
{
|
|
return approx == 0;
|
|
}
|
|
|
|
if (approx == 0)
|
|
{
|
|
// If the approximation produces zero, the error should be below 140 dB
|
|
return ( abs(groundTruth) < 1e-7 );
|
|
}
|
|
|
|
if ( ( abs(groundTruth / approx) - 1 ) >= epsilon)
|
|
{
|
|
import core.stdc.stdio;
|
|
debug printf("approxEquals (%g, %g, %g) failed\n", groundTruth, approx, epsilon);
|
|
debug printf("ratio is %f\n", abs(groundTruth / approx) - 1);
|
|
}
|
|
|
|
return ( abs(groundTruth / approx) - 1 ) < epsilon;
|
|
}
|
|
|
|
// test _mm_log_ps
|
|
for (double mantissa = 0.1; mantissa < 1.0; mantissa += 0.05)
|
|
{
|
|
foreach (exponent; -23..23)
|
|
{
|
|
double x = mantissa * 2.0 ^^ exponent;
|
|
double phobosValue = log(x);
|
|
__m128 v = _mm_log_ps(_mm_set1_ps(x));
|
|
foreach(i; 0..4)
|
|
assert(approxEquals(phobosValue, v.array[i], 1.1e-6));
|
|
}
|
|
}
|
|
|
|
// test _mm_exp_ps
|
|
for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1)
|
|
{
|
|
foreach (exponent; -23..23)
|
|
{
|
|
double x = mantissa * 2.0 ^^ exponent;
|
|
|
|
// don't test too high numbers because they saturate FP precision pretty fast
|
|
if (x > 50) continue;
|
|
|
|
double phobosValue = exp(x);
|
|
__m128 v = _mm_exp_ps(_mm_set1_ps(x));
|
|
foreach(i; 0..4)
|
|
{
|
|
if (!approxEquals(phobosValue, v.array[i], 3.4e-6))
|
|
{
|
|
import core.stdc.stdio;
|
|
printf("x = %f truth = %f vs estimate = %fn", x, phobosValue, v.array[i]);
|
|
assert(false);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// test than exp(-inf) is 0
|
|
{
|
|
__m128 R = _mm_exp_ps(_mm_set1_ps(-float.infinity));
|
|
float[4] correct = [0.0f, 0.0f, 0.0f, 0.0f];
|
|
assert(R.array == correct);
|
|
}
|
|
|
|
// test log baheviour with NaN and infinities
|
|
// the only guarantee for now is that _mm_log_ps(negative) yield a NaN
|
|
{
|
|
__m128 R = _mm_log_ps(_mm_setr_ps(+0.0f, -0.0f, -1.0f, float.nan));
|
|
// DOESN'T PASS
|
|
// assert(isInfinity(R[0]) && R[0] < 0); // log(+0.0f) = -infinity
|
|
// DOESN'T PASS
|
|
// assert(isInfinity(R[1]) && R[1] < 0); // log(-0.0f) = -infinity
|
|
assert(isNaN(R.array[2])); // log(negative number) = NaN
|
|
|
|
// DOESN'T PASS
|
|
//assert(isNaN(R[3])); // log(NaN) = NaN
|
|
}
|
|
|
|
|
|
// test _mm_pow_ps
|
|
for (double mantissa = -1.0; mantissa < 1.0; mantissa += 0.1)
|
|
{
|
|
foreach (exponent; -8..4)
|
|
{
|
|
double powExponent = mantissa * 2.0 ^^ exponent;
|
|
|
|
for (double mantissa2 = 0.1; mantissa2 < 1.0; mantissa2 += 0.1)
|
|
{
|
|
foreach (exponent2; -4..4)
|
|
{
|
|
double powBase = mantissa2 * 2.0 ^^ exponent2;
|
|
double phobosValue = pow(powBase, powExponent);
|
|
float fPhobos = phobosValue;
|
|
if (!isFinite(fPhobos)) continue;
|
|
__m128 v = _mm_pow_ps(_mm_set1_ps(powBase), _mm_set1_ps(powExponent));
|
|
|
|
foreach(i; 0..4)
|
|
{
|
|
if (!approxEquals(phobosValue, v.array[i], 1e-5))
|
|
{
|
|
printf("%g ^^ %g\n", powBase, powExponent);
|
|
assert(false);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
private:
|
|
|
|
static immutable __m128 _ps_1 = [1.0f, 1.0f, 1.0f, 1.0f];
|
|
static immutable __m128 _ps_0p5 = [0.5f, 0.5f, 0.5f, 0.5f];
|
|
static immutable __m128i _pi32_0x7f = [0x7f, 0x7f, 0x7f, 0x7f]; |