/**
* SSE4.1 intrinsics.
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1
*
* Copyright: Guillaume Piolat 2021.
*            Johan Engelen 2021.
*            cet 2024.
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module inteli.smmintrin;

// SSE4.1 instructions
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1
// Note: this header will work whether you have SSE4.1 enabled or not.
// With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively
// generate SSE4.1 instructions.
// With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions.

public import inteli.types;
import inteli.internals;

// smmintrin pulls in all previous instruction set intrinsics.
public import inteli.tmmintrin;

nothrow @nogc:
enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes
enum int _MM_FROUND_TO_NEG_INF = 0x01; /// ditto
enum int _MM_FROUND_TO_POS_INF = 0x02; /// ditto
enum int _MM_FROUND_TO_ZERO = 0x03; /// ditto
enum int _MM_FROUND_CUR_DIRECTION = 0x04; /// ditto
enum int _MM_FROUND_RAISE_EXC = 0x00; /// ditto
enum int _MM_FROUND_NO_EXC = 0x08; /// ditto

enum int _MM_FROUND_NINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT);
enum int _MM_FROUND_FLOOR = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
enum int _MM_FROUND_CEIL = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
enum int _MM_FROUND_TRUNC = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
enum int _MM_FROUND_RINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION);
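// Typical usage (illustrative): combine one rounding direction with an exception flag,
// e.g. `_mm_round_ps!(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)(v)` rounds each
// lane to the nearest integer without raising exceptions. The composite constants above
// are ready-made combinations; `_MM_FROUND_FLOOR`, for instance, is the value that
// `_mm_floor_ps` effectively passes down to `_mm_round_ps`.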
|
|
|
|
/// Add packed signed 32-bit integers in `a` and `b` using saturation.
|
|
/// #BONUS
|
|
__m128i _mm_adds_epi32(__m128i a, __m128i b) pure
|
|
{
|
|
// PERF: ARM64 should use 2x vqadd_s32
|
|
static if (LDC_with_saturated_intrinsics)
|
|
return cast(__m128i)inteli_llvm_adds!int4(cast(int4)a, cast(int4)b);
|
|
else
|
|
{
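        // Overflow detection: signed overflow can only occur when a and b share a sign
        // and the sum's sign differs, so `overflow` below has its sign bit set exactly
        // in the overflowing lanes. The saturated value is int.max when a >= 0, and
        // int.max + 1 (wrapping to int.min) when a < 0; _mm_blendv_ps then selects it
        // wherever the sign bit of `overflow` is set.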
|
|
__m128i int_max = _mm_set1_epi32(0x7FFFFFFF);
|
|
__m128i res = _mm_add_epi32(a, b);
|
|
__m128i sign_bit = _mm_srli_epi32(a, 31);
|
|
__m128i sign_xor = _mm_xor_si128(a, b);
|
|
__m128i overflow = _mm_andnot_si128(sign_xor, _mm_xor_si128(a, res));
|
|
__m128i saturated = _mm_add_epi32(int_max, sign_bit);
|
|
return cast(__m128i) _mm_blendv_ps(cast(__m128)res,
|
|
cast(__m128)saturated,
|
|
cast(__m128)overflow);
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i a = _mm_setr_epi32(int.max, 1, 2, int.min);
|
|
__m128i b = _mm_setr_epi32(1, 2, 3, -4);
|
|
assert(_mm_adds_epi32(a, b).array == [int.max, 3, 5, int.min]);
|
|
}
|
|
|
|
/// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results.
|
|
// Note: changed signature, GDC needs a compile-time value for imm8.
|
|
__m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) pure @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
pragma(inline, true); // else wouldn't inline in _mm256_blend_epi16
|
|
return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8);
|
|
}
|
|
else
|
|
{
|
|
// LDC x86 This generates pblendw since LDC 1.1 and -O2
|
|
short8 r;
|
|
short8 sa = cast(short8)a;
|
|
short8 sb = cast(short8)b;
|
|
for (int n = 0; n < 8; ++n)
|
|
{
|
|
r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n];
|
|
}
|
|
return cast(__m128i)r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
|
|
__m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
|
|
short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011
|
|
short[8] correct = [8, 9, 2, 3, 12, 5, 6, 15];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
|
|
/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`.
|
|
// Note: changed signature, GDC needs a compile-time value for `imm8`.
|
|
__m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted
|
|
{
|
|
static assert(imm8 >= 0 && imm8 < 4);
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8);
|
|
}
|
|
else
|
|
{
|
|
        // LDC x86: blendpd since LDC 1.1 -O2, uses blendps after LDC 1.12
|
|
double2 r;
|
|
for (int n = 0; n < 2; ++n)
|
|
{
|
|
r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
|
|
}
|
|
return cast(__m128d)r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128d A = _mm_setr_pd(0, 1);
|
|
__m128d B = _mm_setr_pd(8, 9);
|
|
double2 C = _mm_blend_pd!2(A, B);
|
|
double[2] correct = [0, 9];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
|
|
/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control
|
|
/// mask `imm8`.
|
|
// Note: changed signature, GDC needs a compile-time value for imm8.
|
|
__m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) pure @trusted
|
|
{
|
|
// PERF DMD
|
|
static assert(imm8 >= 0 && imm8 < 16);
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return __builtin_ia32_blendps(a, b, imm8);
|
|
}
|
|
else version(LDC)
|
|
{
|
|
// LDC x86: generates blendps since LDC 1.1 -O2
|
|
// arm64: pretty good, two instructions worst case
|
|
return shufflevectorLDC!(float4, (imm8 & 1) ? 4 : 0,
|
|
(imm8 & 2) ? 5 : 1,
|
|
(imm8 & 4) ? 6 : 2,
|
|
(imm8 & 8) ? 7 : 3)(a, b);
|
|
}
|
|
else
|
|
{
|
|
// PERF GDC without SSE4.1 is quite bad
|
|
__m128 r;
|
|
for (int n = 0; n < 4; ++n)
|
|
{
|
|
r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n];
|
|
}
|
|
return r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128 A = _mm_setr_ps(0, 1, 2, 3);
|
|
__m128 B = _mm_setr_ps(8, 9, 10, 11);
|
|
float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101
|
|
float[4] correct = [8, 1, 10, 11];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
/// Blend packed 8-bit integers from `a` and `b` using `mask`.
|
|
/// Select from `b` if the high-order bit of the corresponding 8-bit element in `mask` is set, else select from `a`.
|
|
__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) pure @trusted
|
|
{
|
|
// PERF DMD
|
|
/*static if (GDC_with_SSE41)
|
|
{
|
|
        // This intrinsic does nothing in GDC 12.
|
|
// TODO report to GDC. No problem in GCC.
|
|
return cast(__m128i) __builtin_ia32_pblendvb128 (cast(ubyte16)a, cast(ubyte16)b, cast(ubyte16)mask);
|
|
}
|
|
else*/
|
|
static if (LDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask);
|
|
}
|
|
else static if (LDC_with_ARM64)
|
|
{
|
|
// LDC arm64: two instructions since LDC 1.12 -O2
|
|
byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7);
|
|
return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a);
|
|
}
|
|
else
|
|
{
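        // m is 0xFF in lanes where the mask's sign bit is set. _mm_subs_epu8(a ^ b, m)
        // saturates to 0 in those lanes and leaves a ^ b elsewhere, so the final XOR
        // with b yields b where selected and a everywhere else.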
|
|
__m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask);
|
|
return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b);
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7,
|
|
8, 9, 10, 11, 12, 13, 14, 15);
|
|
__m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23,
|
|
24, 25, 26, 27, 28, 29, 30, 31);
|
|
__m128i M = _mm_setr_epi8( 1, -1, 1, 1, -4, 1, -8, 127,
|
|
1, 1, -1, -1, 4, 1, 8, -128);
|
|
byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M);
|
|
byte[16] correct = [ 0, 17, 2, 3, 20, 5, 22, 7,
|
|
8, 9, 26, 27, 12, 13, 14, 31 ];
|
|
assert(R.array == correct);
|
|
}
|
|
|
|
|
|
/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`.
|
|
__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE42)
|
|
{
|
|
        // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction
        // with -msse4.2 but not with -msse4.1; a replacement sequence is emitted instead.
        // Not sure of the reason. Sounds like a bug.
|
|
return __builtin_ia32_blendvpd(a, b, mask);
|
|
}
|
|
else static if (LDC_with_SSE41)
|
|
{
|
|
return __builtin_ia32_blendvpd(a, b, mask);
|
|
}
|
|
else static if (LDC_with_ARM64)
|
|
{
|
|
long2 shift;
|
|
shift = 63;
|
|
long2 lmask = cast(long2)mask >> shift;
|
|
return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a);
|
|
}
|
|
else
|
|
{
|
|
__m128d r; // PERF =void;
|
|
long2 lmask = cast(long2)mask;
|
|
for (int n = 0; n < 2; ++n)
|
|
{
|
|
r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
|
|
}
|
|
return r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128d A = _mm_setr_pd(1.0, 2.0);
|
|
__m128d B = _mm_setr_pd(3.0, 4.0);
|
|
__m128d M1 = _mm_setr_pd(-3.0, 2.0);
|
|
__m128d R1 = _mm_blendv_pd(A, B, M1);
|
|
double[2] correct1 = [3.0, 2.0];
|
|
assert(R1.array == correct1);
|
|
|
|
// Note: wouldn't work with -double.nan, since in some AArch64 archs the NaN sign bit is lost
|
|
// See Issue #78
|
|
__m128d M2 = _mm_setr_pd(double.nan, double.infinity);
|
|
__m128d R2 = _mm_blendv_pd(A, B, M2);
|
|
double[2] correct2 = [1.0, 2.0];
|
|
assert(R2.array == correct2);
|
|
}
|
|
|
|
|
|
/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
|
|
__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) pure @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return __builtin_ia32_blendvps(a, b, mask);
|
|
}
|
|
else static if (LDC_with_SSE41)
|
|
{
|
|
return __builtin_ia32_blendvps(a, b, mask);
|
|
}
|
|
else static if (LDC_with_ARM64)
|
|
{
|
|
int4 shift;
|
|
shift = 31;
|
|
int4 lmask = cast(int4)mask >> shift;
|
|
return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a);
|
|
}
|
|
else
|
|
{
|
|
// LDC x86_64: Compiles to 5 instr since LDC 1.27 -O2
|
|
// If lack of optimization, consider replacing by:
|
|
// __m128i overflow_mask = _mm_srai_epi32(overflow, 31);
|
|
// return _mm_or_si128(
|
|
// _mm_and_si128(overflow_mask, saturated),
|
|
// _mm_andnot_si128(overflow_mask, res)
|
|
// LLVM makes almost the same sequence when optimized.
|
|
__m128 r;
|
|
int4 lmask = cast(int4)mask;
|
|
for (int n = 0; n < 4; ++n)
|
|
{
|
|
r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
|
|
}
|
|
return r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128 A = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f);
|
|
__m128 B = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f);
|
|
__m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f);
|
|
__m128 M2 = _mm_setr_ps(float.nan, float.nan, -0.0f, +0.0f);
|
|
__m128 R1 = _mm_blendv_ps(A, B, M1);
|
|
__m128 R2 = _mm_blendv_ps(A, B, M2);
|
|
float[4] correct1 = [ 4.0f, 1.0f, 2.0f, 7.0f];
|
|
float[4] correct2 = [ 0.0f, 1.0f, 6.0f, 3.0f];
|
|
assert(R1.array == correct1);
|
|
|
|
// Note: wouldn't work with -float.nan, since in some AArch64 archs the NaN sign bit is lost
|
|
// See Issue #78
|
|
assert(R2.array == correct2);
|
|
}
|
|
|
|
/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value,
|
|
/// and store the results as packed double-precision floating-point elements.
|
|
__m128d _mm_ceil_pd (__m128d a) @trusted
|
|
{
|
|
static if (LDC_with_ARM64)
|
|
{
|
|
// LDC arm64 acceptable since 1.8 -O2
|
|
// Unfortunately x86 intrinsics force a round-trip back to double2
|
|
// ARM neon semantics wouldn't have that
|
|
long2 l = vcvtpq_s64_f64(a);
|
|
double2 r;
|
|
r.ptr[0] = l.array[0];
|
|
r.ptr[1] = l.array[1];
|
|
return r;
|
|
}
|
|
else
|
|
{
|
|
return _mm_round_pd!2(a);
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128d A = _mm_setr_pd(1.3f, -2.12f);
|
|
__m128d B = _mm_setr_pd(53.6f, -2.7f);
|
|
A = _mm_ceil_pd(A);
|
|
B = _mm_ceil_pd(B);
|
|
double[2] correctA = [2.0, -2.0];
|
|
double[2] correctB = [54.0, -2.0];
|
|
assert(A.array == correctA);
|
|
assert(B.array == correctB);
|
|
}
|
|
|
|
/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value,
|
|
/// and store the results as packed single-precision floating-point elements.
|
|
__m128 _mm_ceil_ps (__m128 a) @trusted
|
|
{
|
|
static if (LDC_with_ARM64)
|
|
{
|
|
// LDC arm64 acceptable since 1.8 -O1
|
|
int4 l = vcvtpq_s32_f32(a);
|
|
float4 r;
|
|
r.ptr[0] = l.array[0];
|
|
r.ptr[1] = l.array[1];
|
|
r.ptr[2] = l.array[2];
|
|
r.ptr[3] = l.array[3];
|
|
return r;
|
|
}
|
|
else
|
|
{
|
|
return _mm_round_ps!2(a);
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
|
|
__m128 C = _mm_ceil_ps(A);
|
|
float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
/// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value,
|
|
/// store the result as a double-precision floating-point element in the lower element of result,
|
|
/// and copy the upper element from `a` to the upper element of dst.
|
|
__m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted
|
|
{
|
|
static if (LDC_with_ARM64)
|
|
{
|
|
a[0] = vcvtps_s64_f64(b[0]);
|
|
return a;
|
|
}
|
|
else
|
|
{
|
|
return _mm_round_sd!2(a, b);
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128d A = _mm_setr_pd(1.3, -2.12);
|
|
__m128d B = _mm_setr_pd(53.6, -3.7);
|
|
__m128d C = _mm_ceil_sd(A, B);
|
|
double[2] correct = [54.0, -2.12];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
/// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value,
|
|
/// store the result as a single-precision floating-point element in the lower element of result,
|
|
/// and copy the upper 3 packed elements from `a` to the upper elements of result.
|
|
__m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted
|
|
{
|
|
static if (LDC_with_ARM64)
|
|
{
|
|
a[0] = vcvtps_s32_f32(b[0]);
|
|
return a;
|
|
}
|
|
else
|
|
{
|
|
return _mm_round_ss!2(a, b);
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
|
|
__m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f);
|
|
__m128 C = _mm_ceil_ss(A, B);
|
|
float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
/// Compare packed 64-bit integers in `a` and `b` for equality.
|
|
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted
|
|
{
|
|
static if (SIMD_COMPARISON_MASKS_16B)
|
|
{
|
|
version(DigitalMars)
|
|
{
|
|
// DMD doesn't recognize long2 == long2
|
|
long2 la = cast(long2)a;
|
|
long2 lb = cast(long2)b;
|
|
long2 res;
|
|
res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
|
|
res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
|
|
return cast(__m128i)res;
|
|
}
|
|
else
|
|
{
|
|
return cast(__m128i)(cast(long2)a == cast(long2)b);
|
|
}
|
|
}
|
|
else static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
|
|
}
|
|
else version(LDC)
|
|
{
|
|
// LDC x86: generates pcmpeqq since LDC 1.1 -O1
|
|
// arm64: generates cmeq since LDC 1.8 -O1
|
|
return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b);
|
|
}
|
|
else
|
|
{
|
|
// Clever pcmpeqd + pand use with LDC 1.24 -O2
|
|
long2 la = cast(long2)a;
|
|
long2 lb = cast(long2)b;
|
|
long2 res;
|
|
res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
|
|
res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
|
|
return cast(__m128i)res;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi64(-1, -2);
|
|
__m128i B = _mm_setr_epi64(-3, -2);
|
|
__m128i C = _mm_setr_epi64(-1, -4);
|
|
long2 AB = cast(long2) _mm_cmpeq_epi64(A, B);
|
|
long2 AC = cast(long2) _mm_cmpeq_epi64(A, C);
|
|
long[2] correct1 = [0, -1];
|
|
long[2] correct2 = [-1, 0];
|
|
assert(AB.array == correct1);
|
|
assert(AC.array == correct2);
|
|
}
|
|
|
|
|
|
/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
|
|
__m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
|
|
}
|
|
else static if (LDC_with_optimizations)
|
|
{
|
|
// LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
|
|
enum ir = `
|
|
%v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1,i32 2, i32 3>
|
|
%r = sext <4 x i16> %v to <4 x i32>
|
|
ret <4 x i32> %r`;
|
|
        return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
|
|
}
|
|
else
|
|
{
|
|
short8 sa = cast(short8)a;
|
|
int4 r;
|
|
r.ptr[0] = sa.array[0];
|
|
r.ptr[1] = sa.array[1];
|
|
r.ptr[2] = sa.array[2];
|
|
r.ptr[3] = sa.array[3];
|
|
return r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
|
|
int4 C = cast(int4) _mm_cvtepi16_epi32(A);
|
|
int[4] correct = [-1, 0, -32768, 32767];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
|
|
__m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
|
|
}
|
|
else static if (LDC_with_optimizations)
|
|
{
|
|
// LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
|
|
enum ir = `
|
|
%v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
|
|
%r = sext <2 x i16> %v to <2 x i64>
|
|
ret <2 x i64> %r`;
|
|
return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
|
|
}
|
|
else
|
|
{
|
|
short8 sa = cast(short8)a;
|
|
long2 r;
|
|
r.ptr[0] = sa.array[0];
|
|
r.ptr[1] = sa.array[1];
|
|
return cast(__m128i)r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
|
|
long2 C = cast(long2) _mm_cvtepi16_epi64(A);
|
|
long[2] correct = [-32768, 32767];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
|
|
__m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a);
|
|
}
|
|
else static if (LDC_with_optimizations)
|
|
{
|
|
// LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
|
|
enum ir = `
|
|
%v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
|
|
%r = sext <2 x i32> %v to <2 x i64>
|
|
ret <2 x i64> %r`;
|
|
return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a);
|
|
}
|
|
else
|
|
{
|
|
int4 sa = cast(int4)a;
|
|
long2 r;
|
|
r.ptr[0] = sa.array[0];
|
|
r.ptr[1] = sa.array[1];
|
|
return cast(__m128i)r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi32(-4, 42, 0, 0);
|
|
long2 C = cast(long2) _mm_cvtepi32_epi64(A);
|
|
long[2] correct = [-4, 42];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
|
|
/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
|
|
__m128i _mm_cvtepi8_epi16 (__m128i a) pure @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
alias ubyte16 = __vector(ubyte[16]);
|
|
return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a);
|
|
}
|
|
else static if (LDC_with_optimizations)
|
|
{
|
|
// LDC x86: pmovsxbw generated since LDC 1.1.0 -O0
|
|
// LDC ARM64: sshll generated since LDC 1.8.0 -O1
|
|
enum ir = `
|
|
%v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
|
%r = sext <8 x i8> %v to <8 x i16>
|
|
ret <8 x i16> %r`;
|
|
return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
|
|
}
|
|
else
|
|
{
|
|
byte16 sa = cast(byte16)a;
|
|
short8 r;
|
|
foreach(n; 0..8)
|
|
r.ptr[n] = sa.array[n];
|
|
return cast(__m128i)r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
short8 C = cast(short8) _mm_cvtepi8_epi16(A);
|
|
short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
|
|
/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
|
|
__m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
alias ubyte16 = __vector(ubyte[16]);
|
|
return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a);
|
|
}
|
|
else static if (LDC_with_SSE41 && LDC_with_optimizations)
|
|
{
|
|
// LDC x86: Generates pmovsxbd since LDC 1.1 -O0
|
|
enum ir = `
|
|
%v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
|
%r = sext <4 x i8> %v to <4 x i32>
|
|
ret <4 x i32> %r`;
|
|
return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
|
|
}
|
|
else
|
|
{
|
|
        // LDC ARM64: this gives the same codegen as a vmovl_s16/vmovl_s8 sequence would
|
|
byte16 sa = cast(byte16)a;
|
|
int4 r;
|
|
r.ptr[0] = sa.array[0];
|
|
r.ptr[1] = sa.array[1];
|
|
r.ptr[2] = sa.array[2];
|
|
r.ptr[3] = sa.array[3];
|
|
return cast(__m128i)r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
int4 C = cast(int4) _mm_cvtepi8_epi32(A);
|
|
int[4] correct = [127, -128, 1, -1];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
|
|
/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
|
|
__m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
alias ubyte16 = __vector(ubyte[16]);
|
|
return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a);
|
|
}
|
|
else static if (LDC_with_optimizations)
|
|
{
|
|
// LDC x86: Generates pmovsxbq since LDC 1.1 -O0,
|
|
// LDC arm64: it's ok since LDC 1.8 -O1
|
|
enum ir = `
|
|
%v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
|
|
%r = sext <2 x i8> %v to <2 x i64>
|
|
ret <2 x i64> %r`;
|
|
return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
|
|
}
|
|
else
|
|
{
|
|
byte16 sa = cast(byte16)a;
|
|
long2 r;
|
|
foreach(n; 0..2)
|
|
r.ptr[n] = sa.array[n];
|
|
return cast(__m128i)r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
long2 C = cast(long2) _mm_cvtepi8_epi64(A);
|
|
long[2] correct = [127, -128];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
|
|
/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
|
|
__m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
|
|
}
|
|
else
|
|
{
|
|
// LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
|
|
// arm64: ushll since LDC 1.12 -O1
|
|
short8 sa = cast(short8)a;
|
|
int4 r;
|
|
r.ptr[0] = cast(ushort)sa.array[0];
|
|
r.ptr[1] = cast(ushort)sa.array[1];
|
|
r.ptr[2] = cast(ushort)sa.array[2];
|
|
r.ptr[3] = cast(ushort)sa.array[3];
|
|
return cast(__m128i)r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
|
|
int4 C = cast(int4) _mm_cvtepu16_epi32(A);
|
|
int[4] correct = [65535, 0, 32768, 32767];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
|
|
/// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
|
|
__m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
|
|
}
|
|
else static if (LDC_with_ARM64)
|
|
{
|
|
// LDC arm64: a bit shorter than below, in -O2
|
|
short8 sa = cast(short8)a;
|
|
long2 r;
|
|
for(int n = 0; n < 2; ++n)
|
|
r.ptr[n] = cast(ushort)sa.array[n];
|
|
return cast(__m128i)r;
|
|
}
|
|
else
|
|
{
|
|
// LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
|
|
short8 sa = cast(short8)a;
|
|
long2 r;
|
|
r.ptr[0] = cast(ushort)sa.array[0];
|
|
r.ptr[1] = cast(ushort)sa.array[1];
|
|
return cast(__m128i)r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
|
|
long2 C = cast(long2) _mm_cvtepu16_epi64(A);
|
|
long[2] correct = [65535, 0];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
|
|
/// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
|
|
__m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
        return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
|
|
}
|
|
else
|
|
{
|
|
// LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1
|
|
// arm64: generates ushll since LDC 1.12 -O1
|
|
int4 sa = cast(int4)a;
|
|
long2 r;
|
|
r.ptr[0] = cast(uint)sa.array[0];
|
|
r.ptr[1] = cast(uint)sa.array[1];
|
|
return cast(__m128i)r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi32(-1, 42, 0, 0);
|
|
long2 C = cast(long2) _mm_cvtepu32_epi64(A);
|
|
long[2] correct = [4294967295, 42];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
|
|
/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
|
|
__m128i _mm_cvtepu8_epi16 (__m128i a) pure @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
|
|
}
|
|
else static if (LDC_with_optimizations)
|
|
{
|
|
enum ir = `
|
|
%v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1,i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
|
%r = zext <8 x i8> %v to <8 x i16>
|
|
ret <8 x i16> %r`;
|
|
return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
|
|
}
|
|
else
|
|
{
|
|
return _mm_unpacklo_epi8(a, _mm_setzero_si128());
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
short8 C = cast(short8) _mm_cvtepu8_epi16(A);
|
|
short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
|
|
/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
|
|
__m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
alias ubyte16 = __vector(ubyte[16]);
|
|
return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
|
|
}
|
|
else static if (LDC_with_ARM64)
|
|
{
|
|
// LDC arm64: a bit better than below in -O2
|
|
byte16 sa = cast(byte16)a;
|
|
int4 r;
|
|
for(int n = 0; n < 4; ++n)
|
|
r.ptr[n] = cast(ubyte)sa.array[n];
|
|
return cast(__m128i)r;
|
|
}
|
|
else
|
|
{
|
|
// LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1
|
|
// PERF: catastrophic with GDC without SSE4.1
|
|
byte16 sa = cast(byte16)a;
|
|
int4 r;
|
|
r.ptr[0] = cast(ubyte)sa.array[0];
|
|
r.ptr[1] = cast(ubyte)sa.array[1];
|
|
r.ptr[2] = cast(ubyte)sa.array[2];
|
|
r.ptr[3] = cast(ubyte)sa.array[3];
|
|
return cast(__m128i)r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
int4 C = cast(int4) _mm_cvtepu8_epi32(A);
|
|
int[4] correct = [127, 128, 1, 255];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
|
|
__m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
alias ubyte16 = __vector(ubyte[16]);
|
|
return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a);
|
|
}
|
|
else static if (LDC_with_ARM64)
|
|
{
|
|
// LDC arm64: this optimizes better than the loop below
|
|
byte16 sa = cast(byte16)a;
|
|
long2 r;
|
|
for (int n = 0; n < 2; ++n)
|
|
r.ptr[n] = cast(ubyte)sa.array[n];
|
|
return cast(__m128i)r;
|
|
}
|
|
else
|
|
{
|
|
// LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
|
|
byte16 sa = cast(byte16)a;
|
|
long2 r;
|
|
r.ptr[0] = cast(ubyte)sa.array[0];
|
|
r.ptr[1] = cast(ubyte)sa.array[1];
|
|
return cast(__m128i)r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
long2 C = cast(long2) _mm_cvtepu8_epi64(A);
|
|
long[2] correct = [127, 254];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
/// Conditionally multiply the packed double-precision (64-bit) floating-point elements
|
|
/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, and conditionally
|
|
/// store the sum in dst using the low 4 bits of `imm8`.
|
|
__m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return __builtin_ia32_dppd(a, b, imm8 & 0x33);
|
|
}
|
|
else static if (LDC_with_SSE41)
|
|
{
|
|
return __builtin_ia32_dppd(a, b, imm8 & 0x33);
|
|
}
|
|
else
|
|
{
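        // Fallback: imm8[5:4] selects which products of a*b contribute (blend against
        // zero), the two lanes are summed, and imm8[1:0] acts as a write mask when the
        // broadcast sum is blended back against zero.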
|
|
__m128d zero = _mm_setzero_pd();
|
|
__m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b);
|
|
double sum = temp.array[0] + temp.array[1];
|
|
return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum));
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128d A = _mm_setr_pd(1.0, 2.0);
|
|
__m128d B = _mm_setr_pd(4.0, 8.0);
|
|
double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B);
|
|
double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B);
|
|
double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B);
|
|
double[2] correct1 = [ 4.0, 4.0];
|
|
double[2] correct2 = [16.0, 0.0];
|
|
double[2] correct3 = [ 0.0, 20.0];
|
|
assert(R1.array == correct1);
|
|
assert(R2.array == correct2);
|
|
assert(R3.array == correct3);
|
|
}
|
|
|
|
/// Conditionally multiply the packed single-precision (32-bit) floating-point elements
|
|
/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products,
|
|
/// and conditionally store the sum in result using the low 4 bits of `imm8`.
|
|
__m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return __builtin_ia32_dpps(a, b, cast(ubyte)imm8);
|
|
}
|
|
else static if (LDC_with_SSE41)
|
|
{
|
|
return __builtin_ia32_dpps(a, b, cast(byte)imm8);
|
|
}
|
|
else
|
|
{
|
|
__m128 zero = _mm_setzero_ps();
|
|
__m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b);
|
|
float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3];
|
|
return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum));
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f);
|
|
__m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f);
|
|
float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B);
|
|
float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B);
|
|
float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B);
|
|
float[4] correct1 = [67.0f, 67.0f, 67.0f, 67.0f];
|
|
float[4] correct2 = [23.0f, 0.0f, 23.0f, 0.0f];
|
|
float[4] correct3 = [0.0f, 29.0f, 0.0f, 29.0f];
|
|
assert(R1.array == correct1);
|
|
assert(R2.array == correct2);
|
|
assert(R3.array == correct3);
|
|
}
|
|
|
|
|
|
/// Extract a 32-bit integer from `a`, selected with `imm8`.
|
|
int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted
|
|
{
|
|
return (cast(int4)a).array[imm8 & 3];
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi32(1, 2, 3, 4);
|
|
assert(_mm_extract_epi32(A, 0) == 1);
|
|
assert(_mm_extract_epi32(A, 1 + 8) == 2);
|
|
assert(_mm_extract_epi32(A, 3 + 4) == 4);
|
|
}
|
|
|
|
/// Extract a 64-bit integer from `a`, selected with `imm8`.
|
|
long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted
|
|
{
|
|
long2 la = cast(long2)a;
|
|
return la.array[imm8 & 1];
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi64(45, -67);
|
|
assert(_mm_extract_epi64(A, 0) == 45);
|
|
assert(_mm_extract_epi64(A, 1) == -67);
|
|
assert(_mm_extract_epi64(A, 2) == 45);
|
|
}
|
|
|
|
/// Extract an 8-bit integer from `a`, selected with `imm8`.
|
|
/// Warning: the returned value is zero-extended to 32-bits.
|
|
int _mm_extract_epi8 (__m128i a, const int imm8) @trusted
|
|
{
|
|
byte16 ba = cast(byte16)a;
|
|
return cast(ubyte) ba.array[imm8 & 15];
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15);
|
|
assert(_mm_extract_epi8(A, 7) == 7);
|
|
assert(_mm_extract_epi8(A, 13) == 255);
|
|
assert(_mm_extract_epi8(A, 7 + 16) == 7);
|
|
}
|
|
|
|
/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`.
|
|
/// Note: returns a 32-bit $(I integer).
|
|
int _mm_extract_ps (__m128 a, const int imm8) @trusted
|
|
{
|
|
return (cast(int4)a).array[imm8 & 3];
|
|
}
|
|
unittest
|
|
{
|
|
__m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
|
|
assert(_mm_extract_ps(A, 0) == 0x3f800000);
|
|
assert(_mm_extract_ps(A, 1 + 8) == 0x40000000);
|
|
assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000);
|
|
}
|
|
|
|
|
|
|
|
/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an
|
|
/// integer value, and store the results as packed double-precision floating-point elements.
|
|
__m128d _mm_floor_pd (__m128d a) @trusted
|
|
{
|
|
static if (LDC_with_ARM64)
|
|
{
|
|
// LDC arm64 acceptable since 1.8 -O2
|
|
long2 l = vcvtmq_s64_f64(a);
|
|
double2 r;
|
|
r.ptr[0] = l.array[0];
|
|
r.ptr[1] = l.array[1];
|
|
return r;
|
|
}
|
|
else
|
|
{
|
|
return _mm_round_pd!1(a);
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128d A = _mm_setr_pd(1.3f, -2.12f);
|
|
__m128d B = _mm_setr_pd(53.6f, -2.7f);
|
|
A = _mm_floor_pd(A);
|
|
B = _mm_floor_pd(B);
|
|
double[2] correctA = [1.0, -3.0];
|
|
double[2] correctB = [53.0, -3.0];
|
|
assert(A.array == correctA);
|
|
assert(B.array == correctB);
|
|
}
|
|
|
|
/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an
|
|
/// integer value, and store the results as packed single-precision floating-point elements.
|
|
__m128 _mm_floor_ps (__m128 a) @trusted
|
|
{
|
|
static if (LDC_with_ARM64)
|
|
{
|
|
// LDC arm64 acceptable since 1.8 -O1
|
|
int4 l = vcvtmq_s32_f32(a);
|
|
float4 r;
|
|
r.ptr[0] = l.array[0];
|
|
r.ptr[1] = l.array[1];
|
|
r.ptr[2] = l.array[2];
|
|
r.ptr[3] = l.array[3];
|
|
return r;
|
|
}
|
|
else
|
|
{
|
|
return _mm_round_ps!1(a);
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
|
|
__m128 C = _mm_floor_ps(A);
|
|
float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
/// Round the lower double-precision (64-bit) floating-point element in `b` down to an
|
|
/// integer value, store the result as a double-precision floating-point element in the
|
|
/// lower element, and copy the upper element from `a` to the upper element.
|
|
__m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
|
|
{
|
|
static if (LDC_with_ARM64)
|
|
{
|
|
a[0] = vcvtms_s64_f64(b[0]);
|
|
return a;
|
|
}
|
|
else
|
|
{
|
|
return _mm_round_sd!1(a, b);
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128d A = _mm_setr_pd(1.3, -2.12);
|
|
__m128d B = _mm_setr_pd(-53.1, -3.7);
|
|
__m128d C = _mm_floor_sd(A, B);
|
|
double[2] correct = [-54.0, -2.12];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
/// Round the lower single-precision (32-bit) floating-point element in `b` down to an
|
|
/// integer value, store the result as a single-precision floating-point element in the
|
|
/// lower element, and copy the upper 3 packed elements from `a` to the upper elements.
|
|
__m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
|
|
{
|
|
static if (LDC_with_ARM64)
|
|
{
|
|
a[0] = vcvtms_s32_f32(b[0]);
|
|
return a;
|
|
}
|
|
else
|
|
{
|
|
return _mm_round_ss!1(a, b);
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
|
|
__m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f);
|
|
__m128 C = _mm_floor_ss(A, B);
|
|
float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
/// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`.
|
|
__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted
|
|
{
|
|
// GDC: nothing special to do, pinsrd generated with -O1 -msse4.1
|
|
    // LDC x86: pinsrd since LDC 1.1 -O2 with -mattr=+sse4.1
|
|
// LDC arm64: ins.s since LDC 1.8 -O2
|
|
int4 ia = cast(int4)a;
|
|
ia.ptr[imm8 & 3] = i;
|
|
return cast(__m128i)ia;
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi32(1, 2, 3, 4);
|
|
int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4);
|
|
int[4] result = [1, 2, 5, 4];
|
|
assert(C.array == result);
|
|
}
|
|
|
|
/// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`.
|
|
__m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
|
|
{
|
|
    // GDC: nothing special to do, pinsrq generated with -O1 -msse4.1
|
|
// LDC x86: always do something sensible.
|
|
long2 la = cast(long2)a;
|
|
la.ptr[imm8 & 1] = i;
|
|
return cast(__m128i)la;
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi64(1, 2);
|
|
long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2);
|
|
long[2] result = [1, 5];
|
|
assert(C.array == result);
|
|
}
|
|
|
|
/// Insert the 8-bit integer `i` into `a` at the location specified by `imm8[2:0]`.
|
|
/// Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8.
|
|
__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
|
|
{
|
|
// GDC: nothing special to do, pinsrb generated with -O1 -msse4.1
|
|
// LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory.
|
|
byte16 ba = cast(byte16)a;
|
|
ba.ptr[imm8 & 15] = cast(byte)i;
|
|
return cast(__m128i)ba;
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
|
byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16);
|
|
byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
|
|
assert(C.array == result);
|
|
}
|
|
|
|
|
|
/// Warning: of course it does something totally different from `_mm_insert_epi32`!
|
|
/// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b`
|
|
/// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]`
|
|
/// (elements are zeroed out when the corresponding bit is set).
|
|
__m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return __builtin_ia32_insertps128(a, b, cast(ubyte)imm8);
|
|
}
|
|
else static if (LDC_with_SSE41)
|
|
{
|
|
return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
|
|
}
|
|
else
|
|
{
|
|
float4 tmp2 = a;
|
|
float tmp1 = b.array[(imm8 >> 6) & 3];
|
|
tmp2.ptr[(imm8 >> 4) & 3] = tmp1;
|
|
return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps());
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
|
|
__m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
|
|
__m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B);
|
|
float[4] correct = [1.0f, 2.0f, 0.0f, 7.0f];
|
|
assert(C.array == correct);
|
|
}
|
|
|
|
|
|
/// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values.
|
|
__m128i _mm_max_epi32 (__m128i a, __m128i b) pure @trusted
|
|
{
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
|
|
}
|
|
else version(LDC)
|
|
{
|
|
// x86: pmaxsd since LDC 1.1 -O1
|
|
        // ARM: smax.4s since LDC 1.8 -O1
|
|
int4 sa = cast(int4)a;
|
|
int4 sb = cast(int4)b;
|
|
static if (SIMD_COMPARISON_MASKS_16B)
|
|
int4 greater = sa > sb;
|
|
else
|
|
int4 greater = greaterMask!int4(sa, sb);
|
|
return cast(__m128i)( (greater & sa) | (~greater & sb) );
|
|
}
|
|
else
|
|
{
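        // Branchless select: `higher` is all-ones where a > b, so b ^ ((a ^ b) & higher)
        // gives a in those lanes and b in the others.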
|
|
__m128i higher = _mm_cmpgt_epi32(a, b);
|
|
__m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
|
|
__m128i mask = _mm_and_si128(aTob, higher);
|
|
return _mm_xor_si128(b, mask);
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7),
|
|
_mm_setr_epi32( -4,-8, 9, -8));
|
|
int[4] correct = [0x7fffffff, 1, 9, 7];
|
|
assert(R.array == correct);
|
|
}
|
|
|
|
/// Compare packed signed 8-bit integers in `a` and `b`,
|
|
/// and return packed maximum values.
|
|
__m128i _mm_max_epi8 (__m128i a, __m128i b) pure @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b);
|
|
}
|
|
else version(LDC)
|
|
{
|
|
// x86: pmaxsb since LDC 1.1 -O1
|
|
// ARM64: smax.16b since LDC 1.8.0 -O1
|
|
byte16 sa = cast(byte16)a;
|
|
byte16 sb = cast(byte16)b;
|
|
static if (SIMD_COMPARISON_MASKS_16B)
|
|
byte16 greater = sa > sb;
|
|
else
|
|
byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
|
|
return cast(__m128i)( (greater & sa) | (~greater & sb) );
|
|
}
|
|
else
|
|
{
|
|
__m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else
|
|
__m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
|
|
__m128i mask = _mm_and_si128(aTob, lower);
|
|
return _mm_xor_si128(b, mask);
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
__m128i B = _mm_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
byte16 R = cast(byte16) _mm_max_epi8(A, B);
|
|
byte[16] correct = [127, 1, 9, -7, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0];
|
|
assert(R.array == correct);
|
|
}
|
|
|
|
/// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values.
|
|
__m128i _mm_max_epu16 (__m128i a, __m128i b) pure @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b);
|
|
}
|
|
else version(LDC)
|
|
{
|
|
// x86: pmaxuw since LDC 1.1 -O1
|
|
// ARM64: umax.8h since LDC 1.8.0 -O1
|
|
// PERF: without sse4.1, LLVM 12 produces a very interesting
|
|
// psubusw xmm0, xmm1
|
|
// paddw xmm0, xmm1
|
|
// sequence that maybe should go in other min/max intrinsics?
|
|
ushort8 sa = cast(ushort8)a;
|
|
ushort8 sb = cast(ushort8)b;
|
|
static if (SIMD_COMPARISON_MASKS_16B)
|
|
{
|
|
// Note: doesn't work well with GDC, which prefers the builtin.
|
|
ushort8 greater = sa > sb;
|
|
}
|
|
else
|
|
ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb);
|
|
return cast(__m128i)( (greater & sa) | (~greater & sb) );
|
|
}
|
|
else
|
|
{
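        // Unsigned max without a compare: _mm_subs_epu16(b, a) saturates to 0 when a >= b
        // and is b - a otherwise, so adding a back yields max(a, b) in every lane.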
|
|
b = _mm_subs_epu16(b, a);
|
|
b = _mm_add_epi16(b, a);
|
|
return b;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57),
|
|
_mm_setr_epi16( -4, -8, 9, -7, 0,-32768, 0, 0));
|
|
short[8] correct = [ -4, -8, -4, -7, 9,-32768, 0, 57];
|
|
assert(R.array == correct);
|
|
}
|
|
|
|
/// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values.
|
|
__m128i _mm_max_epu32 (__m128i a, __m128i b) pure @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
|
|
}
|
|
else version(LDC)
|
|
{
|
|
// x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
|
|
// ARM64: umax.4s since LDC 1.8.0 -O1
|
|
uint4 sa = cast(uint4)a;
|
|
uint4 sb = cast(uint4)b;
|
|
static if (SIMD_COMPARISON_MASKS_16B)
|
|
uint4 greater = sa > sb;
|
|
else
|
|
uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
|
|
return cast(__m128i)( (greater & sa) | (~greater & sb) );
|
|
}
|
|
else
|
|
{
|
|
// PERF: LLVM suggests to replace the _mm_add_epi32 by _mm_xor_si128, and the last xor by an "_mm_or_si128"
|
|
/+
|
|
movdqa xmm2, xmmword ptr [-0x80000000, -0x80000000, -0x80000000, -0x80000000]
|
|
movdqa xmm3, xmm1
|
|
pxor xmm3, xmm2
|
|
pxor xmm2, xmm0
|
|
pcmpgtd xmm2, xmm3
|
|
pand xmm0, xmm2
|
|
pandn xmm2, xmm1
|
|
por xmm0, xmm2
|
|
+/
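        // Unsigned compare emulated with signed pcmpgtd: biasing both operands by
        // 0x80000000 flips the sign bit, which orders the biased values as unsigned.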
|
|
__m128i valueShift = _mm_set1_epi32(-0x80000000);
|
|
__m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
|
|
__m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
|
|
__m128i mask = _mm_and_si128(aTob, higher);
|
|
return _mm_xor_si128(b, mask);
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7),
|
|
_mm_setr_epi32( -4,-8, 9, -8));
|
|
int[4] correct = [ -4,-8, 9, -7];
|
|
assert(R.array == correct);
|
|
}
|
|
|
|
/// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values.
|
|
__m128i _mm_min_epi32 (__m128i a, __m128i b) pure @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
|
|
}
|
|
else version(LDC)
|
|
{
|
|
// x86: pminsd since LDC 1.1 -O1, also good without sse4.1
|
|
        // ARM: smin.4s since LDC 1.8 -O1
|
|
int4 sa = cast(int4)a;
|
|
int4 sb = cast(int4)b;
|
|
static if (SIMD_COMPARISON_MASKS_16B)
|
|
int4 greater = sa > sb;
|
|
else
|
|
int4 greater = greaterMask!int4(sa, sb);
|
|
return cast(__m128i)( (~greater & sa) | (greater & sb) );
|
|
}
|
|
else
|
|
{
|
|
__m128i higher = _mm_cmplt_epi32(a, b);
|
|
__m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
|
|
__m128i mask = _mm_and_si128(aTob, higher);
|
|
return _mm_xor_si128(b, mask);
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7),
|
|
_mm_setr_epi32( -4, -8, 9, -8));
|
|
int[4] correct = [ -4, -8, -4, -8];
|
|
assert(R.array == correct);
|
|
}
|
|
|
|
/// Compare packed signed 8-bit integers in `a` and `b`,
|
|
/// and return packed minimum values.
|
|
__m128i _mm_min_epi8 (__m128i a, __m128i b) pure @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b);
|
|
}
|
|
else version(LDC)
|
|
{
|
|
// x86: pminsb since LDC 1.1 -O1
|
|
// ARM64: smin.16b since LDC 1.8.0 -O1
|
|
byte16 sa = cast(byte16)a;
|
|
byte16 sb = cast(byte16)b;
|
|
static if (SIMD_COMPARISON_MASKS_16B)
|
|
byte16 greater = sa > sb;
|
|
else
|
|
byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
|
|
return cast(__m128i)( (~greater & sa) | (greater & sb) );
|
|
}
|
|
else
|
|
{
|
|
__m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else
|
|
__m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
|
|
__m128i mask = _mm_and_si128(aTob, lower);
|
|
return _mm_xor_si128(b, mask);
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
__m128i B = _mm_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
byte16 R = cast(byte16) _mm_min_epi8(A, B);
|
|
byte[16] correct = [ 4, -8, -4, -8, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
|
|
assert(R.array == correct);
|
|
}
|
|
|
|
/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
|
|
__m128i _mm_min_epu16 (__m128i a, __m128i b) pure @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b);
|
|
}
|
|
else version(LDC)
|
|
{
|
|
// x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1
|
|
// ARM64: umin.8h since LDC 1.8.0 -O1
|
|
ushort8 sa = cast(ushort8)a;
|
|
ushort8 sb = cast(ushort8)b;
|
|
static if (SIMD_COMPARISON_MASKS_16B)
|
|
ushort8 greater = (sb > sa);
|
|
else
|
|
ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa);
|
|
return cast(__m128i)( (greater & sa) | (~greater & sb) );
|
|
}
|
|
else
|
|
{
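        // Same trick as in _mm_max_epu16: c is the saturated excess of b over a,
        // so b - c is min(a, b) in every lane.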
|
|
__m128i c = _mm_subs_epu16(b, a);
|
|
b = _mm_sub_epi16(b, c);
|
|
return b;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57),
|
|
_mm_setr_epi16( -4, -8, 9, -7, 0,-32768, 0, 0));
|
|
short[8] correct = [32767, 1, 9, -8, 0, 7, 0, 0];
|
|
assert(R.array == correct);
|
|
}
|
|
|
|
/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
|
|
__m128i _mm_min_epu32 (__m128i a, __m128i b) pure @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b);
|
|
}
|
|
else version(LDC)
|
|
{
|
|
// x86: pminud since LDC 1.1 -O1, also good without sse4.1
|
|
// ARM64: umin.4s since LDC 1.8.0 -O1
|
|
uint4 sa = cast(uint4)a;
|
|
uint4 sb = cast(uint4)b;
|
|
static if (SIMD_COMPARISON_MASKS_16B)
|
|
uint4 greater = sa > sb;
|
|
else
|
|
uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
|
|
return cast(__m128i)( (~greater & sa) | (greater & sb) );
|
|
}
|
|
else
|
|
{
|
|
// PERF: same remark as in _mm_max_epu32
|
|
__m128i valueShift = _mm_set1_epi32(-0x80000000);
|
|
__m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift));
|
|
__m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
|
|
__m128i mask = _mm_and_si128(aTob, higher);
|
|
return _mm_xor_si128(b, mask);
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7),
|
|
_mm_setr_epi32( -4,-8, 9, -8));
|
|
int[4] correct = [0x7fffffff, 1, 4, -8];
|
|
assert(R.array == correct);
|
|
}
|
|
|
|
/// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`,
|
|
/// store the minimum and index in return value, and zero the remaining bits.
|
|
__m128i _mm_minpos_epu16 (__m128i a) @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
|
|
}
|
|
else static if (LDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
|
|
}
|
|
else static if (LDC_with_ARM64)
|
|
{
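        // Pack each lane as (value << 16) | index using unpacklo/unpackhi, then run an
        // unsigned 32-bit min-reduction: the value sits in the high half so it dominates
        // the comparison, and on equal values the smaller index wins.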
|
|
__m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
|
|
__m128i combinedLo = _mm_unpacklo_epi16(indices, a);
|
|
__m128i combinedHi = _mm_unpackhi_epi16(indices, a);
|
|
__m128i best = _mm_min_epu32(combinedLo, combinedHi);
|
|
best = _mm_min_epu32(best, _mm_srli_si128!8(best));
|
|
best = _mm_min_epu32(best, _mm_srli_si128!4(best));
|
|
short8 sbest = cast(short8)best;
|
|
short8 r;
|
|
r[0] = sbest[1];
|
|
r[1] = sbest[0]; // Note: the search must have inverted index in order to prioritize lower index in case of tie
|
|
r[2] = 0;
|
|
r[3] = 0;
|
|
r[4] = 0;
|
|
r[5] = 0;
|
|
r[6] = 0;
|
|
r[7] = 0;
|
|
return cast(__m128i)r;
|
|
}
|
|
else
|
|
{
|
|
short8 sa = cast(short8)a;
|
|
ushort min = 0xffff;
|
|
int index = 0;
|
|
for(int n = 0; n < 8; ++n)
|
|
{
|
|
ushort c = sa.array[n];
|
|
if (c < min)
|
|
{
|
|
min = c;
|
|
index = n;
|
|
}
|
|
}
|
|
short8 r;
|
|
r.ptr[0] = min;
|
|
r.ptr[1] = cast(short)index;
|
|
return cast(__m128i)r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
|
|
__m128i B = _mm_setr_epi16(14, 4, 4, 2, -3, 2, 5, 6);
|
|
short8 R1 = cast(short8) _mm_minpos_epu16(A);
|
|
short8 R2 = cast(short8) _mm_minpos_epu16(B);
|
|
short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0];
|
|
short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0];
|
|
assert(R1.array == correct1);
|
|
assert(R2.array == correct2);
|
|
}
|
|
|
|
/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers
|
|
/// in `a` compared to those in `b`, and store the 16-bit results in dst.
|
|
/// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`.
|
|
/// One quadruplet is selected from `b` starting at the offset specified in `imm8[1:0]`.
|
|
/// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting
|
|
/// at the offset specified in `imm8[2]`.
|
|
__m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_mpsadbw128(cast(ubyte16)a, cast(ubyte16)b, cast(ubyte)imm8);
|
|
}
|
|
else static if (LDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
|
|
}
|
|
else
|
|
{
|
|
        int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high-order quadruplets are unaddressable...
|
|
int b_offset = (imm8 & 3) * 4;
|
|
|
|
byte16 ba = cast(byte16)a;
|
|
byte16 bb = cast(byte16)b;
|
|
short8 r;
|
|
|
|
__m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0);
|
|
|
|
for (int j = 0; j < 8; j += 2)
|
|
{
|
|
int k = a_offset + j;
|
|
__m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3],
|
|
0, 0, 0, 0,
|
|
ba[k+1], ba[k+2], ba[k+3], ba[k+4],
|
|
0, 0, 0, 0);
|
|
short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64
|
|
r.ptr[j] = diffs.array[0];
|
|
r.ptr[j+1] = diffs.array[4];
|
|
}
|
|
return cast(__m128i)r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
|
__m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5, 5, 5, 12, 13, 14, 15);
|
|
short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23];
|
|
short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749];
|
|
short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35];
|
|
short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741];
|
|
short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4];
|
|
short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
|
|
short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B);
|
|
short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B);
|
|
short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B);
|
|
short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B);
|
|
assert(r1.array == correct1);
|
|
assert(r4.array == correct4);
|
|
assert(r5.array == correct5);
|
|
assert(r7.array == correct7);
|
|
assert(r8.array == correct0);
|
|
}
|
|
|
|
/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst.
|
|
__m128i _mm_mul_epi32 (__m128i a, __m128i b) pure @trusted
|
|
{
|
|
// PERF DMD
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b);
|
|
}
|
|
else static if (LDC_with_SSE41 && LDC_with_optimizations)
|
|
{
|
|
// For some reason, clang has the builtin but it's not in IntrinsicsX86.td
|
|
// Use IR instead.
|
|
// This generates pmuldq with since LDC 1.2.0 -O0
|
|
enum ir = `
|
|
%ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2>
|
|
%ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2>
|
|
%la = sext <2 x i32> %ia to <2 x i64>
|
|
%lb = sext <2 x i32> %ib to <2 x i64>
|
|
%r = mul <2 x i64> %la, %lb
|
|
ret <2 x i64> %r`;
|
|
return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b);
|
|
}
|
|
else static if (LDC_with_ARM64)
|
|
{
|
|
// 3 instructions since LDC 1.8 -O2
|
|
// But had to make vmull_s32 be a builtin else it wouldn't optimize to smull
|
|
int2 a_lo = vmovn_s64(cast(long2)a);
|
|
int2 b_lo = vmovn_s64(cast(long2)b);
|
|
return cast(__m128i) vmull_s32(a_lo, b_lo);
|
|
}
|
|
else
|
|
{
|
|
int4 ia = cast(int4)a;
|
|
int4 ib = cast(int4)b;
|
|
long2 r;
|
|
r.ptr[0] = cast(long)ia.array[0] * ib.array[0];
|
|
r.ptr[1] = cast(long)ia.array[2] * ib.array[2];
|
|
return cast(__m128i)r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
|
|
__m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
|
|
long2 R = cast(long2) _mm_mul_epi32(A, B);
|
|
long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144];
|
|
assert(R.array == correct);
|
|
}
|
|
|
|
/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers,
|
|
/// return the low 32 bits of the intermediate integers.
|
|
__m128i _mm_mullo_epi32 (__m128i a, __m128i b) pure @trusted
|
|
{
|
|
// PERF DMD
|
|
// PERF GDC without SSE4.1 could be better
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
int4 ia = cast(int4)a;
|
|
int4 ib = cast(int4)b;
|
|
// Note: older GDC doesn't have that op, but older GDC
|
|
// also has no support for -msse4.1 detection
|
|
return cast(__m128i)(a * b);
|
|
}
|
|
else version(LDC)
|
|
{
|
|
int4 ia = cast(int4)a;
|
|
int4 ib = cast(int4)b;
|
|
return cast(__m128i)(a * b);
|
|
}
|
|
else
|
|
{
|
|
// DMD doesn't take the above
|
|
int4 ia = cast(int4)a;
|
|
int4 ib = cast(int4)b;
|
|
int4 r;
|
|
r.ptr[0] = ia.array[0] * ib.array[0];
|
|
r.ptr[1] = ia.array[1] * ib.array[1];
|
|
r.ptr[2] = ia.array[2] * ib.array[2];
|
|
r.ptr[3] = ia.array[3] * ib.array[3];
|
|
return r;
|
|
}
|
|
}
|
|
unittest
|
|
{
|
|
__m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
|
|
__m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
|
|
int4 R = cast(int4) _mm_mullo_epi32(A, B);
|
|
int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0];
|
|
assert(R.array == correct);
|
|
}
|
|
|
|
|
|
/// Convert packed signed 32-bit integers from `a` and `b`
|
|
/// to packed 16-bit integers using unsigned saturation.
|
|
__m128i _mm_packus_epi32 (__m128i a, __m128i b) pure @trusted
|
|
{
|
|
static if (GDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
|
|
}
|
|
else static if (LDC_with_SSE41)
|
|
{
|
|
return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
|
|
}
|
|
else static if (LDC_with_ARM64)
|
|
{
|
|
int4 z;
|
|
z = 0;
|
|
return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)),
|
|
vqmovn_u32(vmaxq_s32(z, cast(int4)b)));
|
|
}
|
|
else
|
|
{
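        // Unsigned saturation built from signed saturation: bias the inputs down by 32768
        // so the target range [0, 65535] becomes [-32768, 32767], let _mm_packs_epi32 clamp,
        // then add 32768 back in 16-bit arithmetic (adding -32768 wraps to the same result).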
|
|
        __m128i i32768 = _mm_set1_epi32(32768);
        __m128i s32768 = _mm_set1_epi16(-32768);
        a = _mm_sub_epi32(a, i32768);
        b = _mm_sub_epi32(b, i32768);
        __m128i clampedSigned = _mm_packs_epi32(a, b);
        return _mm_add_epi16(clampedSigned, s32768);
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
    short8 R = cast(short8) _mm_packus_epi32(A, A);
    short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0];
    assert(R.array == correct);
}

/// Round the packed double-precision (64-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed double-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_pd(int rounding)(__m128d a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundpd(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];
            long hi = _mm_cvtsd_si64(a);
            return _mm_setr_pd(lo, hi);
        }
        else
        {
            version(GNU) pragma(inline, false); // else unittests fail with optimizations

            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
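            // Note: the low two bits of the _MM_FROUND_TO_* constants use the same
            // encoding as MXCSR.RC (nearest=0, down=1, up=2, toward zero=3), and RC
            // occupies bits 13-14 of MXCSR, hence the << 13.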

            // Convert to 64-bit integers
            long lo = _mm_cvtsd_si64(a);
            a.ptr[0] = a.array[1];
            long hi = _mm_cvtsd_si64(a);

            // Convert back to double to achieve the rounding
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (MAYDO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return _mm_setr_pd(lo, hi);
        }
    }
}
unittest
{
    // tested in other intrinsics
}

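// A minimal usage sketch (not part of the original test suite):
// _MM_FROUND_TO_NEG_INF floors each lane.
unittest
{
    __m128d V = _mm_setr_pd(1.7, -1.3);
    __m128d R = _mm_round_pd!(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)(V);
    double[2] correct = [1.0, -2.0];
    assert(R.array == correct);
}
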
/// Round the packed single-precision (32-bit) floating-point elements in `a` using the
/// rounding parameter, and store the results as packed single-precision floating-point elements.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ps(int rounding)(__m128 a) @trusted
{
    // PERF ARM64: there is duplication because this isn't optimal for ARM64, so it is avoided externally
    static if (GDC_or_LDC_with_SSE41)
    {
        return __builtin_ia32_roundps(a, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            __m128i integers = _mm_cvtps_epi32(a);
            return _mm_cvtepi32_ps(integers);
        }
        else
        {
            version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 get shuffled
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
            scope(exit) _MM_SET_ROUNDING_MODE(old);

            // Convert to 32-bit integers
            __m128i integers = _mm_cvtps_epi32(a);

            // Convert back to float to achieve the rounding
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (MAYDO: what range exactly?)
            __m128 result = _mm_cvtepi32_ps(integers);

            return result;
        }
    }
}
unittest
{
    // tested in other intrinsics
}

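// A minimal usage sketch (not part of the original test suite):
// _MM_FROUND_TO_ZERO truncates each lane toward zero.
unittest
{
    __m128 V = _mm_setr_ps(1.7f, -1.7f, 2.5f, -2.5f);
    __m128 R = _mm_round_ps!(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)(V);
    float[4] correct = [1.0f, -1.0f, 2.0f, -2.0f];
    assert(R.array == correct);
}
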
/// Round the lower double-precision (64-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a double-precision floating-point element
/// in the lower element of result, and copy the upper element from `a` to the upper element of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundsd(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;
            return a;
        }
        else
        {
            version(GNU) pragma(inline, false); // else unittests fail with optimizations

            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 64-bit integer
            long b0 = _mm_cvtsd_si64(b);
            a.ptr[0] = b0;

            // Convert back to double to achieve the rounding
            // The problem is that a 64-bit double can't represent all the values
            // a 64-bit integer can (and vice-versa). So this function won't work for
            // large values. (MAYDO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}

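// A minimal usage sketch (not part of the original test suite): the lower lane comes
// from `b` rounded to nearest (ties to even), the upper lane is copied from `a`.
unittest
{
    __m128d A = _mm_setr_pd(10.0, 20.0);
    __m128d B = _mm_setr_pd(2.5, 99.0);
    __m128d R = _mm_round_sd!(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)(A, B);
    double[2] correct = [2.0, 20.0];
    assert(R.array == correct);
}
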
/// Round the lower single-precision (32-bit) floating-point element in `b` using the
/// rounding parameter, store the result as a single-precision floating-point element
/// in the lower element of result, and copy the upper 3 packed elements from `a`
/// to the upper elements of result.
/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
__m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
{
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_roundss(a, b, rounding);
    }
    else
    {
        static if (rounding & _MM_FROUND_CUR_DIRECTION)
        {
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;
            return a;
        }
        else version(GNU)
        {
            pragma(inline, false)
            __m128 GDCworkaround() nothrow @nogc @trusted
            {
                uint old = _MM_GET_ROUNDING_MODE();
                _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

                // Convert to 32-bit integer
                int b0 = _mm_cvtss_si32(b);
                a.ptr[0] = b0;

                // Convert back to float to achieve the rounding
                // The problem is that a 32-bit float can't represent all the values
                // a 32-bit integer can (and vice-versa). So this function won't work for
                // large values. (MAYDO: what range exactly?)
                _MM_SET_ROUNDING_MODE(old);
                return a;
            }
            return GDCworkaround();
        }
        else
        {
            uint old = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);

            // Convert to 32-bit integer
            int b0 = _mm_cvtss_si32(b);
            a.ptr[0] = b0;

            // Convert back to float to achieve the rounding
            // The problem is that a 32-bit float can't represent all the values
            // a 32-bit integer can (and vice-versa). So this function won't work for
            // large values. (MAYDO: what range exactly?)
            _MM_SET_ROUNDING_MODE(old);
            return a;
        }
    }
}
unittest
{
    // tested in other intrinsics
}

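// A minimal usage sketch (not part of the original test suite):
// _MM_FROUND_TO_POS_INF rounds the lower lane of `b` up; upper lanes come from `a`.
unittest
{
    __m128 A = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
    __m128 B = _mm_setr_ps(3.2f, 0.0f, 0.0f, 0.0f);
    __m128 R = _mm_round_ss!(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)(A, B);
    float[4] correct = [4.0f, 20.0f, 30.0f, 40.0f];
    assert(R.array == correct);
}
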
/// Load 128-bits of integer data from memory using a non-temporal memory hint.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
__m128i _mm_stream_load_si128 (void* mem_addr) pure @trusted
{
    // PERF DMD D_SIMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_movntdqa(cast(long2*)mem_addr);
    }
    else static if (LDC_with_InlineIREx && LDC_with_optimizations)
    {
        enum prefix = `!0 = !{ i32 1 }`;
        enum ir = `
            %r = load <4 x i32>, <4 x i32>* %0, !nontemporal !0
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIREx!(prefix, ir, "", int4, int4*)(cast(__m128i*)mem_addr);
    }
    else
    {
        return *cast(__m128i*)mem_addr; // regular move instead
    }
}
unittest
{
    align(16) static immutable int[4] correct = [1, 2, 3, 4];
    __m128i A = _mm_stream_load_si128(cast(__m128i*)(correct.ptr));
    _mm_mfence();
    assert(A.array == correct);
}

/// Return 1 if all bits in `a` are 1, otherwise return 0.
int _mm_test_all_ones (__m128i a) @safe
{
    return _mm_testc_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(-1);
    __m128i B = _mm_set_epi32(-1, -2, -1, -1);
    assert(_mm_test_all_ones(A) == 1);
    assert(_mm_test_all_ones(B) == 0);
}

/// Return 1 if all bits in `a` are 0, otherwise return 0.
// This is a #BONUS since it is missing from the Intel Intrinsics API.
int _mm_test_all_zeros (__m128i a) @safe
{
    return _mm_testz_si128(a, _mm_set1_epi32(-1));
}
unittest
{
    __m128i A = _mm_set1_epi32(0);
    __m128i B = _mm_set_epi32(0, 8, 0, 0);
    assert(_mm_test_all_zeros(A) == 1);
    assert(_mm_test_all_zeros(B) == 0);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`,
/// and return 1 if the result is zero, otherwise return 0.
int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
{
    return _mm_testz_si128(a, mask); // it's really the same, but with a good name
}

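// A minimal usage sketch (not part of the original test suite): the mask selects
// which bits of `a` must be zero.
unittest
{
    __m128i A = _mm_setr_epi32(0x10, 0, 0, 0);
    __m128i M = _mm_setr_epi32(0x0F, -1, -1, -1);
    assert(_mm_test_all_zeros(A, M) == 1);
    assert(_mm_test_all_zeros(A, A) == 0);
}
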
/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, and set ZF to 1
/// if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of `a` and then AND with
/// `mask`, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and
/// CF values are zero, otherwise return 0.
int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
{
    return _mm_testnzc_si128(a, mask);
}

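// A minimal usage sketch (not part of the original test suite): returns 1 only when
// the bits of `a` selected by `mask` are neither all ones nor all zeroes.
unittest
{
    __m128i A = _mm_setr_epi32(0xF0, 0, 0, 0);
    __m128i M = _mm_setr_epi32(0xFF, 0, 0, 0);
    assert(_mm_test_mix_ones_zeros(A, M) == 1); // selected bits are mixed
    assert(_mm_test_mix_ones_zeros(A, A) == 0); // selected bits are all ones
}
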
/// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the
/// result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 1 in `a`.
int _mm_testc_si128 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else
    {
        __m128i c = ~a & b;
        int[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testc_si128(A, A) == 1);
    assert(_mm_testc_si128(A, M1) == 0);
    assert(_mm_testc_si128(A, M2) == 1);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the
/// result is zero, otherwise set CF to 0.
/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        long2 s640 = vandq_s64(cast(long2)b, cast(long2)a);
        long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a);

        return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1))
                | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) );
    }
    else
    {
        __m128i c = a & b;
        __m128i d = ~a & b;
        int[4] zero = [0, 0, 0, 0];
        return !( (c.array == zero) || (d.array == zero));
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
    __m128i Z = _mm_setzero_si128();
    assert(_mm_testnzc_si128(A, Z) == 0);
    assert(_mm_testnzc_si128(A, M) == 1);
    assert(_mm_testnzc_si128(A, A) == 0);
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
/// and return 1 if the result is zero, otherwise return 0.
/// In other words, test if all bits masked by `b` are 0 in `a`.
int _mm_testz_si128 (__m128i a, __m128i b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
    }
    else static if (LDC_with_ARM64)
    {
        // Acceptable since LDC 1.8 -O2
        long2 s64 = vandq_s64(cast(long2)a, cast(long2)b);
        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
    }
    else
    {
        __m128i c = a & b;
        int[4] zero = [0, 0, 0, 0];
        return c.array == zero;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07);
    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
    assert(_mm_testz_si128(A, A) == 0);
    assert(_mm_testz_si128(A, M1) == 1);
    assert(_mm_testz_si128(A, M2) == 0);
}
