/** * SSE4.1 intrinsics. * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1 * * Copyright: Guillaume Piolat 2021. * Johan Engelen 2021. * cet 2024. * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) */ module inteli.smmintrin; // SSE4.1 instructions // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1 // Note: this header will work whether you have SSE4.1 enabled or not. // With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively // generate SSE4.1 instructions. // With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions. public import inteli.types; import inteli.internals; // smmintrin pulls in all previous instruction set intrinsics. public import inteli.tmmintrin; nothrow @nogc: enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes enum int _MM_FROUND_TO_NEG_INF = 0x01; /// ditto enum int _MM_FROUND_TO_POS_INF = 0x02; /// ditto enum int _MM_FROUND_TO_ZERO = 0x03; /// ditto enum int _MM_FROUND_CUR_DIRECTION = 0x04; /// ditto enum int _MM_FROUND_RAISE_EXC = 0x00; /// ditto enum int _MM_FROUND_NO_EXC = 0x08; /// ditto enum int _MM_FROUND_NINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT); enum int _MM_FROUND_FLOOR = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF); enum int _MM_FROUND_CEIL = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF); enum int _MM_FROUND_TRUNC = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO); enum int _MM_FROUND_RINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION); enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION); /// Add packed signed 32-bit integers in `a` and `b` using saturation. /// #BONUS __m128i _mm_adds_epi32(__m128i a, __m128i b) pure { // PERF: ARM64 should use 2x vqadd_s32 static if (LDC_with_saturated_intrinsics) return cast(__m128i)inteli_llvm_adds!int4(cast(int4)a, cast(int4)b); else { __m128i int_max = _mm_set1_epi32(0x7FFFFFFF); __m128i res = _mm_add_epi32(a, b); __m128i sign_bit = _mm_srli_epi32(a, 31); __m128i sign_xor = _mm_xor_si128(a, b); __m128i overflow = _mm_andnot_si128(sign_xor, _mm_xor_si128(a, res)); __m128i saturated = _mm_add_epi32(int_max, sign_bit); return cast(__m128i) _mm_blendv_ps(cast(__m128)res, cast(__m128)saturated, cast(__m128)overflow); } } unittest { __m128i a = _mm_setr_epi32(int.max, 1, 2, int.min); __m128i b = _mm_setr_epi32(1, 2, 3, -4); assert(_mm_adds_epi32(a, b).array == [int.max, 3, 5, int.min]); } /// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results. // Note: changed signature, GDC needs a compile-time value for imm8. __m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) pure @trusted { // PERF DMD static if (GDC_with_SSE41) { pragma(inline, true); // else wouldn't inline in _mm256_blend_epi16 return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8); } else { // LDC x86 This generates pblendw since LDC 1.1 and -O2 short8 r; short8 sa = cast(short8)a; short8 sb = cast(short8)b; for (int n = 0; n < 8; ++n) { r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n]; } return cast(__m128i)r; } } unittest { __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011 short[8] correct = [8, 9, 2, 3, 12, 5, 6, 15]; assert(C.array == correct); } /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`. 
// Note: changed signature, GDC needs a compile-time value for `imm8`. __m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted { static assert(imm8 >= 0 && imm8 < 4); // PERF DMD static if (GDC_with_SSE41) { return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8); } else { // LDC x86: blendpd since LDC 1.1 -02, uses blendps after LDC 1.12 double2 r; for (int n = 0; n < 2; ++n) { r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n]; } return cast(__m128d)r; } } unittest { __m128d A = _mm_setr_pd(0, 1); __m128d B = _mm_setr_pd(8, 9); double2 C = _mm_blend_pd!2(A, B); double[2] correct = [0, 9]; assert(C.array == correct); } /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control /// mask `imm8`. // Note: changed signature, GDC needs a compile-time value for imm8. __m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) pure @trusted { // PERF DMD static assert(imm8 >= 0 && imm8 < 16); static if (GDC_with_SSE41) { return __builtin_ia32_blendps(a, b, imm8); } else version(LDC) { // LDC x86: generates blendps since LDC 1.1 -O2 // arm64: pretty good, two instructions worst case return shufflevectorLDC!(float4, (imm8 & 1) ? 4 : 0, (imm8 & 2) ? 5 : 1, (imm8 & 4) ? 6 : 2, (imm8 & 8) ? 7 : 3)(a, b); } else { // PERF GDC without SSE4.1 is quite bad __m128 r; for (int n = 0; n < 4; ++n) { r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n]; } return r; } } unittest { __m128 A = _mm_setr_ps(0, 1, 2, 3); __m128 B = _mm_setr_ps(8, 9, 10, 11); float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101 float[4] correct = [8, 1, 10, 11]; assert(C.array == correct); } /// Blend packed 8-bit integers from `a` and `b` using `mask`. /// Select from `b` if the high-order bit of the corresponding 8-bit element in `mask` is set, else select from `a`. __m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) pure @trusted { // PERF DMD /*static if (GDC_with_SSE41) { // This intrinsic do nothing in GDC 12. // TODO report to GDC. No problem in GCC. return cast(__m128i) __builtin_ia32_pblendvb128 (cast(ubyte16)a, cast(ubyte16)b, cast(ubyte16)mask); } else*/ static if (LDC_with_SSE41) { return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask); } else static if (LDC_with_ARM64) { // LDC arm64: two instructions since LDC 1.12 -O2 byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7); return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a); } else { __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask); return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b); } } unittest { __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); __m128i M = _mm_setr_epi8( 1, -1, 1, 1, -4, 1, -8, 127, 1, 1, -1, -1, 4, 1, 8, -128); byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M); byte[16] correct = [ 0, 17, 2, 3, 20, 5, 22, 7, 8, 9, 26, 27, 12, 13, 14, 31 ]; assert(R.array == correct); } /// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`. __m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted { // PERF DMD static if (GDC_with_SSE42) { // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction // with -msse4.2 but not -msse4.1. // Not sure what is the reason, and there is a replacement sequence. // Sounds like a bug. 
return __builtin_ia32_blendvpd(a, b, mask); } else static if (LDC_with_SSE41) { return __builtin_ia32_blendvpd(a, b, mask); } else static if (LDC_with_ARM64) { long2 shift; shift = 63; long2 lmask = cast(long2)mask >> shift; return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a); } else { __m128d r; // PERF =void; long2 lmask = cast(long2)mask; for (int n = 0; n < 2; ++n) { r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n]; } return r; } } unittest { __m128d A = _mm_setr_pd(1.0, 2.0); __m128d B = _mm_setr_pd(3.0, 4.0); __m128d M1 = _mm_setr_pd(-3.0, 2.0); __m128d R1 = _mm_blendv_pd(A, B, M1); double[2] correct1 = [3.0, 2.0]; assert(R1.array == correct1); // Note: wouldn't work with -double.nan, since in some AArch64 archs the NaN sign bit is lost // See Issue #78 __m128d M2 = _mm_setr_pd(double.nan, double.infinity); __m128d R2 = _mm_blendv_pd(A, B, M2); double[2] correct2 = [1.0, 2.0]; assert(R2.array == correct2); } /// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`. __m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) pure @trusted { // PERF DMD static if (GDC_with_SSE41) { return __builtin_ia32_blendvps(a, b, mask); } else static if (LDC_with_SSE41) { return __builtin_ia32_blendvps(a, b, mask); } else static if (LDC_with_ARM64) { int4 shift; shift = 31; int4 lmask = cast(int4)mask >> shift; return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a); } else { // LDC x86_64: Compiles to 5 instr since LDC 1.27 -O2 // If lack of optimization, consider replacing by: // __m128i overflow_mask = _mm_srai_epi32(overflow, 31); // return _mm_or_si128( // _mm_and_si128(overflow_mask, saturated), // _mm_andnot_si128(overflow_mask, res) // LLVM makes almost the same sequence when optimized. __m128 r; int4 lmask = cast(int4)mask; for (int n = 0; n < 4; ++n) { r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n]; } return r; } } unittest { __m128 A = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f); __m128 B = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f); __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f); __m128 M2 = _mm_setr_ps(float.nan, float.nan, -0.0f, +0.0f); __m128 R1 = _mm_blendv_ps(A, B, M1); __m128 R2 = _mm_blendv_ps(A, B, M2); float[4] correct1 = [ 4.0f, 1.0f, 2.0f, 7.0f]; float[4] correct2 = [ 0.0f, 1.0f, 6.0f, 3.0f]; assert(R1.array == correct1); // Note: wouldn't work with -float.nan, since in some AArch64 archs the NaN sign bit is lost // See Issue #78 assert(R2.array == correct2); } /// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value, /// and store the results as packed double-precision floating-point elements. __m128d _mm_ceil_pd (__m128d a) @trusted { static if (LDC_with_ARM64) { // LDC arm64 acceptable since 1.8 -O2 // Unfortunately x86 intrinsics force a round-trip back to double2 // ARM neon semantics wouldn't have that long2 l = vcvtpq_s64_f64(a); double2 r; r.ptr[0] = l.array[0]; r.ptr[1] = l.array[1]; return r; } else { return _mm_round_pd!2(a); } } unittest { __m128d A = _mm_setr_pd(1.3f, -2.12f); __m128d B = _mm_setr_pd(53.6f, -2.7f); A = _mm_ceil_pd(A); B = _mm_ceil_pd(B); double[2] correctA = [2.0, -2.0]; double[2] correctB = [54.0, -2.0]; assert(A.array == correctA); assert(B.array == correctB); } /// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value, /// and store the results as packed single-precision floating-point elements. 
__m128 _mm_ceil_ps (__m128 a) @trusted { static if (LDC_with_ARM64) { // LDC arm64 acceptable since 1.8 -O1 int4 l = vcvtpq_s32_f32(a); float4 r; r.ptr[0] = l.array[0]; r.ptr[1] = l.array[1]; r.ptr[2] = l.array[2]; r.ptr[3] = l.array[3]; return r; } else { return _mm_round_ps!2(a); } } unittest { __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f); __m128 C = _mm_ceil_ps(A); float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f]; assert(C.array == correct); } /// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value, /// store the result as a double-precision floating-point element in the lower element of result, /// and copy the upper element from `a` to the upper element of dst. __m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted { static if (LDC_with_ARM64) { a[0] = vcvtps_s64_f64(b[0]); return a; } else { return _mm_round_sd!2(a, b); } } unittest { __m128d A = _mm_setr_pd(1.3, -2.12); __m128d B = _mm_setr_pd(53.6, -3.7); __m128d C = _mm_ceil_sd(A, B); double[2] correct = [54.0, -2.12]; assert(C.array == correct); } /// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value, /// store the result as a single-precision floating-point element in the lower element of result, /// and copy the upper 3 packed elements from `a` to the upper elements of result. __m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted { static if (LDC_with_ARM64) { a[0] = vcvtps_s32_f32(b[0]); return a; } else { return _mm_round_ss!2(a, b); } } unittest { __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f); __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f); __m128 C = _mm_ceil_ss(A, B); float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f]; assert(C.array == correct); } /// Compare packed 64-bit integers in `a` and `b` for equality. __m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted { static if (SIMD_COMPARISON_MASKS_16B) { version(DigitalMars) { // DMD doesn't recognize long2 == long2 long2 la = cast(long2)a; long2 lb = cast(long2)b; long2 res; res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0; res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0; return cast(__m128i)res; } else { return cast(__m128i)(cast(long2)a == cast(long2)b); } } else static if (GDC_with_SSE41) { return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b); } else version(LDC) { // LDC x86: generates pcmpeqq since LDC 1.1 -O1 // arm64: generates cmeq since LDC 1.8 -O1 return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b); } else { // Clever pcmpeqd + pand use with LDC 1.24 -O2 long2 la = cast(long2)a; long2 lb = cast(long2)b; long2 res; res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0; res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0; return cast(__m128i)res; } } unittest { __m128i A = _mm_setr_epi64(-1, -2); __m128i B = _mm_setr_epi64(-3, -2); __m128i C = _mm_setr_epi64(-1, -4); long2 AB = cast(long2) _mm_cmpeq_epi64(A, B); long2 AC = cast(long2) _mm_cmpeq_epi64(A, C); long[2] correct1 = [0, -1]; long[2] correct2 = [-1, 0]; assert(AB.array == correct1); assert(AC.array == correct2); } /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers. 
__m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
            %r = sext <4 x i16> %v to <4 x i32>
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        int4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi16_epi32(A);
    int[4] correct = [-1, 0, -32768, 32767];
    assert(C.array == correct);
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i16> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
    }
    else
    {
        short8 sa = cast(short8)a;
        long2 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepi16_epi64(A);
    long[2] correct = [-32768, 32767];
    assert(C.array == correct);
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
        enum ir = `
            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i32> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a);
    }
    else
    {
        int4 sa = cast(int4)a;
        long2 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-4, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepi32_epi64(A);
    long[2] correct = [-4, 42];
    assert(C.array == correct);
}

/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
__m128i _mm_cvtepi8_epi16 (__m128i a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0
        // LDC ARM64: sshll generated since LDC 1.8.0 -O1
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
            %r = sext <8 x i8> %v to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        short8 r;
        foreach(n; 0..8)
            r.ptr[n] = sa.array[n];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepi8_epi16(A);
    short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8];
    assert(C.array == correct);
}

/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_SSE41 && LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxbd since LDC 1.1 -O0
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
            %r = sext <4 x i8> %v to <4 x i32>
            ret <4 x i32> %r`;
        return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
    }
    else
    {
        // LDC ARM64: this gives the same codegen as a vmovl_s16/vmovl_s8 sequence would
        byte16 sa = cast(byte16)a;
        int4 r;
        r.ptr[0] = sa.array[0];
        r.ptr[1] = sa.array[1];
        r.ptr[2] = sa.array[2];
        r.ptr[3] = sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepi8_epi32(A);
    int[4] correct = [127, -128, 1, -1];
    assert(C.array == correct);
}

/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
__m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        // LDC x86: Generates pmovsxbq since LDC 1.1 -O0,
        // LDC arm64: it's ok since LDC 1.8 -O1
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
            %r = sext <2 x i8> %v to <2 x i64>
            ret <2 x i64> %r`;
        return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
    }
    else
    {
        byte16 sa = cast(byte16)a;
        long2 r;
        foreach(n; 0..2)
            r.ptr[n] = sa.array[n];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepi8_epi64(A);
    long[2] correct = [127, -128];
    assert(C.array == correct);
}

/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
    }
    else
    {
        // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
        // arm64: ushll since LDC 1.12 -O1
        short8 sa = cast(short8)a;
        int4 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        r.ptr[2] = cast(ushort)sa.array[2];
        r.ptr[3] = cast(ushort)sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepu16_epi32(A);
    int[4] correct = [65535, 0, 32768, 32767];
    assert(C.array == correct);
}

/// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit shorter than below, in -O2
        short8 sa = cast(short8)a;
        long2 r;
        for(int n = 0; n < 2; ++n)
            r.ptr[n] = cast(ushort)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: generates pmovzxwq since LDC 1.12 -O1 also good without SSE4.1
        short8 sa = cast(short8)a;
        long2 r;
        r.ptr[0] = cast(ushort)sa.array[0];
        r.ptr[1] = cast(ushort)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepu16_epi64(A);
    long[2] correct = [65535, 0];
    assert(C.array == correct);
}

/// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
__m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
    }
    else
    {
        // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1
        // arm64: generates ushll since LDC 1.12 -O1
        int4 sa = cast(int4)a;
        long2 r;
        r.ptr[0] = cast(uint)sa.array[0];
        r.ptr[1] = cast(uint)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
    long2 C = cast(long2) _mm_cvtepu32_epi64(A);
    long[2] correct = [4294967295, 42];
    assert(C.array == correct);
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
__m128i _mm_cvtepu8_epi16 (__m128i a) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
    }
    else static if (LDC_with_optimizations)
    {
        enum ir = `
            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
            %r = zext <8 x i8> %v to <8 x i16>
            ret <8 x i16> %r`;
        return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
    }
    else
    {
        return _mm_unpacklo_epi8(a, _mm_setzero_si128());
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    short8 C = cast(short8) _mm_cvtepu8_epi16(A);
    short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
    assert(C.array == correct);
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
__m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: a bit better than below in -O2
        byte16 sa = cast(byte16)a;
        int4 r;
        for(int n = 0; n < 4; ++n)
            r.ptr[n] = cast(ubyte)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1
        // PERF: catastrophic with GDC without SSE4.1
        byte16 sa = cast(byte16)a;
        int4 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        r.ptr[2] = cast(ubyte)sa.array[2];
        r.ptr[3] = cast(ubyte)sa.array[3];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    int4 C = cast(int4) _mm_cvtepu8_epi32(A);
    int[4] correct = [127, 128, 1, 255];
    assert(C.array == correct);
}

/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
__m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        alias ubyte16 = __vector(ubyte[16]);
        return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a);
    }
    else static if (LDC_with_ARM64)
    {
        // LDC arm64: this optimizes better than the loop below
        byte16 sa = cast(byte16)a;
        long2 r;
        for (int n = 0; n < 2; ++n)
            r.ptr[n] = cast(ubyte)sa.array[n];
        return cast(__m128i)r;
    }
    else
    {
        // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
        byte16 sa = cast(byte16)a;
        long2 r;
        r.ptr[0] = cast(ubyte)sa.array[0];
        r.ptr[1] = cast(ubyte)sa.array[1];
        return cast(__m128i)r;
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
    long2 C = cast(long2) _mm_cvtepu8_epi64(A);
    long[2] correct = [127, 254];
    assert(C.array == correct);
}

/// Conditionally multiply the packed double-precision (64-bit) floating-point elements
/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, and conditionally
/// store the sum in dst using the low 4 bits of `imm8`.
__m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted { // PERF DMD static if (GDC_with_SSE41) { return __builtin_ia32_dppd(a, b, imm8 & 0x33); } else static if (LDC_with_SSE41) { return __builtin_ia32_dppd(a, b, imm8 & 0x33); } else { __m128d zero = _mm_setzero_pd(); __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b); double sum = temp.array[0] + temp.array[1]; return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum)); } } unittest { __m128d A = _mm_setr_pd(1.0, 2.0); __m128d B = _mm_setr_pd(4.0, 8.0); double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B); double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B); double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B); double[2] correct1 = [ 4.0, 4.0]; double[2] correct2 = [16.0, 0.0]; double[2] correct3 = [ 0.0, 20.0]; assert(R1.array == correct1); assert(R2.array == correct2); assert(R3.array == correct3); } /// Conditionally multiply the packed single-precision (32-bit) floating-point elements /// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, /// and conditionally store the sum in result using the low 4 bits of `imm8`. __m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted { // PERF DMD static if (GDC_with_SSE41) { return __builtin_ia32_dpps(a, b, cast(ubyte)imm8); } else static if (LDC_with_SSE41) { return __builtin_ia32_dpps(a, b, cast(byte)imm8); } else { __m128 zero = _mm_setzero_ps(); __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b); float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3]; return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum)); } } unittest { __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f); __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f); float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B); float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B); float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B); float[4] correct1 = [67.0f, 67.0f, 67.0f, 67.0f]; float[4] correct2 = [23.0f, 0.0f, 23.0f, 0.0f]; float[4] correct3 = [0.0f, 29.0f, 0.0f, 29.0f]; assert(R1.array == correct1); assert(R2.array == correct2); assert(R3.array == correct3); } /// Extract a 32-bit integer from `a`, selected with `imm8`. int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted { return (cast(int4)a).array[imm8 & 3]; } unittest { __m128i A = _mm_setr_epi32(1, 2, 3, 4); assert(_mm_extract_epi32(A, 0) == 1); assert(_mm_extract_epi32(A, 1 + 8) == 2); assert(_mm_extract_epi32(A, 3 + 4) == 4); } /// Extract a 64-bit integer from `a`, selected with `imm8`. long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted { long2 la = cast(long2)a; return la.array[imm8 & 1]; } unittest { __m128i A = _mm_setr_epi64(45, -67); assert(_mm_extract_epi64(A, 0) == 45); assert(_mm_extract_epi64(A, 1) == -67); assert(_mm_extract_epi64(A, 2) == 45); } /// Extract an 8-bit integer from `a`, selected with `imm8`. /// Warning: the returned value is zero-extended to 32-bits. int _mm_extract_epi8 (__m128i a, const int imm8) @trusted { byte16 ba = cast(byte16)a; return cast(ubyte) ba.array[imm8 & 15]; } unittest { __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15); assert(_mm_extract_epi8(A, 7) == 7); assert(_mm_extract_epi8(A, 13) == 255); assert(_mm_extract_epi8(A, 7 + 16) == 7); } /// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`. /// Note: returns a 32-bit $(I integer). 
int _mm_extract_ps (__m128 a, const int imm8) @trusted { return (cast(int4)a).array[imm8 & 3]; } unittest { __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f); assert(_mm_extract_ps(A, 0) == 0x3f800000); assert(_mm_extract_ps(A, 1 + 8) == 0x40000000); assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000); } /// Round the packed double-precision (64-bit) floating-point elements in `a` down to an /// integer value, and store the results as packed double-precision floating-point elements. __m128d _mm_floor_pd (__m128d a) @trusted { static if (LDC_with_ARM64) { // LDC arm64 acceptable since 1.8 -O2 long2 l = vcvtmq_s64_f64(a); double2 r; r.ptr[0] = l.array[0]; r.ptr[1] = l.array[1]; return r; } else { return _mm_round_pd!1(a); } } unittest { __m128d A = _mm_setr_pd(1.3f, -2.12f); __m128d B = _mm_setr_pd(53.6f, -2.7f); A = _mm_floor_pd(A); B = _mm_floor_pd(B); double[2] correctA = [1.0, -3.0]; double[2] correctB = [53.0, -3.0]; assert(A.array == correctA); assert(B.array == correctB); } /// Round the packed single-precision (32-bit) floating-point elements in `a` down to an /// integer value, and store the results as packed single-precision floating-point elements. __m128 _mm_floor_ps (__m128 a) @trusted { static if (LDC_with_ARM64) { // LDC arm64 acceptable since 1.8 -O1 int4 l = vcvtmq_s32_f32(a); float4 r; r.ptr[0] = l.array[0]; r.ptr[1] = l.array[1]; r.ptr[2] = l.array[2]; r.ptr[3] = l.array[3]; return r; } else { return _mm_round_ps!1(a); } } unittest { __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f); __m128 C = _mm_floor_ps(A); float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f]; assert(C.array == correct); } /// Round the lower double-precision (64-bit) floating-point element in `b` down to an /// integer value, store the result as a double-precision floating-point element in the /// lower element, and copy the upper element from `a` to the upper element. __m128d _mm_floor_sd (__m128d a, __m128d b) @trusted { static if (LDC_with_ARM64) { a[0] = vcvtms_s64_f64(b[0]); return a; } else { return _mm_round_sd!1(a, b); } } unittest { __m128d A = _mm_setr_pd(1.3, -2.12); __m128d B = _mm_setr_pd(-53.1, -3.7); __m128d C = _mm_floor_sd(A, B); double[2] correct = [-54.0, -2.12]; assert(C.array == correct); } /// Round the lower single-precision (32-bit) floating-point element in `b` down to an /// integer value, store the result as a single-precision floating-point element in the /// lower element, and copy the upper 3 packed elements from `a` to the upper elements. __m128 _mm_floor_ss (__m128 a, __m128 b) @trusted { static if (LDC_with_ARM64) { a[0] = vcvtms_s32_f32(b[0]); return a; } else { return _mm_round_ss!1(a, b); } } unittest { __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f); __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f); __m128 C = _mm_floor_ss(A, B); float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f]; assert(C.array == correct); } /// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`. __m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted { // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1 // LDC x86: psinrd since LDC 1.1 -O2 with -mattr=+sse4.1 // LDC arm64: ins.s since LDC 1.8 -O2 int4 ia = cast(int4)a; ia.ptr[imm8 & 3] = i; return cast(__m128i)ia; } unittest { __m128i A = _mm_setr_epi32(1, 2, 3, 4); int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4); int[4] result = [1, 2, 5, 4]; assert(C.array == result); } /// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`. 
__m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
{
    // GDC: nothing special to do, pinsrq generated with -O1 -msse4.1
    // LDC x86: always do something sensible.
    long2 la = cast(long2)a;
    la.ptr[imm8 & 1] = i;
    return cast(__m128i)la;
}
unittest
{
    __m128i A = _mm_setr_epi64(1, 2);
    long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2);
    long[2] result = [1, 5];
    assert(C.array == result);
}

/// Insert the 8-bit integer `i` into `a` at the location specified by `imm8[3:0]`.
/// Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8.
__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
{
    // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1
    // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory.
    byte16 ba = cast(byte16)a;
    ba.ptr[imm8 & 15] = cast(byte)i;
    return cast(__m128i)ba;
}
unittest
{
    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16);
    byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    assert(C.array == result);
}

/// Warning: of course it does something totally different from `_mm_insert_epi32`!
/// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b`
/// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]`
/// (elements are zeroed out when the corresponding bit is set).
__m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(ubyte)imm8);
    }
    else static if (LDC_with_SSE41)
    {
        return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
    }
    else
    {
        float4 tmp2 = a;
        float tmp1 = b.array[(imm8 >> 6) & 3];
        tmp2.ptr[(imm8 >> 4) & 3] = tmp1;
        return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps());
    }
}
unittest
{
    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
    __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B);
    float[4] correct = [1.0f, 2.0f, 0.0f, 7.0f];
    assert(C.array == correct);
}

/// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values.
__m128i _mm_max_epi32 (__m128i a, __m128i b) pure @trusted
{
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxsd since LDC 1.1 -O1
        // ARM: smax.4s since LDC 1.8 -O1
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            int4 greater = sa > sb;
        else
            int4 greater = greaterMask!int4(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        __m128i higher = _mm_cmpgt_epi32(a, b);
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7), _mm_setr_epi32( -4,-8, 9, -8));
    int[4] correct = [0x7fffffff, 1, 9, 7];
    assert(R.array == correct);
}

/// Compare packed signed 8-bit integers in `a` and `b`,
/// and return packed maximum values.
__m128i _mm_max_epi8 (__m128i a, __m128i b) pure @trusted { // PERF DMD static if (GDC_with_SSE41) { return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b); } else version(LDC) { // x86: pmaxsb since LDC 1.1 -O1 // ARM64: smax.16b since LDC 1.8.0 -O1 byte16 sa = cast(byte16)a; byte16 sb = cast(byte16)b; static if (SIMD_COMPARISON_MASKS_16B) byte16 greater = sa > sb; else byte16 greater = cast(byte16) greaterMask!byte16(sa, sb); return cast(__m128i)( (greater & sa) | (~greater & sb) ); } else { __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b __m128i mask = _mm_and_si128(aTob, lower); return _mm_xor_si128(b, mask); } } unittest { __m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); __m128i B = _mm_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); byte16 R = cast(byte16) _mm_max_epi8(A, B); byte[16] correct = [127, 1, 9, -7, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0]; assert(R.array == correct); } /// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values. __m128i _mm_max_epu16 (__m128i a, __m128i b) pure @trusted { // PERF DMD static if (GDC_with_SSE41) { return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b); } else version(LDC) { // x86: pmaxuw since LDC 1.1 -O1 // ARM64: umax.8h since LDC 1.8.0 -O1 // PERF: without sse4.1, LLVM 12 produces a very interesting // psubusw xmm0, xmm1 // paddw xmm0, xmm1 // sequence that maybe should go in other min/max intrinsics? ushort8 sa = cast(ushort8)a; ushort8 sb = cast(ushort8)b; static if (SIMD_COMPARISON_MASKS_16B) { // Note: doesn't work well with GDC, which prefers the builtin. ushort8 greater = sa > sb; } else ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb); return cast(__m128i)( (greater & sa) | (~greater & sb) ); } else { b = _mm_subs_epu16(b, a); b = _mm_add_epi16(b, a); return b; } } unittest { short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57), _mm_setr_epi16( -4, -8, 9, -7, 0,-32768, 0, 0)); short[8] correct = [ -4, -8, -4, -7, 9,-32768, 0, 57]; assert(R.array == correct); } /// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values. 
__m128i _mm_max_epu32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
        // ARM64: umax.4s since LDC 1.8.0 -O1
        uint4 sa = cast(uint4)a;
        uint4 sb = cast(uint4)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            uint4 greater = sa > sb;
        else
            uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
        return cast(__m128i)( (greater & sa) | (~greater & sb) );
    }
    else
    {
        // PERF: LLVM suggests to replace the _mm_add_epi32 by _mm_xor_si128, and the last xor by an "_mm_or_si128"
        /+
            movdqa xmm2, xmmword ptr [-0x80000000, -0x80000000, -0x80000000, -0x80000000]
            movdqa xmm3, xmm1
            pxor xmm3, xmm2
            pxor xmm2, xmm0
            pcmpgtd xmm2, xmm3
            pand xmm0, xmm2
            pandn xmm2, xmm1
            por xmm0, xmm2
        +/
        __m128i valueShift = _mm_set1_epi32(-0x80000000);
        __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7), _mm_setr_epi32( -4,-8, 9, -8));
    int[4] correct = [ -4,-8, 9, -7];
    assert(R.array == correct);
}

/// Compare packed signed 32-bit integers in `a` and `b`, returns packed minimum values.
__m128i _mm_min_epi32 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
    }
    else version(LDC)
    {
        // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
        // ARM: smin.4s since LDC 1.8 -O1
        int4 sa = cast(int4)a;
        int4 sb = cast(int4)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            int4 greater = sa > sb;
        else
            int4 greater = greaterMask!int4(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i higher = _mm_cmplt_epi32(a, b);
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, higher);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7), _mm_setr_epi32( -4, -8, 9, -8));
    int[4] correct = [ -4, -8, -4, -8];
    assert(R.array == correct);
}

/// Compare packed signed 8-bit integers in `a` and `b`,
/// and return packed minimum values.
__m128i _mm_min_epi8 (__m128i a, __m128i b) pure @trusted
{
    // PERF DMD
    static if (GDC_with_SSE41)
    {
        return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b);
    }
    else version(LDC)
    {
        // x86: pminsb since LDC 1.1 -O1
        // ARM64: smin.16b since LDC 1.8.0 -O1
        byte16 sa = cast(byte16)a;
        byte16 sb = cast(byte16)b;
        static if (SIMD_COMPARISON_MASKS_16B)
            byte16 greater = sa > sb;
        else
            byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
        return cast(__m128i)( (~greater & sa) | (greater & sb) );
    }
    else
    {
        __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else
        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
        __m128i mask = _mm_and_si128(aTob, lower);
        return _mm_xor_si128(b, mask);
    }
}
unittest
{
    __m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i B = _mm_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    byte16 R = cast(byte16) _mm_min_epi8(A, B);
    byte[16] correct = [ 4, -8, -4, -8, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    assert(R.array == correct);
}

/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
__m128i _mm_min_epu16 (__m128i a, __m128i b) pure @trusted { // PERF DMD static if (GDC_with_SSE41) { return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b); } else version(LDC) { // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1 // ARM64: umin.8h since LDC 1.8.0 -O1 ushort8 sa = cast(ushort8)a; ushort8 sb = cast(ushort8)b; static if (SIMD_COMPARISON_MASKS_16B) ushort8 greater = (sb > sa); else ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa); return cast(__m128i)( (greater & sa) | (~greater & sb) ); } else { __m128i c = _mm_subs_epu16(b, a); b = _mm_sub_epi16(b, c); return b; } } unittest { short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57), _mm_setr_epi16( -4, -8, 9, -7, 0,-32768, 0, 0)); short[8] correct = [32767, 1, 9, -8, 0, 7, 0, 0]; assert(R.array == correct); } /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst. __m128i _mm_min_epu32 (__m128i a, __m128i b) pure @trusted { // PERF DMD static if (GDC_with_SSE41) { return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b); } else version(LDC) { // x86: pminud since LDC 1.1 -O1, also good without sse4.1 // ARM64: umin.4s since LDC 1.8.0 -O1 uint4 sa = cast(uint4)a; uint4 sb = cast(uint4)b; static if (SIMD_COMPARISON_MASKS_16B) uint4 greater = sa > sb; else uint4 greater = cast(uint4) greaterMask!uint4(sa, sb); return cast(__m128i)( (~greater & sa) | (greater & sb) ); } else { // PERF: same remark as in _mm_max_epu32 __m128i valueShift = _mm_set1_epi32(-0x80000000); __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift)); __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b __m128i mask = _mm_and_si128(aTob, higher); return _mm_xor_si128(b, mask); } } unittest { int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7), _mm_setr_epi32( -4,-8, 9, -8)); int[4] correct = [0x7fffffff, 1, 4, -8]; assert(R.array == correct); } /// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`, /// store the minimum and index in return value, and zero the remaining bits. 
__m128i _mm_minpos_epu16 (__m128i a) @trusted { // PERF DMD static if (GDC_with_SSE41) { return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a); } else static if (LDC_with_SSE41) { return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a); } else static if (LDC_with_ARM64) { __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); __m128i combinedLo = _mm_unpacklo_epi16(indices, a); __m128i combinedHi = _mm_unpackhi_epi16(indices, a); __m128i best = _mm_min_epu32(combinedLo, combinedHi); best = _mm_min_epu32(best, _mm_srli_si128!8(best)); best = _mm_min_epu32(best, _mm_srli_si128!4(best)); short8 sbest = cast(short8)best; short8 r; r[0] = sbest[1]; r[1] = sbest[0]; // Note: the search must have inverted index in order to prioritize lower index in case of tie r[2] = 0; r[3] = 0; r[4] = 0; r[5] = 0; r[6] = 0; r[7] = 0; return cast(__m128i)r; } else { short8 sa = cast(short8)a; ushort min = 0xffff; int index = 0; for(int n = 0; n < 8; ++n) { ushort c = sa.array[n]; if (c < min) { min = c; index = n; } } short8 r; r.ptr[0] = min; r.ptr[1] = cast(short)index; return cast(__m128i)r; } } unittest { __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6); __m128i B = _mm_setr_epi16(14, 4, 4, 2, -3, 2, 5, 6); short8 R1 = cast(short8) _mm_minpos_epu16(A); short8 R2 = cast(short8) _mm_minpos_epu16(B); short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0]; short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0]; assert(R1.array == correct1); assert(R2.array == correct2); } /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers /// in `a` compared to those in `b`, and store the 16-bit results in dst. /// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`. /// One quadruplet is selected from `b` starting at on the offset specified in `imm8[1:0]`. /// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting /// at the offset specified in `imm8[2]`. __m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted { // PERF DMD static if (GDC_with_SSE41) { return cast(__m128i) __builtin_ia32_mpsadbw128(cast(ubyte16)a, cast(ubyte16)b, cast(ubyte)imm8); } else static if (LDC_with_SSE41) { return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8); } else { int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high order quadruplet are unaddressable... 
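        // imm8[2] selects whether the eight sliding quadruplets of `a` start at byte 0 or byte 4;
        // imm8[1:0] selects the aligned 4-byte quadruplet of `b` that all eight SADs use.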
int b_offset = (imm8 & 3) * 4; byte16 ba = cast(byte16)a; byte16 bb = cast(byte16)b; short8 r; __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0); for (int j = 0; j < 8; j += 2) { int k = a_offset + j; __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3], 0, 0, 0, 0, ba[k+1], ba[k+2], ba[k+3], ba[k+4], 0, 0, 0, 0); short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64 r.ptr[j] = diffs.array[0]; r.ptr[j+1] = diffs.array[4]; } return cast(__m128i)r; } } unittest { __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5, 5, 5, 12, 13, 14, 15); short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23]; short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749]; short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35]; short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741]; short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4]; short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B); short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B); short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B); short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B); short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B); assert(r1.array == correct1); assert(r4.array == correct4); assert(r5.array == correct5); assert(r7.array == correct7); assert(r8.array == correct0); } /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst. __m128i _mm_mul_epi32 (__m128i a, __m128i b) pure @trusted { // PERF DMD static if (GDC_with_SSE41) { return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b); } else static if (LDC_with_SSE41 && LDC_with_optimizations) { // For some reason, clang has the builtin but it's not in IntrinsicsX86.td // Use IR instead. // This generates pmuldq with since LDC 1.2.0 -O0 enum ir = ` %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> %la = sext <2 x i32> %ia to <2 x i64> %lb = sext <2 x i32> %ib to <2 x i64> %r = mul <2 x i64> %la, %lb ret <2 x i64> %r`; return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b); } else static if (LDC_with_ARM64) { // 3 instructions since LDC 1.8 -O2 // But had to make vmull_s32 be a builtin else it wouldn't optimize to smull int2 a_lo = vmovn_s64(cast(long2)a); int2 b_lo = vmovn_s64(cast(long2)b); return cast(__m128i) vmull_s32(a_lo, b_lo); } else { int4 ia = cast(int4)a; int4 ib = cast(int4)b; long2 r; r.ptr[0] = cast(long)ia.array[0] * ib.array[0]; r.ptr[1] = cast(long)ia.array[2] * ib.array[2]; return cast(__m128i)r; } } unittest { __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3); __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0); long2 R = cast(long2) _mm_mul_epi32(A, B); long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144]; assert(R.array == correct); } /// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, /// return the low 32 bits of the intermediate integers. 
__m128i _mm_mullo_epi32 (__m128i a, __m128i b) pure @trusted { // PERF DMD // PERF GDC without SSE4.1 could be better static if (GDC_with_SSE41) { int4 ia = cast(int4)a; int4 ib = cast(int4)b; // Note: older GDC doesn't have that op, but older GDC // also has no support for -msse4.1 detection return cast(__m128i)(a * b); } else version(LDC) { int4 ia = cast(int4)a; int4 ib = cast(int4)b; return cast(__m128i)(a * b); } else { // DMD doesn't take the above int4 ia = cast(int4)a; int4 ib = cast(int4)b; int4 r; r.ptr[0] = ia.array[0] * ib.array[0]; r.ptr[1] = ia.array[1] * ib.array[1]; r.ptr[2] = ia.array[2] * ib.array[2]; r.ptr[3] = ia.array[3] * ib.array[3]; return r; } } unittest { __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3); __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0); int4 R = cast(int4) _mm_mullo_epi32(A, B); int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0]; assert(R.array == correct); } /// Convert packed signed 32-bit integers from `a` and `b` /// to packed 16-bit integers using unsigned saturation. __m128i _mm_packus_epi32 (__m128i a, __m128i b) pure @trusted { static if (GDC_with_SSE41) { return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b); } else static if (LDC_with_SSE41) { return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b); } else static if (LDC_with_ARM64) { int4 z; z = 0; return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)), vqmovn_u32(vmaxq_s32(z, cast(int4)b))); } else { __m128i i32768 = _mm_set1_epi32(32768); __m128i s32768 = _mm_set1_epi16(-32768); a = _mm_sub_epi32(a, i32768); b = _mm_sub_epi32(b, i32768); __m128i clampedSigned = _mm_packs_epi32(a, b); return _mm_add_epi16(clampedSigned, s32768); } } unittest { __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); short8 R = cast(short8) _mm_packus_epi32(A, A); short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0]; assert(R.array == correct); } /// Round the packed double-precision (64-bit) floating-point elements in `a` using the /// rounding parameter, and store the results as packed double-precision floating-point elements. 
/// Rounding is done according to the rounding[3:0] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE __m128d _mm_round_pd(int rounding)(__m128d a) @trusted { // PERF DMD static if (GDC_with_SSE41) { return __builtin_ia32_roundpd(a, rounding); } else static if (LDC_with_SSE41) { return __builtin_ia32_roundpd(a, rounding); } else { static if (rounding & _MM_FROUND_CUR_DIRECTION) { // Convert to 64-bit integers long lo = _mm_cvtsd_si64(a); a.ptr[0] = a.array[1]; long hi = _mm_cvtsd_si64(a); return _mm_setr_pd(lo, hi); } else { version(GNU) pragma(inline, false); // else fail unittest with optimizations uint old = _MM_GET_ROUNDING_MODE(); _MM_SET_ROUNDING_MODE((rounding & 3) << 13); // Convert to 64-bit integers long lo = _mm_cvtsd_si64(a); a.ptr[0] = a.array[1]; long hi = _mm_cvtsd_si64(a); // Convert back to double to achieve the rounding // The problem is that a 64-bit double can't represent all the values // a 64-bit integer can (and vice-versa). So this function won't work for // large values. (MAYDO: what range exactly?) _MM_SET_ROUNDING_MODE(old); return _mm_setr_pd(lo, hi); } } } unittest { // tested in other intrinsics } /// Round the packed single-precision (32-bit) floating-point elements in `a` using the /// rounding parameter, and store the results as packed single-precision floating-point elements. /// Rounding is done according to the rounding[3:0] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE __m128 _mm_round_ps(int rounding)(__m128 a) @trusted { // PERF ARM64: there is duplication because this isn't optimal for ARM64, so it is avoided externally static if (GDC_or_LDC_with_SSE41) { return __builtin_ia32_roundps(a, rounding); } else { static if (rounding & _MM_FROUND_CUR_DIRECTION) { __m128i integers = _mm_cvtps_epi32(a); return _mm_cvtepi32_ps(integers); } else { version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 gets shuffled uint old = _MM_GET_ROUNDING_MODE(); _MM_SET_ROUNDING_MODE((rounding & 3) << 13); scope(exit) _MM_SET_ROUNDING_MODE(old); // Convert to 64-bit integers __m128i integers = _mm_cvtps_epi32(a); // Convert back to float to achieve the rounding // The problem is that a 32-float can't represent all the values // a 32-bit integer can (and vice-versa). So this function won't work for // large values. (MAYDO: what range exactly?) __m128 result = _mm_cvtepi32_ps(integers); return result; } } } unittest { // tested in other intrinsics } /// Round the lower double-precision (64-bit) floating-point element in `b` using the /// rounding parameter, store the result as a double-precision floating-point element /// in the lower element of result, and copy the upper element from `a` to the upper element of result. 
/// Rounding is done according to the rounding[3:0] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE __m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted { static if (GDC_with_SSE41) { return __builtin_ia32_roundsd(a, b, rounding); } else static if (LDC_with_SSE41) { return __builtin_ia32_roundsd(a, b, rounding); } else { static if (rounding & _MM_FROUND_CUR_DIRECTION) { // Convert to 64-bit integer long b0 = _mm_cvtsd_si64(b); a.ptr[0] = b0; return a; } else { version(GNU) pragma(inline, false); // else fail unittest with optimizations uint old = _MM_GET_ROUNDING_MODE(); _MM_SET_ROUNDING_MODE((rounding & 3) << 13); // Convert to 64-bit integer long b0 = _mm_cvtsd_si64(b); a.ptr[0] = b0; // Convert back to double to achieve the rounding // The problem is that a 64-bit double can't represent all the values // a 64-bit integer can (and vice-versa). So this function won't work for // large values. (MAYDO: what range exactly?) _MM_SET_ROUNDING_MODE(old); return a; } } } unittest { // tested in other intrinsics } /// Round the lower single-precision (32-bit) floating-point element in `b` using the /// rounding parameter, store the result as a single-precision floating-point element /// in the lower element of result, and copy the upper 3 packed elements from `a` /// to the upper elements of result. /// Rounding is done according to the rounding[3:0] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE __m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted { static if (GDC_with_SSE41) { return __builtin_ia32_roundss(a, b, rounding); } else static if (LDC_with_SSE41) { return __builtin_ia32_roundss(a, b, rounding); } else { static if (rounding & _MM_FROUND_CUR_DIRECTION) { int b0 = _mm_cvtss_si32(b); a.ptr[0] = b0; return a; } else version(GNU) { pragma(inline, false) __m128 GDCworkaround() nothrow @nogc @trusted { uint old = _MM_GET_ROUNDING_MODE(); _MM_SET_ROUNDING_MODE((rounding & 3) << 13); // Convert to 32-bit integer int b0 = _mm_cvtss_si32(b); a.ptr[0] = b0; // Convert back to double to achieve the rounding // The problem is that a 32-bit float can't represent all the values // a 32-bit integer can (and vice-versa). So this function won't work for // large values. (MAYDO: what range exactly?) _MM_SET_ROUNDING_MODE(old); return a; } return GDCworkaround(); } else { uint old = _MM_GET_ROUNDING_MODE(); _MM_SET_ROUNDING_MODE((rounding & 3) << 13); // Convert to 32-bit integer int b0 = _mm_cvtss_si32(b); a.ptr[0] = b0; // Convert back to double to achieve the rounding // The problem is that a 32-bit float can't represent all the values // a 32-bit integer can (and vice-versa). So this function won't work for // large values. (MAYDO: what range exactly?) 
_MM_SET_ROUNDING_MODE(old); return a; } } } unittest { // tested in other intrinsics } /// Load 128-bits of integer data from memory using a non-temporal memory hint. /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection /// exception may be generated. __m128i _mm_stream_load_si128 (void* mem_addr) pure @trusted { // PERF DMD D_SIMD static if (GDC_with_SSE41) { return cast(__m128i) __builtin_ia32_movntdqa(cast(long2*)mem_addr); } else static if (LDC_with_InlineIREx && LDC_with_optimizations) { enum prefix = `!0 = !{ i32 1 }`; enum ir = ` %r = load <4 x i32>, <4 x i32>* %0, !nontemporal !0 ret <4 x i32> %r`; return cast(__m128i) LDCInlineIREx!(prefix, ir, "", int4, int4*)(cast(__m128i*)mem_addr); } else { return *cast(__m128i*)mem_addr; // regular move instead } } unittest { align(16) static immutable int[4] correct = [1, 2, 3, 4]; __m128i A = _mm_stream_load_si128(cast(__m128i*)(correct.ptr)); _mm_mfence(); assert(A.array == correct); } /// Return 1 if all bits in `a` are all 1's. Else return 0. int _mm_test_all_ones (__m128i a) @safe { return _mm_testc_si128(a, _mm_set1_epi32(-1)); } unittest { __m128i A = _mm_set1_epi32(-1); __m128i B = _mm_set_epi32(-1, -2, -1, -1); assert(_mm_test_all_ones(A) == 1); assert(_mm_test_all_ones(B) == 0); } /// Return 1 if all bits in `a` are all 0's. Else return 0. // This is a #BONUS since it was lacking in Intel Intrinsics API. int _mm_test_all_zeros (__m128i a) @safe { return _mm_testz_si128(a, _mm_set1_epi32(-1)); } unittest { __m128i A = _mm_set1_epi32(0); __m128i B = _mm_set_epi32(0, 8, 0, 0); assert(_mm_test_all_zeros(A) == 1); assert(_mm_test_all_zeros(B) == 0); } /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, /// and return 1 if the result is zero, otherwise return 0. int _mm_test_all_zeros (__m128i a, __m128i mask) @safe { return _mm_testz_si128(a, mask); // it's really the same, but with a good name } /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and mask, and set ZF to 1 /// if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with /// mask, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and /// CF values are zero, otherwise return 0. int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted { return _mm_testnzc_si128(a, mask); } /// Compute the bitwise NOT of a and then AND with b, and return 1 if the /// result is zero, otherwise return 0. /// In other words, test if all bits masked by `b` are 1 in `a`. int _mm_testc_si128 (__m128i a, __m128i b) pure @trusted { // PERF DMD static if (GDC_with_SSE41) { return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b); } else static if (LDC_with_SSE41) { return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b); } else static if (LDC_with_ARM64) { // Acceptable since LDC 1.8 -02 long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a); return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); } else { __m128i c = ~a & b; int[4] zero = [0, 0, 0, 0]; return c.array == zero; } } unittest { __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8); __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00); __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00); assert(_mm_testc_si128(A, A) == 1); assert(_mm_testc_si128(A, M1) == 0); assert(_mm_testc_si128(A, M2) == 1); } /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`, /// and set ZF to 1 if the result is zero, otherwise set ZF to 0. 
/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the /// result is zero, otherwise set CF to 0. /// Return 1 if both the ZF and CF values are zero, otherwise return 0. int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted { // PERF DMD static if (GDC_with_SSE41) { return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b); } else static if (LDC_with_SSE41) { return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b); } else static if (LDC_with_ARM64) { long2 s640 = vandq_s64(cast(long2)b, cast(long2)a); long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a); return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)) | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) ); } else { __m128i c = a & b; __m128i d = ~a & b; int[4] zero = [0, 0, 0, 0]; return !( (c.array == zero) || (d.array == zero)); } } unittest { __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8); __m128i M = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00); __m128i Z = _mm_setzero_si128(); assert(_mm_testnzc_si128(A, Z) == 0); assert(_mm_testnzc_si128(A, M) == 1); assert(_mm_testnzc_si128(A, A) == 0); } /// Compute the bitwise AND of 128 bits (representing integer data) in a and b, /// and return 1 if the result is zero, otherwise return 0. /// In other words, test if all bits masked by `b` are 0 in `a`. int _mm_testz_si128 (__m128i a, __m128i b) @trusted { // PERF DMD static if (GDC_with_SSE41) { return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b); } else static if (LDC_with_SSE41) { return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b); } else static if (LDC_with_ARM64) { // Acceptable since LDC 1.8 -02 long2 s64 = vandq_s64(cast(long2)a, cast(long2)b); return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); } else { __m128i c = a & b; int[4] zero = [0, 0, 0, 0]; return c.array == zero; } } unittest { __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8); __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07); __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00); assert(_mm_testz_si128(A, A) == 0); assert(_mm_testz_si128(A, M1) == 1); assert(_mm_testz_si128(A, M2) == 0); }
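
// #BONUS usage sketch (added example, not an Intel intrinsic): shows how the _MM_FROUND_*
// flags defined at the top of this module are OR-ed together into the compile-time
// `rounding` parameter of `_mm_round_ps` and friends.
unittest
{
    __m128 v = _mm_setr_ps(1.6f, -1.6f, 2.7f, -2.7f);

    // Round toward -infinity and suppress precision exceptions: same result as _mm_floor_ps.
    __m128 down = _mm_round_ps!(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)(v);
    float[4] correctDown = [1.0f, -2.0f, 2.0f, -3.0f];
    assert(down.array == correctDown);

    // Truncate toward zero and suppress precision exceptions.
    __m128 trunc = _mm_round_ps!(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)(v);
    float[4] correctTrunc = [1.0f, -1.0f, 2.0f, -2.0f];
    assert(trunc.array == correctTrunc);
}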