diff --git a/assets/models/DamagedHelmet.glb b/assets/models/DamagedHelmet.glb new file mode 100644 index 0000000..2cee76d Binary files /dev/null and b/assets/models/DamagedHelmet.glb differ diff --git a/assets/shaders/gradient.comp.spv b/assets/shaders/gradient.comp.spv index 6b76056..6fdb516 100644 Binary files a/assets/shaders/gradient.comp.spv and b/assets/shaders/gradient.comp.spv differ diff --git a/assets/shaders/pbr.frag.spv b/assets/shaders/pbr.frag.spv index 4e609a2..f91b9e3 100644 Binary files a/assets/shaders/pbr.frag.spv and b/assets/shaders/pbr.frag.spv differ diff --git a/assets/shaders/pbr.vert.spv b/assets/shaders/pbr.vert.spv index 12d75f6..1a9aa13 100644 Binary files a/assets/shaders/pbr.vert.spv and b/assets/shaders/pbr.vert.spv differ diff --git a/dub.json b/dub.json index 516ed99..eee67ee 100644 --- a/dub.json +++ b/dub.json @@ -7,7 +7,7 @@ "targetType": "executable", "targetName": "Gears", "targetPath": "build", - "sourceFiles-linux": ["build/libvma.a", "build/libstb.a", "build/libm3d.a", "build/libcglm.a"], + "sourceFiles-linux": ["build/libvma.a", "build/libstb.a", "build/libm3d.a", "build/libcglm.a", "build/libcgltf.a"], "sourceFiles-windows": [], "importPaths": ["src/gears", "src/dlib", "src/dlib/external/xxhash", "src/VulkanRenderer"], "sourcePaths": ["src/gears", "src/dlib", "src/dlib/external/xxhash", "src/VulkanRenderer"], @@ -17,7 +17,7 @@ "preGenerateCommands-linux": ["./build.sh"], "preGenerateCommands-windows": [], "dflags": ["-Xcc=-mno-sse", "-P-I/usr/include/freetype2", "-Jbuild", "-Jassets/fonts"], - "dflags-dmd": ["-P=-DSTBI_NO_SIMD"] + "dflags-dmd": [] }, { "name": "packer", diff --git a/external/inteli/bmi2intrin.d b/external/inteli/bmi2intrin.d index 50bcde7..644a23c 100644 --- a/external/inteli/bmi2intrin.d +++ b/external/inteli/bmi2intrin.d @@ -1,363 +1,363 @@ -/** -* BMI2 intrinsics. -* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2 -* -* Copyright: Copyright Johan Engelen 2021. -* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) -*/ -module inteli.bmi2intrin; - -import inteli.internals; - -nothrow @nogc pure @safe: - -/// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index. -uint _bzhi_u32 (uint a, uint index) -{ - static if (GDC_or_LDC_with_BMI2) - { - if (!__ctfe) - return __builtin_ia32_bzhi_si(a, index); - else - return bzhi!uint(a, index); - } - else - { - return bzhi!uint(a, index); - } -} -unittest -{ - static assert (_bzhi_u32(0x1234_5678, 5) == 0x18); - assert (_bzhi_u32(0x1234_5678, 5) == 0x18); - static assert (_bzhi_u32(0x1234_5678, 10) == 0x278); - assert (_bzhi_u32(0x1234_5678, 10) == 0x278); - static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678); - assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678); -} - -/// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index. -ulong _bzhi_u64 (ulong a, uint index) -{ - static if (GDC_or_LDC_with_BMI2) - { - if (!__ctfe) - { - version(X86_64) - { - // This instruction not available in 32-bit x86. 
- return __builtin_ia32_bzhi_di(a, index); - } - else - return bzhi!ulong(a, index); - } - else - return bzhi!ulong(a, index); - } - else - { - return bzhi!ulong(a, index); - } -} -unittest -{ - static assert (_bzhi_u64(0x1234_5678, 5) == 0x18); - assert (_bzhi_u64(0x1234_5678, 5) == 0x18); - static assert (_bzhi_u64(0x1234_5678, 10) == 0x278); - assert (_bzhi_u64(0x1234_5678, 10) == 0x278); - static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678); - assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678); - static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678); - assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678); -} - -// Helper function for BZHI -private T bzhi(T)(T a, uint index) -{ - /+ - n := index[7:0] - dst := a - IF (n < number of bits) - dst[MSB:n] := 0 - FI - +/ - enum numbits = T.sizeof*8; - T dst = a; - if (index < numbits) - { - T mask = (T(1) << index) - 1; - dst &= mask; - } - return dst; -} - -/// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst, -/// and store the high 32-bits in `hi`. This does not read or write arithmetic flags. -/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say. -/// But, those particular semantics don't exist at the level of intrinsics. -uint _mulx_u32 (uint a, uint b, uint* hi) -{ - // Note: that does NOT generate mulx with LDC, and there seems to be no way to do that for - // some reason, even with LLVM IR. - // Also same with GDC. - ulong result = cast(ulong) a * b; - *hi = cast(uint) (result >>> 32); - return cast(uint)result; -} -@system unittest -{ - uint hi; - assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840); - assert (hi == 0x014B_66DC); -} - -/// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and -/// store the high 64-bits in `hi`. This does not read or write arithmetic flags. -/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say. -/// But, those particular semantics don't exist at the level of intrinsics. -ulong _mulx_u64 (ulong a, ulong b, ulong* hi) -{ - /+ - dst[63:0] := (a * b)[63:0] - MEM[hi+63:hi] := (a * b)[127:64] - +/ - - static if (LDC_with_optimizations) - { - static if (__VERSION__ >= 2094) - enum bool withLDCIR = true; - else - enum bool withLDCIR = false; - } - else - { - enum bool withLDCIR = false; - } - - static if (withLDCIR) - { - // LDC x86: Generates mulx from -O0 - enum ir = ` - %4 = zext i64 %0 to i128 - %5 = zext i64 %1 to i128 - %6 = mul nuw i128 %5, %4 - %7 = lshr i128 %6, 64 - %8 = trunc i128 %7 to i64 - store i64 %8, i64* %2, align 8 - %9 = trunc i128 %6 to i64 - ret i64 %9`; - return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi); - } - else - { - /+ Straight-forward implementation with `ucent`: - ucent result = cast(ucent) a * b; - *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF); - return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF); - +/ - - /+ - Implementation using 64bit math is more complex... - a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low) - = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low* (b_high << 32) + a_low*b_low - = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low - = c2 << 64 + c11 << 32 + c12 << 32 + c0 - = z1 << 64 + z0 - // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). 
We can do that - // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that - // intermediate result is then the 'carry' that we need to add when calculating z1's sum. - z0 = (c0 & 0xFFFF_FFFF) + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) << 32 - The carry part from z0's sum = (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32 - z1 = c2 + (c11 >> 32 + c12 >> 32 + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32 - +/ - - const ulong a_low = a & 0xFFFF_FFFF; - const ulong a_high = a >>> 32; - const ulong b_low = b & 0xFFFF_FFFF; - const ulong b_high = b >>> 32; - - const ulong c2 = a_high*b_high; - const ulong c11 = a_high*b_low; - const ulong c12 = a_low*b_high; - const ulong c0 = a_low*b_low; - - const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF); - const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32); - const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32); - - *hi = z1; - return z0; - } -} -@system unittest -{ - ulong hi; - // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100 - assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100); - assert (hi == 0x14b_66dc_33f6_acdc); -} - -/// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero. -uint _pdep_u32 (uint a, uint mask) -{ - static if (GDC_or_LDC_with_BMI2) - { - if (!__ctfe) - return __builtin_ia32_pdep_si(a, mask); - else - return pdep!uint(a, mask); - } - else - { - return pdep!uint(a, mask); - } -} -unittest -{ - static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708); - assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708); -} - -/// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero. -ulong _pdep_u64 (ulong a, ulong mask) -{ - static if (GDC_or_LDC_with_BMI2) - { - if (!__ctfe) - { - version(X86_64) - { - // This instruction not available in 32-bit x86. - return __builtin_ia32_pdep_di(a, mask); - } - else - return pdep!ulong(a, mask); - } - else - return pdep!ulong(a, mask); - } - else - { - return pdep!ulong(a, mask); - } -} -unittest -{ - static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201); - assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201); -} - -// Helper function for PDEP -private T pdep(T)(T a, T mask) -{ - /+ - tmp := a - dst := 0 - m := 0 - k := 0 - DO WHILE m < 32 - IF mask[m] == 1 - dst[m] := tmp[k] - k := k + 1 - FI - m := m + 1 - OD - +/ - T dst; - T k_bitpos = 1; - T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed - foreach (m; 0..T.sizeof*8) - { - if (mask & m_bitpos) - { - dst |= (a & k_bitpos) ? m_bitpos : 0; - k_bitpos <<= 1; - } - m_bitpos <<= 1; - } - return dst; -} - - -/// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by -/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero. 
-uint _pext_u32 (uint a, uint mask) -{ - static if (GDC_or_LDC_with_BMI2) - { - if (!__ctfe) - return __builtin_ia32_pext_si(a, mask); - else - return pext!uint(a, mask); - } - else - { - return pext!uint(a, mask); - } -} -unittest -{ - static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468); - assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468); -} - -/// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by -/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero. -ulong _pext_u64 (ulong a, ulong mask) -{ - static if (GDC_or_LDC_with_BMI2) - { - if (!__ctfe) - { - version(X86_64) - { - // This instruction not available in 32-bit x86. - return __builtin_ia32_pext_di(a, mask); - } - else - return pext!ulong(a, mask); - } - else - return pext!ulong(a, mask); - } - else - { - return pext!ulong(a, mask); - } -} -unittest -{ - static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531); - assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531); -} - -// Helper function for PEXT -private T pext(T)(T a, T mask) -{ - /+ - tmp := a - dst := 0 - m := 0 - k := 0 - DO WHILE m < number of bits in T - IF mask[m] == 1 - dst[k] := tmp[m] - k := k + 1 - FI - m := m + 1 - OD - +/ - T dst; - T k_bitpos = 1; - T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed - foreach (m; 0..T.sizeof*8) - { - if (mask & m_bitpos) - { - dst |= (a & m_bitpos) ? k_bitpos : 0; - k_bitpos <<= 1; - } - m_bitpos <<= 1; - } - return dst; -} +/** +* BMI2 intrinsics. +* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2 +* +* Copyright: Copyright Johan Engelen 2021. +* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) +*/ +module inteli.bmi2intrin; + +import inteli.internals; + +nothrow @nogc pure @safe: + +/// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index. +uint _bzhi_u32 (uint a, uint index) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + return __builtin_ia32_bzhi_si(a, index); + else + return bzhi!uint(a, index); + } + else + { + return bzhi!uint(a, index); + } +} +unittest +{ + static assert (_bzhi_u32(0x1234_5678, 5) == 0x18); + assert (_bzhi_u32(0x1234_5678, 5) == 0x18); + static assert (_bzhi_u32(0x1234_5678, 10) == 0x278); + assert (_bzhi_u32(0x1234_5678, 10) == 0x278); + static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678); + assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678); +} + +/// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index. +ulong _bzhi_u64 (ulong a, uint index) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + { + version(X86_64) + { + // This instruction not available in 32-bit x86. 
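+                // (the 64-bit operand form of bzhi only exists in 64-bit mode,
+                // so 32-bit targets fall back to the portable helper below)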
+ return __builtin_ia32_bzhi_di(a, index); + } + else + return bzhi!ulong(a, index); + } + else + return bzhi!ulong(a, index); + } + else + { + return bzhi!ulong(a, index); + } +} +unittest +{ + static assert (_bzhi_u64(0x1234_5678, 5) == 0x18); + assert (_bzhi_u64(0x1234_5678, 5) == 0x18); + static assert (_bzhi_u64(0x1234_5678, 10) == 0x278); + assert (_bzhi_u64(0x1234_5678, 10) == 0x278); + static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678); + assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678); + static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678); + assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678); +} + +// Helper function for BZHI +private T bzhi(T)(T a, uint index) +{ + /+ + n := index[7:0] + dst := a + IF (n < number of bits) + dst[MSB:n] := 0 + FI + +/ + enum numbits = T.sizeof*8; + T dst = a; + if (index < numbits) + { + T mask = (T(1) << index) - 1; + dst &= mask; + } + return dst; +} + +/// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst, +/// and store the high 32-bits in `hi`. This does not read or write arithmetic flags. +/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say. +/// But, those particular semantics don't exist at the level of intrinsics. +uint _mulx_u32 (uint a, uint b, uint* hi) +{ + // Note: that does NOT generate mulx with LDC, and there seems to be no way to do that for + // some reason, even with LLVM IR. + // Also same with GDC. + ulong result = cast(ulong) a * b; + *hi = cast(uint) (result >>> 32); + return cast(uint)result; +} +@system unittest +{ + uint hi; + assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840); + assert (hi == 0x014B_66DC); +} + +/// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and +/// store the high 64-bits in `hi`. This does not read or write arithmetic flags. +/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say. +/// But, those particular semantics don't exist at the level of intrinsics. +ulong _mulx_u64 (ulong a, ulong b, ulong* hi) +{ + /+ + dst[63:0] := (a * b)[63:0] + MEM[hi+63:hi] := (a * b)[127:64] + +/ + + static if (LDC_with_optimizations) + { + static if (__VERSION__ >= 2094) + enum bool withLDCIR = true; + else + enum bool withLDCIR = false; + } + else + { + enum bool withLDCIR = false; + } + + static if (withLDCIR) + { + // LDC x86: Generates mulx from -O0 + enum ir = ` + %4 = zext i64 %0 to i128 + %5 = zext i64 %1 to i128 + %6 = mul nuw i128 %5, %4 + %7 = lshr i128 %6, 64 + %8 = trunc i128 %7 to i64 + store i64 %8, i64* %2, align 8 + %9 = trunc i128 %6 to i64 + ret i64 %9`; + return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi); + } + else + { + /+ Straight-forward implementation with `ucent`: + ucent result = cast(ucent) a * b; + *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF); + return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF); + +/ + + /+ + Implementation using 64bit math is more complex... + a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low) + = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low* (b_high << 32) + a_low*b_low + = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low + = c2 << 64 + c11 << 32 + c12 << 32 + c0 + = z1 << 64 + z0 + // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). 
We can do that + // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that + // intermediate result is then the 'carry' that we need to add when calculating z1's sum. + z0 = (c0 & 0xFFFF_FFFF) + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) << 32 + The carry part from z0's sum = (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32 + z1 = c2 + (c11 >> 32 + c12 >> 32 + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32 + +/ + + const ulong a_low = a & 0xFFFF_FFFF; + const ulong a_high = a >>> 32; + const ulong b_low = b & 0xFFFF_FFFF; + const ulong b_high = b >>> 32; + + const ulong c2 = a_high*b_high; + const ulong c11 = a_high*b_low; + const ulong c12 = a_low*b_high; + const ulong c0 = a_low*b_low; + + const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF); + const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32); + const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32); + + *hi = z1; + return z0; + } +} +@system unittest +{ + ulong hi; + // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100 + assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100); + assert (hi == 0x14b_66dc_33f6_acdc); +} + +/// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero. +uint _pdep_u32 (uint a, uint mask) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + return __builtin_ia32_pdep_si(a, mask); + else + return pdep!uint(a, mask); + } + else + { + return pdep!uint(a, mask); + } +} +unittest +{ + static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708); + assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708); +} + +/// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero. +ulong _pdep_u64 (ulong a, ulong mask) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + { + version(X86_64) + { + // This instruction not available in 32-bit x86. + return __builtin_ia32_pdep_di(a, mask); + } + else + return pdep!ulong(a, mask); + } + else + return pdep!ulong(a, mask); + } + else + { + return pdep!ulong(a, mask); + } +} +unittest +{ + static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201); + assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201); +} + +// Helper function for PDEP +private T pdep(T)(T a, T mask) +{ + /+ + tmp := a + dst := 0 + m := 0 + k := 0 + DO WHILE m < 32 + IF mask[m] == 1 + dst[m] := tmp[k] + k := k + 1 + FI + m := m + 1 + OD + +/ + T dst; + T k_bitpos = 1; + T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed + foreach (m; 0..T.sizeof*8) + { + if (mask & m_bitpos) + { + dst |= (a & k_bitpos) ? m_bitpos : 0; + k_bitpos <<= 1; + } + m_bitpos <<= 1; + } + return dst; +} + + +/// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by +/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero. 
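+/// Example: `_pext_u32(0b1101_0111, 0b0101_0101)` gathers bits 0, 2, 4 and 6 of `a`
+/// (all set here) into the low bits, yielding 0b1111.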
+uint _pext_u32 (uint a, uint mask) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + return __builtin_ia32_pext_si(a, mask); + else + return pext!uint(a, mask); + } + else + { + return pext!uint(a, mask); + } +} +unittest +{ + static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468); + assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468); +} + +/// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by +/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero. +ulong _pext_u64 (ulong a, ulong mask) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + { + version(X86_64) + { + // This instruction not available in 32-bit x86. + return __builtin_ia32_pext_di(a, mask); + } + else + return pext!ulong(a, mask); + } + else + return pext!ulong(a, mask); + } + else + { + return pext!ulong(a, mask); + } +} +unittest +{ + static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531); + assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531); +} + +// Helper function for PEXT +private T pext(T)(T a, T mask) +{ + /+ + tmp := a + dst := 0 + m := 0 + k := 0 + DO WHILE m < number of bits in T + IF mask[m] == 1 + dst[k] := tmp[m] + k := k + 1 + FI + m := m + 1 + OD + +/ + T dst; + T k_bitpos = 1; + T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed + foreach (m; 0..T.sizeof*8) + { + if (mask & m_bitpos) + { + dst |= (a & m_bitpos) ? k_bitpos : 0; + k_bitpos <<= 1; + } + m_bitpos <<= 1; + } + return dst; +} diff --git a/external/inteli/smmintrin.d b/external/inteli/smmintrin.d index 16fdb47..bf2f517 100644 --- a/external/inteli/smmintrin.d +++ b/external/inteli/smmintrin.d @@ -1,2215 +1,2215 @@ -/** -* SSE4.1 intrinsics. -* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1 -* -* Copyright: Guillaume Piolat 2021. -* Johan Engelen 2021. -* cet 2024. -* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) -*/ -module inteli.smmintrin; - -// SSE4.1 instructions -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1 -// Note: this header will work whether you have SSE4.1 enabled or not. -// With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively -// generate SSE4.1 instructions. -// With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions. - -public import inteli.types; -import inteli.internals; - -// smmintrin pulls in all previous instruction set intrinsics. -public import inteli.tmmintrin; - -nothrow @nogc: - -enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes -enum int _MM_FROUND_TO_NEG_INF = 0x01; /// ditto -enum int _MM_FROUND_TO_POS_INF = 0x02; /// ditto -enum int _MM_FROUND_TO_ZERO = 0x03; /// ditto -enum int _MM_FROUND_CUR_DIRECTION = 0x04; /// ditto -enum int _MM_FROUND_RAISE_EXC = 0x00; /// ditto -enum int _MM_FROUND_NO_EXC = 0x08; /// ditto - -enum int _MM_FROUND_NINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT); -enum int _MM_FROUND_FLOOR = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF); -enum int _MM_FROUND_CEIL = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF); -enum int _MM_FROUND_TRUNC = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO); -enum int _MM_FROUND_RINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION); -enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION); - -/// Add packed signed 32-bit integers in `a` and `b` using saturation. 
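-/// For instance, `int.max + 1` saturates to `int.max` instead of wrapping around to `int.min`.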
-/// #BONUS -__m128i _mm_adds_epi32(__m128i a, __m128i b) pure -{ - // PERF: ARM64 should use 2x vqadd_s32 - static if (LDC_with_saturated_intrinsics) - return cast(__m128i)inteli_llvm_adds!int4(cast(int4)a, cast(int4)b); - else - { - __m128i int_max = _mm_set1_epi32(0x7FFFFFFF); - __m128i res = _mm_add_epi32(a, b); - __m128i sign_bit = _mm_srli_epi32(a, 31); - __m128i sign_xor = _mm_xor_si128(a, b); - __m128i overflow = _mm_andnot_si128(sign_xor, _mm_xor_si128(a, res)); - __m128i saturated = _mm_add_epi32(int_max, sign_bit); - return cast(__m128i) _mm_blendv_ps(cast(__m128)res, - cast(__m128)saturated, - cast(__m128)overflow); - } -} -unittest -{ - __m128i a = _mm_setr_epi32(int.max, 1, 2, int.min); - __m128i b = _mm_setr_epi32(1, 2, 3, -4); - assert(_mm_adds_epi32(a, b).array == [int.max, 3, 5, int.min]); -} - -/// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results. -// Note: changed signature, GDC needs a compile-time value for imm8. -__m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - pragma(inline, true); // else wouldn't inline in _mm256_blend_epi16 - return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8); - } - else - { - // LDC x86 This generates pblendw since LDC 1.1 and -O2 - short8 r; - short8 sa = cast(short8)a; - short8 sb = cast(short8)b; - for (int n = 0; n < 8; ++n) - { - r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n]; - } - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); - __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); - short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011 - short[8] correct = [8, 9, 2, 3, 12, 5, 6, 15]; - assert(C.array == correct); -} - - -/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`. -// Note: changed signature, GDC needs a compile-time value for `imm8`. -__m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted -{ - static assert(imm8 >= 0 && imm8 < 4); - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8); - } - else - { - // LDC x86: blendpd since LDC 1.1 -02, uses blendps after LDC 1.12 - double2 r; - for (int n = 0; n < 2; ++n) - { - r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n]; - } - return cast(__m128d)r; - } -} -unittest -{ - __m128d A = _mm_setr_pd(0, 1); - __m128d B = _mm_setr_pd(8, 9); - double2 C = _mm_blend_pd!2(A, B); - double[2] correct = [0, 9]; - assert(C.array == correct); -} - - -/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control -/// mask `imm8`. -// Note: changed signature, GDC needs a compile-time value for imm8. -__m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) pure @trusted -{ - // PERF DMD - static assert(imm8 >= 0 && imm8 < 16); - static if (GDC_with_SSE41) - { - return __builtin_ia32_blendps(a, b, imm8); - } - else version(LDC) - { - // LDC x86: generates blendps since LDC 1.1 -O2 - // arm64: pretty good, two instructions worst case - return shufflevectorLDC!(float4, (imm8 & 1) ? 4 : 0, - (imm8 & 2) ? 5 : 1, - (imm8 & 4) ? 6 : 2, - (imm8 & 8) ? 7 : 3)(a, b); - } - else - { - // PERF GDC without SSE4.1 is quite bad - __m128 r; - for (int n = 0; n < 4; ++n) - { - r.ptr[n] = (imm8 & (1 << n)) ? 
b.array[n] : a.array[n]; - } - return r; - } -} -unittest -{ - __m128 A = _mm_setr_ps(0, 1, 2, 3); - __m128 B = _mm_setr_ps(8, 9, 10, 11); - float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101 - float[4] correct = [8, 1, 10, 11]; - assert(C.array == correct); -} - -/// Blend packed 8-bit integers from `a` and `b` using `mask`. -/// Select from `b` if the high-order bit of the corresponding 8-bit element in `mask` is set, else select from `a`. -__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) pure @trusted -{ - // PERF DMD - /*static if (GDC_with_SSE41) - { - // This intrinsic do nothing in GDC 12. - // TODO report to GDC. No problem in GCC. - return cast(__m128i) __builtin_ia32_pblendvb128 (cast(ubyte16)a, cast(ubyte16)b, cast(ubyte16)mask); - } - else*/ - static if (LDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask); - } - else static if (LDC_with_ARM64) - { - // LDC arm64: two instructions since LDC 1.12 -O2 - byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7); - return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a); - } - else - { - __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask); - return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b); - } -} -unittest -{ - __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15); - __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31); - __m128i M = _mm_setr_epi8( 1, -1, 1, 1, -4, 1, -8, 127, - 1, 1, -1, -1, 4, 1, 8, -128); - byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M); - byte[16] correct = [ 0, 17, 2, 3, 20, 5, 22, 7, - 8, 9, 26, 27, 12, 13, 14, 31 ]; - assert(R.array == correct); -} - - -/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`. -__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted -{ - // PERF DMD - static if (GDC_with_SSE42) - { - // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction - // with -msse4.2 but not -msse4.1. - // Not sure what is the reason, and there is a replacement sequence. - // Sounds like a bug. - return __builtin_ia32_blendvpd(a, b, mask); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_blendvpd(a, b, mask); - } - else static if (LDC_with_ARM64) - { - long2 shift; - shift = 63; - long2 lmask = cast(long2)mask >> shift; - return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a); - } - else - { - __m128d r; // PERF =void; - long2 lmask = cast(long2)mask; - for (int n = 0; n < 2; ++n) - { - r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n]; - } - return r; - } -} -unittest -{ - __m128d A = _mm_setr_pd(1.0, 2.0); - __m128d B = _mm_setr_pd(3.0, 4.0); - __m128d M1 = _mm_setr_pd(-3.0, 2.0); - __m128d R1 = _mm_blendv_pd(A, B, M1); - double[2] correct1 = [3.0, 2.0]; - assert(R1.array == correct1); - - // Note: wouldn't work with -double.nan, since in some AArch64 archs the NaN sign bit is lost - // See Issue #78 - __m128d M2 = _mm_setr_pd(double.nan, double.infinity); - __m128d R2 = _mm_blendv_pd(A, B, M2); - double[2] correct2 = [1.0, 2.0]; - assert(R2.array == correct2); -} - - -/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`. 
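-/// Each 32-bit lane takes `b` when the sign bit of the corresponding lane of `mask` is set, else `a`.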
-__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return __builtin_ia32_blendvps(a, b, mask); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_blendvps(a, b, mask); - } - else static if (LDC_with_ARM64) - { - int4 shift; - shift = 31; - int4 lmask = cast(int4)mask >> shift; - return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a); - } - else - { - // LDC x86_64: Compiles to 5 instr since LDC 1.27 -O2 - // If lack of optimization, consider replacing by: - // __m128i overflow_mask = _mm_srai_epi32(overflow, 31); - // return _mm_or_si128( - // _mm_and_si128(overflow_mask, saturated), - // _mm_andnot_si128(overflow_mask, res) - // LLVM makes almost the same sequence when optimized. - __m128 r; - int4 lmask = cast(int4)mask; - for (int n = 0; n < 4; ++n) - { - r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n]; - } - return r; - } -} -unittest -{ - __m128 A = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f); - __m128 B = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f); - __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f); - __m128 M2 = _mm_setr_ps(float.nan, float.nan, -0.0f, +0.0f); - __m128 R1 = _mm_blendv_ps(A, B, M1); - __m128 R2 = _mm_blendv_ps(A, B, M2); - float[4] correct1 = [ 4.0f, 1.0f, 2.0f, 7.0f]; - float[4] correct2 = [ 0.0f, 1.0f, 6.0f, 3.0f]; - assert(R1.array == correct1); - - // Note: wouldn't work with -float.nan, since in some AArch64 archs the NaN sign bit is lost - // See Issue #78 - assert(R2.array == correct2); -} - -/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value, -/// and store the results as packed double-precision floating-point elements. -__m128d _mm_ceil_pd (__m128d a) @trusted -{ - static if (LDC_with_ARM64) - { - // LDC arm64 acceptable since 1.8 -O2 - // Unfortunately x86 intrinsics force a round-trip back to double2 - // ARM neon semantics wouldn't have that - long2 l = vcvtpq_s64_f64(a); - double2 r; - r.ptr[0] = l.array[0]; - r.ptr[1] = l.array[1]; - return r; - } - else - { - return _mm_round_pd!2(a); - } -} -unittest -{ - __m128d A = _mm_setr_pd(1.3f, -2.12f); - __m128d B = _mm_setr_pd(53.6f, -2.7f); - A = _mm_ceil_pd(A); - B = _mm_ceil_pd(B); - double[2] correctA = [2.0, -2.0]; - double[2] correctB = [54.0, -2.0]; - assert(A.array == correctA); - assert(B.array == correctB); -} - -/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value, -/// and store the results as packed single-precision floating-point elements. -__m128 _mm_ceil_ps (__m128 a) @trusted -{ - static if (LDC_with_ARM64) - { - // LDC arm64 acceptable since 1.8 -O1 - int4 l = vcvtpq_s32_f32(a); - float4 r; - r.ptr[0] = l.array[0]; - r.ptr[1] = l.array[1]; - r.ptr[2] = l.array[2]; - r.ptr[3] = l.array[3]; - return r; - } - else - { - return _mm_round_ps!2(a); - } -} -unittest -{ - __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f); - __m128 C = _mm_ceil_ps(A); - float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f]; - assert(C.array == correct); -} - -/// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value, -/// store the result as a double-precision floating-point element in the lower element of result, -/// and copy the upper element from `a` to the upper element of dst. 
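-/// For instance, `_mm_ceil_sd([1.3, -2.12], [53.6, -3.7])` gives `[54.0, -2.12]`.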
-__m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted -{ - static if (LDC_with_ARM64) - { - a[0] = vcvtps_s64_f64(b[0]); - return a; - } - else - { - return _mm_round_sd!2(a, b); - } -} -unittest -{ - __m128d A = _mm_setr_pd(1.3, -2.12); - __m128d B = _mm_setr_pd(53.6, -3.7); - __m128d C = _mm_ceil_sd(A, B); - double[2] correct = [54.0, -2.12]; - assert(C.array == correct); -} - -/// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value, -/// store the result as a single-precision floating-point element in the lower element of result, -/// and copy the upper 3 packed elements from `a` to the upper elements of result. -__m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted -{ - static if (LDC_with_ARM64) - { - a[0] = vcvtps_s32_f32(b[0]); - return a; - } - else - { - return _mm_round_ss!2(a, b); - } -} -unittest -{ - __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f); - __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f); - __m128 C = _mm_ceil_ss(A, B); - float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f]; - assert(C.array == correct); -} - -/// Compare packed 64-bit integers in `a` and `b` for equality. -__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted -{ - static if (SIMD_COMPARISON_MASKS_16B) - { - version(DigitalMars) - { - // DMD doesn't recognize long2 == long2 - long2 la = cast(long2)a; - long2 lb = cast(long2)b; - long2 res; - res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0; - res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0; - return cast(__m128i)res; - } - else - { - return cast(__m128i)(cast(long2)a == cast(long2)b); - } - } - else static if (GDC_with_SSE41) - { - return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b); - } - else version(LDC) - { - // LDC x86: generates pcmpeqq since LDC 1.1 -O1 - // arm64: generates cmeq since LDC 1.8 -O1 - return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b); - } - else - { - // Clever pcmpeqd + pand use with LDC 1.24 -O2 - long2 la = cast(long2)a; - long2 lb = cast(long2)b; - long2 res; - res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0; - res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0; - return cast(__m128i)res; - } -} -unittest -{ - __m128i A = _mm_setr_epi64(-1, -2); - __m128i B = _mm_setr_epi64(-3, -2); - __m128i C = _mm_setr_epi64(-1, -4); - long2 AB = cast(long2) _mm_cmpeq_epi64(A, B); - long2 AC = cast(long2) _mm_cmpeq_epi64(A, C); - long[2] correct1 = [0, -1]; - long[2] correct2 = [-1, 0]; - assert(AB.array == correct1); - assert(AC.array == correct2); -} - - -/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers. -__m128i _mm_cvtepi16_epi32 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a); - } - else static if (LDC_with_optimizations) - { - // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64 - enum ir = ` - %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> - %r = sext <4 x i16> %v to <4 x i32> - ret <4 x i32> %r`; - return cast(__m128d) LDCInlineIR!(ir, int4, short8)(cast(short8)a); - } - else - { - short8 sa = cast(short8)a; - int4 r; - r.ptr[0] = sa.array[0]; - r.ptr[1] = sa.array[1]; - r.ptr[2] = sa.array[2]; - r.ptr[3] = sa.array[3]; - return r; - } -} -unittest -{ - __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0); - int4 C = cast(int4) _mm_cvtepi16_epi32(A); - int[4] correct = [-1, 0, -32768, 32767]; - assert(C.array == correct); -} - -/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers. 
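-/// Only the low two 16-bit lanes of `a` participate.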
-__m128i _mm_cvtepi16_epi64 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a); - } - else static if (LDC_with_optimizations) - { - // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64 - enum ir = ` - %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> - %r = sext <2 x i16> %v to <2 x i64> - ret <2 x i64> %r`; - return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a); - } - else - { - short8 sa = cast(short8)a; - long2 r; - r.ptr[0] = sa.array[0]; - r.ptr[1] = sa.array[1]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0); - long2 C = cast(long2) _mm_cvtepi16_epi64(A); - long[2] correct = [-32768, 32767]; - assert(C.array == correct); -} - -/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers. -__m128i _mm_cvtepi32_epi64 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a); - } - else static if (LDC_with_optimizations) - { - // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64 - enum ir = ` - %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> - %r = sext <2 x i32> %v to <2 x i64> - ret <2 x i64> %r`; - return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a); - } - else - { - int4 sa = cast(int4)a; - long2 r; - r.ptr[0] = sa.array[0]; - r.ptr[1] = sa.array[1]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi32(-4, 42, 0, 0); - long2 C = cast(long2) _mm_cvtepi32_epi64(A); - long[2] correct = [-4, 42]; - assert(C.array == correct); -} - - -/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers. -__m128i _mm_cvtepi8_epi16 (__m128i a) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - alias ubyte16 = __vector(ubyte[16]); - return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a); - } - else static if (LDC_with_optimizations) - { - // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0 - // LDC ARM64: sshll generated since LDC 1.8.0 -O1 - enum ir = ` - %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> - %r = sext <8 x i8> %v to <8 x i16> - ret <8 x i16> %r`; - return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a); - } - else - { - byte16 sa = cast(byte16)a; - short8 r; - foreach(n; 0..8) - r.ptr[n] = sa.array[n]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); - short8 C = cast(short8) _mm_cvtepi8_epi16(A); - short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8]; - assert(C.array == correct); -} - - -/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers. 
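-/// Only the low four 8-bit lanes of `a` participate.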
-__m128i _mm_cvtepi8_epi32 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - alias ubyte16 = __vector(ubyte[16]); - return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a); - } - else static if (LDC_with_SSE41 && LDC_with_optimizations) - { - // LDC x86: Generates pmovsxbd since LDC 1.1 -O0 - enum ir = ` - %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> - %r = sext <4 x i8> %v to <4 x i32> - ret <4 x i32> %r`; - return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a); - } - else - { - // LDC ARM64: this gives the same codegen than a vmovl_s16/vmovl_s8 sequence would - byte16 sa = cast(byte16)a; - int4 r; - r.ptr[0] = sa.array[0]; - r.ptr[1] = sa.array[1]; - r.ptr[2] = sa.array[2]; - r.ptr[3] = sa.array[3]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); - int4 C = cast(int4) _mm_cvtepi8_epi32(A); - int[4] correct = [127, -128, 1, -1]; - assert(C.array == correct); -} - - -/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers. -__m128i _mm_cvtepi8_epi64 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - alias ubyte16 = __vector(ubyte[16]); - return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a); - } - else static if (LDC_with_optimizations) - { - // LDC x86: Generates pmovsxbq since LDC 1.1 -O0, - // LDC arm64: it's ok since LDC 1.8 -O1 - enum ir = ` - %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> - %r = sext <2 x i8> %v to <2 x i64> - ret <2 x i64> %r`; - return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a); - } - else - { - byte16 sa = cast(byte16)a; - long2 r; - foreach(n; 0..2) - r.ptr[n] = sa.array[n]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); - long2 C = cast(long2) _mm_cvtepi8_epi64(A); - long[2] correct = [127, -128]; - assert(C.array == correct); -} - - -/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers. -__m128i _mm_cvtepu16_epi32 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a); - } - else - { - // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1 - // arm64: ushll since LDC 1.12 -O1 - short8 sa = cast(short8)a; - int4 r; - r.ptr[0] = cast(ushort)sa.array[0]; - r.ptr[1] = cast(ushort)sa.array[1]; - r.ptr[2] = cast(ushort)sa.array[2]; - r.ptr[3] = cast(ushort)sa.array[3]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0); - int4 C = cast(int4) _mm_cvtepu16_epi32(A); - int[4] correct = [65535, 0, 32768, 32767]; - assert(C.array == correct); -} - - -/// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers. 
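-/// Only the low two 16-bit lanes of `a` participate; e.g. a lane holding 0xFFFF widens to 65535, not -1.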
-__m128i _mm_cvtepu16_epi64 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a); - } - else static if (LDC_with_ARM64) - { - // LDC arm64: a bit shorter than below, in -O2 - short8 sa = cast(short8)a; - long2 r; - for(int n = 0; n < 2; ++n) - r.ptr[n] = cast(ushort)sa.array[n]; - return cast(__m128i)r; - } - else - { - // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1 - short8 sa = cast(short8)a; - long2 r; - r.ptr[0] = cast(ushort)sa.array[0]; - r.ptr[1] = cast(ushort)sa.array[1]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0); - long2 C = cast(long2) _mm_cvtepu16_epi64(A); - long[2] correct = [65535, 0]; - assert(C.array == correct); -} - - -/// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers. -__m128i _mm_cvtepu32_epi64 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(short8)a); - } - else - { - // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1 - // arm64: generates ushll since LDC 1.12 -O1 - int4 sa = cast(int4)a; - long2 r; - r.ptr[0] = cast(uint)sa.array[0]; - r.ptr[1] = cast(uint)sa.array[1]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi32(-1, 42, 0, 0); - long2 C = cast(long2) _mm_cvtepu32_epi64(A); - long[2] correct = [4294967295, 42]; - assert(C.array == correct); -} - - -/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers. -__m128i _mm_cvtepu8_epi16 (__m128i a) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a); - } - else static if (LDC_with_optimizations) - { - enum ir = ` - %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> - %r = zext <8 x i8> %v to <8 x i16> - ret <8 x i16> %r`; - return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a); - } - else - { - return _mm_unpacklo_epi8(a, _mm_setzero_si128()); - } -} -unittest -{ - __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); - short8 C = cast(short8) _mm_cvtepu8_epi16(A); - short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248]; - assert(C.array == correct); -} - - -/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers. -__m128i _mm_cvtepu8_epi32 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - alias ubyte16 = __vector(ubyte[16]); - return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a); - } - else static if (LDC_with_ARM64) - { - // LDC arm64: a bit better than below in -O2 - byte16 sa = cast(byte16)a; - int4 r; - for(int n = 0; n < 4; ++n) - r.ptr[n] = cast(ubyte)sa.array[n]; - return cast(__m128i)r; - } - else - { - // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1 - // PERF: catastrophic with GDC without SSE4.1 - byte16 sa = cast(byte16)a; - int4 r; - r.ptr[0] = cast(ubyte)sa.array[0]; - r.ptr[1] = cast(ubyte)sa.array[1]; - r.ptr[2] = cast(ubyte)sa.array[2]; - r.ptr[3] = cast(ubyte)sa.array[3]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); - int4 C = cast(int4) _mm_cvtepu8_epi32(A); - int[4] correct = [127, 128, 1, 255]; - assert(C.array == correct); -} - -/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers. 
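-/// In practice only the low two bytes contribute; e.g. the bytes `[127, 0xFE]` widen to `[127, 254]`.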
-__m128i _mm_cvtepu8_epi64 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - alias ubyte16 = __vector(ubyte[16]); - return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a); - } - else static if (LDC_with_ARM64) - { - // LDC arm64: this optimizes better than the loop below - byte16 sa = cast(byte16)a; - long2 r; - for (int n = 0; n < 2; ++n) - r.ptr[n] = cast(ubyte)sa.array[n]; - return cast(__m128i)r; - } - else - { - // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1 - byte16 sa = cast(byte16)a; - long2 r; - r.ptr[0] = cast(ubyte)sa.array[0]; - r.ptr[1] = cast(ubyte)sa.array[1]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); - long2 C = cast(long2) _mm_cvtepu8_epi64(A); - long[2] correct = [127, 254]; - assert(C.array == correct); -} - -/// Conditionally multiply the packed double-precision (64-bit) floating-point elements -/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, and conditionally -/// store the sum in dst using the low 4 bits of `imm8`. -__m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return __builtin_ia32_dppd(a, b, imm8 & 0x33); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_dppd(a, b, imm8 & 0x33); - } - else - { - __m128d zero = _mm_setzero_pd(); - __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b); - double sum = temp.array[0] + temp.array[1]; - return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum)); - } -} -unittest -{ - __m128d A = _mm_setr_pd(1.0, 2.0); - __m128d B = _mm_setr_pd(4.0, 8.0); - double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B); - double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B); - double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B); - double[2] correct1 = [ 4.0, 4.0]; - double[2] correct2 = [16.0, 0.0]; - double[2] correct3 = [ 0.0, 20.0]; - assert(R1.array == correct1); - assert(R2.array == correct2); - assert(R3.array == correct3); -} - -/// Conditionally multiply the packed single-precision (32-bit) floating-point elements -/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, -/// and conditionally store the sum in result using the low 4 bits of `imm8`. -__m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return __builtin_ia32_dpps(a, b, cast(ubyte)imm8); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_dpps(a, b, cast(byte)imm8); - } - else - { - __m128 zero = _mm_setzero_ps(); - __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b); - float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3]; - return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum)); - } -} -unittest -{ - __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f); - __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f); - float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B); - float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B); - float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B); - float[4] correct1 = [67.0f, 67.0f, 67.0f, 67.0f]; - float[4] correct2 = [23.0f, 0.0f, 23.0f, 0.0f]; - float[4] correct3 = [0.0f, 29.0f, 0.0f, 29.0f]; - assert(R1.array == correct1); - assert(R2.array == correct2); - assert(R3.array == correct3); -} - - -/// Extract a 32-bit integer from `a`, selected with `imm8`. 
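-/// Only `imm8[1:0]` is honoured, so the index effectively wraps modulo 4.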
-int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted -{ - return (cast(int4)a).array[imm8 & 3]; -} -unittest -{ - __m128i A = _mm_setr_epi32(1, 2, 3, 4); - assert(_mm_extract_epi32(A, 0) == 1); - assert(_mm_extract_epi32(A, 1 + 8) == 2); - assert(_mm_extract_epi32(A, 3 + 4) == 4); -} - -/// Extract a 64-bit integer from `a`, selected with `imm8`. -long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted -{ - long2 la = cast(long2)a; - return la.array[imm8 & 1]; -} -unittest -{ - __m128i A = _mm_setr_epi64(45, -67); - assert(_mm_extract_epi64(A, 0) == 45); - assert(_mm_extract_epi64(A, 1) == -67); - assert(_mm_extract_epi64(A, 2) == 45); -} - -/// Extract an 8-bit integer from `a`, selected with `imm8`. -/// Warning: the returned value is zero-extended to 32-bits. -int _mm_extract_epi8 (__m128i a, const int imm8) @trusted -{ - byte16 ba = cast(byte16)a; - return cast(ubyte) ba.array[imm8 & 15]; -} -unittest -{ - __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15); - assert(_mm_extract_epi8(A, 7) == 7); - assert(_mm_extract_epi8(A, 13) == 255); - assert(_mm_extract_epi8(A, 7 + 16) == 7); -} - -/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`. -/// Note: returns a 32-bit $(I integer). -int _mm_extract_ps (__m128 a, const int imm8) @trusted -{ - return (cast(int4)a).array[imm8 & 3]; -} -unittest -{ - __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f); - assert(_mm_extract_ps(A, 0) == 0x3f800000); - assert(_mm_extract_ps(A, 1 + 8) == 0x40000000); - assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000); -} - - - -/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an -/// integer value, and store the results as packed double-precision floating-point elements. -__m128d _mm_floor_pd (__m128d a) @trusted -{ - static if (LDC_with_ARM64) - { - // LDC arm64 acceptable since 1.8 -O2 - long2 l = vcvtmq_s64_f64(a); - double2 r; - r.ptr[0] = l.array[0]; - r.ptr[1] = l.array[1]; - return r; - } - else - { - return _mm_round_pd!1(a); - } -} -unittest -{ - __m128d A = _mm_setr_pd(1.3f, -2.12f); - __m128d B = _mm_setr_pd(53.6f, -2.7f); - A = _mm_floor_pd(A); - B = _mm_floor_pd(B); - double[2] correctA = [1.0, -3.0]; - double[2] correctB = [53.0, -3.0]; - assert(A.array == correctA); - assert(B.array == correctB); -} - -/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an -/// integer value, and store the results as packed single-precision floating-point elements. -__m128 _mm_floor_ps (__m128 a) @trusted -{ - static if (LDC_with_ARM64) - { - // LDC arm64 acceptable since 1.8 -O1 - int4 l = vcvtmq_s32_f32(a); - float4 r; - r.ptr[0] = l.array[0]; - r.ptr[1] = l.array[1]; - r.ptr[2] = l.array[2]; - r.ptr[3] = l.array[3]; - return r; - } - else - { - return _mm_round_ps!1(a); - } -} -unittest -{ - __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f); - __m128 C = _mm_floor_ps(A); - float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f]; - assert(C.array == correct); -} - -/// Round the lower double-precision (64-bit) floating-point element in `b` down to an -/// integer value, store the result as a double-precision floating-point element in the -/// lower element, and copy the upper element from `a` to the upper element. 
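-/// For instance, `_mm_floor_sd([1.3, -2.12], [-53.1, -3.7])` gives `[-54.0, -2.12]`.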
-__m128d _mm_floor_sd (__m128d a, __m128d b) @trusted -{ - static if (LDC_with_ARM64) - { - a[0] = vcvtms_s64_f64(b[0]); - return a; - } - else - { - return _mm_round_sd!1(a, b); - } -} -unittest -{ - __m128d A = _mm_setr_pd(1.3, -2.12); - __m128d B = _mm_setr_pd(-53.1, -3.7); - __m128d C = _mm_floor_sd(A, B); - double[2] correct = [-54.0, -2.12]; - assert(C.array == correct); -} - -/// Round the lower single-precision (32-bit) floating-point element in `b` down to an -/// integer value, store the result as a single-precision floating-point element in the -/// lower element, and copy the upper 3 packed elements from `a` to the upper elements. -__m128 _mm_floor_ss (__m128 a, __m128 b) @trusted -{ - static if (LDC_with_ARM64) - { - a[0] = vcvtms_s32_f32(b[0]); - return a; - } - else - { - return _mm_round_ss!1(a, b); - } -} -unittest -{ - __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f); - __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f); - __m128 C = _mm_floor_ss(A, B); - float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f]; - assert(C.array == correct); -} - -/// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`. -__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted -{ - // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1 - // LDC x86: psinrd since LDC 1.1 -O2 with -mattr=+sse4.1 - // LDC arm64: ins.s since LDC 1.8 -O2 - int4 ia = cast(int4)a; - ia.ptr[imm8 & 3] = i; - return cast(__m128i)ia; -} -unittest -{ - __m128i A = _mm_setr_epi32(1, 2, 3, 4); - int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4); - int[4] result = [1, 2, 5, 4]; - assert(C.array == result); -} - -/// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`. -__m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted -{ - // GDC: nothing special to do, psinrq generated with -O1 -msse4.1 - // LDC x86: always do something sensible. - long2 la = cast(long2)a; - la.ptr[imm8 & 1] = i; - return cast(__m128i)la; -} -unittest -{ - __m128i A = _mm_setr_epi64(1, 2); - long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2); - long[2] result = [1, 5]; - assert(C.array == result); -} - -/// Insert the 8-bit integer `i` into `a` at the location specified by `imm8[2:0]`. -/// Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8. -__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted -{ - // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1 - // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory. - byte16 ba = cast(byte16)a; - ba.ptr[imm8 & 15] = cast(byte)i; - return cast(__m128i)ba; -} -unittest -{ - __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16); - byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; - assert(C.array == result); -} - - -/// Warning: of course it does something totally different from `_mm_insert_epi32`! -/// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b` -/// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]` -/// (elements are zeroed out when the corresponding bit is set). 
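-/// `imm8[7:6]` selects the source lane of `b`, `imm8[5:4]` the destination lane in `tmp`,
-/// and each bit set in `imm8[3:0]` zeroes the corresponding result lane.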
-__m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return __builtin_ia32_insertps128(a, b, cast(ubyte)imm8); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_insertps128(a, b, cast(byte)imm8); - } - else - { - float4 tmp2 = a; - float tmp1 = b.array[(imm8 >> 6) & 3]; - tmp2.ptr[(imm8 >> 4) & 3] = tmp1; - return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps()); - } -} -unittest -{ - __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); - __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); - __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B); - float[4] correct = [1.0f, 2.0f, 0.0f, 7.0f]; - assert(C.array == correct); -} - - -/// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values. -__m128i _mm_max_epi32 (__m128i a, __m128i b) pure @trusted -{ - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b); - } - else version(LDC) - { - // x86: pmaxsd since LDC 1.1 -O1 - // ARM: smax.4s since LDC 1.8 -01 - int4 sa = cast(int4)a; - int4 sb = cast(int4)b; - static if (SIMD_COMPARISON_MASKS_16B) - int4 greater = sa > sb; - else - int4 greater = greaterMask!int4(sa, sb); - return cast(__m128i)( (greater & sa) | (~greater & sb) ); - } - else - { - __m128i higher = _mm_cmpgt_epi32(a, b); - __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b - __m128i mask = _mm_and_si128(aTob, higher); - return _mm_xor_si128(b, mask); - } -} -unittest -{ - int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7), - _mm_setr_epi32( -4,-8, 9, -8)); - int[4] correct = [0x7fffffff, 1, 9, 7]; - assert(R.array == correct); -} - -/// Compare packed signed 8-bit integers in `a` and `b`, -/// and return packed maximum values. -__m128i _mm_max_epi8 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b); - } - else version(LDC) - { - // x86: pmaxsb since LDC 1.1 -O1 - // ARM64: smax.16b since LDC 1.8.0 -O1 - byte16 sa = cast(byte16)a; - byte16 sb = cast(byte16)b; - static if (SIMD_COMPARISON_MASKS_16B) - byte16 greater = sa > sb; - else - byte16 greater = cast(byte16) greaterMask!byte16(sa, sb); - return cast(__m128i)( (greater & sa) | (~greater & sb) ); - } - else - { - __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else - __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b - __m128i mask = _mm_and_si128(aTob, lower); - return _mm_xor_si128(b, mask); - } -} -unittest -{ - __m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); - __m128i B = _mm_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - byte16 R = cast(byte16) _mm_max_epi8(A, B); - byte[16] correct = [127, 1, 9, -7, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0]; - assert(R.array == correct); -} - -/// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values. -__m128i _mm_max_epu16 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b); - } - else version(LDC) - { - // x86: pmaxuw since LDC 1.1 -O1 - // ARM64: umax.8h since LDC 1.8.0 -O1 - // PERF: without sse4.1, LLVM 12 produces a very interesting - // psubusw xmm0, xmm1 - // paddw xmm0, xmm1 - // sequence that maybe should go in other min/max intrinsics? 
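-        // (that sequence works because, with unsigned saturating subtraction,
-        // (a - b) + b == max(a, b))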
- ushort8 sa = cast(ushort8)a; - ushort8 sb = cast(ushort8)b; - static if (SIMD_COMPARISON_MASKS_16B) - { - // Note: doesn't work well with GDC, which prefers the builtin. - ushort8 greater = sa > sb; - } - else - ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb); - return cast(__m128i)( (greater & sa) | (~greater & sb) ); - } - else - { - b = _mm_subs_epu16(b, a); - b = _mm_add_epi16(b, a); - return b; - } -} -unittest -{ - short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57), - _mm_setr_epi16( -4, -8, 9, -7, 0,-32768, 0, 0)); - short[8] correct = [ -4, -8, -4, -7, 9,-32768, 0, 57]; - assert(R.array == correct); -} - -/// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values. -__m128i _mm_max_epu32 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b); - } - else version(LDC) - { - // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1 - // ARM64: umax.4s since LDC 1.8.0 -O1 - uint4 sa = cast(uint4)a; - uint4 sb = cast(uint4)b; - static if (SIMD_COMPARISON_MASKS_16B) - uint4 greater = sa > sb; - else - uint4 greater = cast(uint4) greaterMask!uint4(sa, sb); - return cast(__m128i)( (greater & sa) | (~greater & sb) ); - } - else - { - // PERF: LLVM suggests to replace the _mm_add_epi32 by _mm_xor_si128, and the last xor by an "_mm_or_si128" - /+ - movdqa xmm2, xmmword ptr [-0x80000000, -0x80000000, -0x80000000, -0x80000000] - movdqa xmm3, xmm1 - pxor xmm3, xmm2 - pxor xmm2, xmm0 - pcmpgtd xmm2, xmm3 - pand xmm0, xmm2 - pandn xmm2, xmm1 - por xmm0, xmm2 - +/ - __m128i valueShift = _mm_set1_epi32(-0x80000000); - __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift)); - __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b - __m128i mask = _mm_and_si128(aTob, higher); - return _mm_xor_si128(b, mask); - } -} -unittest -{ - int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7), - _mm_setr_epi32( -4,-8, 9, -8)); - int[4] correct = [ -4,-8, 9, -7]; - assert(R.array == correct); -} - -/// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values. -__m128i _mm_min_epi32 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b); - } - else version(LDC) - { - // x86: pminsd since LDC 1.1 -O1, also good without sse4.1 - // ARM: smin.4s since LDC 1.8 -01 - int4 sa = cast(int4)a; - int4 sb = cast(int4)b; - static if (SIMD_COMPARISON_MASKS_16B) - int4 greater = sa > sb; - else - int4 greater = greaterMask!int4(sa, sb); - return cast(__m128i)( (~greater & sa) | (greater & sb) ); - } - else - { - __m128i higher = _mm_cmplt_epi32(a, b); - __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b - __m128i mask = _mm_and_si128(aTob, higher); - return _mm_xor_si128(b, mask); - } -} -unittest -{ - int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7), - _mm_setr_epi32( -4, -8, 9, -8)); - int[4] correct = [ -4, -8, -4, -8]; - assert(R.array == correct); -} - -/// Compare packed signed 8-bit integers in `a` and `b`, -/// and return packed minimum values. 
-__m128i _mm_min_epi8 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b); - } - else version(LDC) - { - // x86: pminsb since LDC 1.1 -O1 - // ARM64: smin.16b since LDC 1.8.0 -O1 - byte16 sa = cast(byte16)a; - byte16 sb = cast(byte16)b; - static if (SIMD_COMPARISON_MASKS_16B) - byte16 greater = sa > sb; - else - byte16 greater = cast(byte16) greaterMask!byte16(sa, sb); - return cast(__m128i)( (~greater & sa) | (greater & sb) ); - } - else - { - __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else - __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b - __m128i mask = _mm_and_si128(aTob, lower); - return _mm_xor_si128(b, mask); - } -} -unittest -{ - __m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); - __m128i B = _mm_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - byte16 R = cast(byte16) _mm_min_epi8(A, B); - byte[16] correct = [ 4, -8, -4, -8, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; - assert(R.array == correct); -} - -/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst. -__m128i _mm_min_epu16 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b); - } - else version(LDC) - { - // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1 - // ARM64: umin.8h since LDC 1.8.0 -O1 - ushort8 sa = cast(ushort8)a; - ushort8 sb = cast(ushort8)b; - static if (SIMD_COMPARISON_MASKS_16B) - ushort8 greater = (sb > sa); - else - ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa); - return cast(__m128i)( (greater & sa) | (~greater & sb) ); - } - else - { - __m128i c = _mm_subs_epu16(b, a); - b = _mm_sub_epi16(b, c); - return b; - } -} -unittest -{ - short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57), - _mm_setr_epi16( -4, -8, 9, -7, 0,-32768, 0, 0)); - short[8] correct = [32767, 1, 9, -8, 0, 7, 0, 0]; - assert(R.array == correct); -} - -/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst. -__m128i _mm_min_epu32 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b); - } - else version(LDC) - { - // x86: pminud since LDC 1.1 -O1, also good without sse4.1 - // ARM64: umin.4s since LDC 1.8.0 -O1 - uint4 sa = cast(uint4)a; - uint4 sb = cast(uint4)b; - static if (SIMD_COMPARISON_MASKS_16B) - uint4 greater = sa > sb; - else - uint4 greater = cast(uint4) greaterMask!uint4(sa, sb); - return cast(__m128i)( (~greater & sa) | (greater & sb) ); - } - else - { - // PERF: same remark as in _mm_max_epu32 - __m128i valueShift = _mm_set1_epi32(-0x80000000); - __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift)); - __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b - __m128i mask = _mm_and_si128(aTob, higher); - return _mm_xor_si128(b, mask); - } -} -unittest -{ - int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7), - _mm_setr_epi32( -4,-8, 9, -8)); - int[4] correct = [0x7fffffff, 1, 4, -8]; - assert(R.array == correct); -} - -/// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`, -/// store the minimum and index in return value, and zero the remaining bits. 
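-/// e.g. for lanes [14, 15, 1, 2, 65533, 4, 5, 6] the minimum is 1, found at index 2,
-/// so lane 0 of the result holds 1, lane 1 holds 2, and lanes 2..7 are zero.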
-__m128i _mm_minpos_epu16 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a); - } - else static if (LDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a); - } - else static if (LDC_with_ARM64) - { - __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); - __m128i combinedLo = _mm_unpacklo_epi16(indices, a); - __m128i combinedHi = _mm_unpackhi_epi16(indices, a); - __m128i best = _mm_min_epu32(combinedLo, combinedHi); - best = _mm_min_epu32(best, _mm_srli_si128!8(best)); - best = _mm_min_epu32(best, _mm_srli_si128!4(best)); - short8 sbest = cast(short8)best; - short8 r; - r[0] = sbest[1]; - r[1] = sbest[0]; // Note: the search must have inverted index in order to prioritize lower index in case of tie - r[2] = 0; - r[3] = 0; - r[4] = 0; - r[5] = 0; - r[6] = 0; - r[7] = 0; - return cast(__m128i)r; - } - else - { - short8 sa = cast(short8)a; - ushort min = 0xffff; - int index = 0; - for(int n = 0; n < 8; ++n) - { - ushort c = sa.array[n]; - if (c < min) - { - min = c; - index = n; - } - } - short8 r; - r.ptr[0] = min; - r.ptr[1] = cast(short)index; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6); - __m128i B = _mm_setr_epi16(14, 4, 4, 2, -3, 2, 5, 6); - short8 R1 = cast(short8) _mm_minpos_epu16(A); - short8 R2 = cast(short8) _mm_minpos_epu16(B); - short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0]; - short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0]; - assert(R1.array == correct1); - assert(R2.array == correct2); -} - -/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers -/// in `a` compared to those in `b`, and store the 16-bit results in dst. -/// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`. -/// One quadruplet is selected from `b` starting at on the offset specified in `imm8[1:0]`. -/// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting -/// at the offset specified in `imm8[2]`. -__m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_mpsadbw128(cast(ubyte16)a, cast(ubyte16)b, cast(ubyte)imm8); - } - else static if (LDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8); - } - else - { - int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high order quadruplet are unaddressable... 
- int b_offset = (imm8 & 3) * 4; - - byte16 ba = cast(byte16)a; - byte16 bb = cast(byte16)b; - short8 r; - - __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0); - - for (int j = 0; j < 8; j += 2) - { - int k = a_offset + j; - __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3], - 0, 0, 0, 0, - ba[k+1], ba[k+2], ba[k+3], ba[k+4], - 0, 0, 0, 0); - short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64 - r.ptr[j] = diffs.array[0]; - r.ptr[j+1] = diffs.array[4]; - } - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5, 5, 5, 12, 13, 14, 15); - short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23]; - short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749]; - short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35]; - short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741]; - short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4]; - short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B); - short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B); - short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B); - short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B); - short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B); - assert(r1.array == correct1); - assert(r4.array == correct4); - assert(r5.array == correct5); - assert(r7.array == correct7); - assert(r8.array == correct0); -} - -/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst. -__m128i _mm_mul_epi32 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b); - } - else static if (LDC_with_SSE41 && LDC_with_optimizations) - { - // For some reason, clang has the builtin but it's not in IntrinsicsX86.td - // Use IR instead. - // This generates pmuldq with since LDC 1.2.0 -O0 - enum ir = ` - %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> - %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> - %la = sext <2 x i32> %ia to <2 x i64> - %lb = sext <2 x i32> %ib to <2 x i64> - %r = mul <2 x i64> %la, %lb - ret <2 x i64> %r`; - return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b); - } - else static if (LDC_with_ARM64) - { - // 3 instructions since LDC 1.8 -O2 - // But had to make vmull_s32 be a builtin else it wouldn't optimize to smull - int2 a_lo = vmovn_s64(cast(long2)a); - int2 b_lo = vmovn_s64(cast(long2)b); - return cast(__m128i) vmull_s32(a_lo, b_lo); - } - else - { - int4 ia = cast(int4)a; - int4 ib = cast(int4)b; - long2 r; - r.ptr[0] = cast(long)ia.array[0] * ib.array[0]; - r.ptr[1] = cast(long)ia.array[2] * ib.array[2]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3); - __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0); - long2 R = cast(long2) _mm_mul_epi32(A, B); - long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144]; - assert(R.array == correct); -} - -/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, -/// return the low 32 bits of the intermediate integers. 
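-/// e.g. a lane computing 0x1_0000 * 0x1_0000 = 0x1_0000_0000 yields 0,
-/// since only the low 32 bits of the 64-bit product are kept.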
-__m128i _mm_mullo_epi32 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - // PERF GDC without SSE4.1 could be better - static if (GDC_with_SSE41) - { - int4 ia = cast(int4)a; - int4 ib = cast(int4)b; - // Note: older GDC doesn't have that op, but older GDC - // also has no support for -msse4.1 detection - return cast(__m128i)(a * b); - } - else version(LDC) - { - int4 ia = cast(int4)a; - int4 ib = cast(int4)b; - return cast(__m128i)(a * b); - } - else - { - // DMD doesn't take the above - int4 ia = cast(int4)a; - int4 ib = cast(int4)b; - int4 r; - r.ptr[0] = ia.array[0] * ib.array[0]; - r.ptr[1] = ia.array[1] * ib.array[1]; - r.ptr[2] = ia.array[2] * ib.array[2]; - r.ptr[3] = ia.array[3] * ib.array[3]; - return r; - } -} -unittest -{ - __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3); - __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0); - int4 R = cast(int4) _mm_mullo_epi32(A, B); - int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0]; - assert(R.array == correct); -} - - -/// Convert packed signed 32-bit integers from `a` and `b` -/// to packed 16-bit integers using unsigned saturation. -__m128i _mm_packus_epi32 (__m128i a, __m128i b) pure @trusted -{ - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b); - } - else static if (LDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b); - } - else static if (LDC_with_ARM64) - { - int4 z; - z = 0; - return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)), - vqmovn_u32(vmaxq_s32(z, cast(int4)b))); - } - else - { - __m128i i32768 = _mm_set1_epi32(32768); - __m128i s32768 = _mm_set1_epi16(-32768); - a = _mm_sub_epi32(a, i32768); - b = _mm_sub_epi32(b, i32768); - __m128i clampedSigned = _mm_packs_epi32(a, b); - return _mm_add_epi16(clampedSigned, s32768); - } -} -unittest -{ - __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); - short8 R = cast(short8) _mm_packus_epi32(A, A); - short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0]; - assert(R.array == correct); -} - - -/// Round the packed double-precision (64-bit) floating-point elements in `a` using the -/// rounding parameter, and store the results as packed double-precision floating-point elements. 
-/// Rounding is done according to the rounding[3:0] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -__m128d _mm_round_pd(int rounding)(__m128d a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return __builtin_ia32_roundpd(a, rounding); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_roundpd(a, rounding); - } - else - { - static if (rounding & _MM_FROUND_CUR_DIRECTION) - { - // Convert to 64-bit integers - long lo = _mm_cvtsd_si64(a); - a.ptr[0] = a.array[1]; - long hi = _mm_cvtsd_si64(a); - return _mm_setr_pd(lo, hi); - } - else - { - version(GNU) pragma(inline, false); // else fail unittest with optimizations - - uint old = _MM_GET_ROUNDING_MODE(); - _MM_SET_ROUNDING_MODE((rounding & 3) << 13); - - // Convert to 64-bit integers - long lo = _mm_cvtsd_si64(a); - a.ptr[0] = a.array[1]; - long hi = _mm_cvtsd_si64(a); - - // Convert back to double to achieve the rounding - // The problem is that a 64-bit double can't represent all the values - // a 64-bit integer can (and vice-versa). So this function won't work for - // large values. (MAYDO: what range exactly?) - _MM_SET_ROUNDING_MODE(old); - return _mm_setr_pd(lo, hi); - } - } -} -unittest -{ - // tested in other intrinsics -} - -/// Round the packed single-precision (32-bit) floating-point elements in `a` using the -/// rounding parameter, and store the results as packed single-precision floating-point elements. -/// Rounding is done according to the rounding[3:0] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -__m128 _mm_round_ps(int rounding)(__m128 a) @trusted -{ - // PERF ARM64: there is duplication because this isn't optimal for ARM64, so it is avoided externally - static if (GDC_or_LDC_with_SSE41) - { - return __builtin_ia32_roundps(a, rounding); - } - else - { - static if (rounding & _MM_FROUND_CUR_DIRECTION) - { - __m128i integers = _mm_cvtps_epi32(a); - return _mm_cvtepi32_ps(integers); - } - else - { - version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 gets shuffled - uint old = _MM_GET_ROUNDING_MODE(); - _MM_SET_ROUNDING_MODE((rounding & 3) << 13); - scope(exit) _MM_SET_ROUNDING_MODE(old); - - // Convert to 64-bit integers - __m128i integers = _mm_cvtps_epi32(a); - - // Convert back to float to achieve the rounding - // The problem is that a 32-float can't represent all the values - // a 32-bit integer can (and vice-versa). So this function won't work for - // large values. (MAYDO: what range exactly?) 
- __m128 result = _mm_cvtepi32_ps(integers); - - return result; - } - } -} -unittest -{ - // tested in other intrinsics -} - - -/// Round the lower double-precision (64-bit) floating-point element in `b` using the -/// rounding parameter, store the result as a double-precision floating-point element -/// in the lower element of result, and copy the upper element from `a` to the upper element of result. -/// Rounding is done according to the rounding[3:0] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -__m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted -{ - static if (GDC_with_SSE41) - { - return __builtin_ia32_roundsd(a, b, rounding); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_roundsd(a, b, rounding); - } - else - { - static if (rounding & _MM_FROUND_CUR_DIRECTION) - { - // Convert to 64-bit integer - long b0 = _mm_cvtsd_si64(b); - a.ptr[0] = b0; - return a; - } - else - { - version(GNU) pragma(inline, false); // else fail unittest with optimizations - - uint old = _MM_GET_ROUNDING_MODE(); - _MM_SET_ROUNDING_MODE((rounding & 3) << 13); - - // Convert to 64-bit integer - long b0 = _mm_cvtsd_si64(b); - a.ptr[0] = b0; - - // Convert back to double to achieve the rounding - // The problem is that a 64-bit double can't represent all the values - // a 64-bit integer can (and vice-versa). So this function won't work for - // large values. (MAYDO: what range exactly?) - _MM_SET_ROUNDING_MODE(old); - return a; - } - } -} -unittest -{ - // tested in other intrinsics -} - - -/// Round the lower single-precision (32-bit) floating-point element in `b` using the -/// rounding parameter, store the result as a single-precision floating-point element -/// in the lower element of result, and copy the upper 3 packed elements from `a` -/// to the upper elements of result. -/// Rounding is done according to the rounding[3:0] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -__m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted -{ - static if (GDC_with_SSE41) - { - return __builtin_ia32_roundss(a, b, rounding); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_roundss(a, b, rounding); - } - else - { - static if (rounding & _MM_FROUND_CUR_DIRECTION) - { - int b0 = _mm_cvtss_si32(b); - a.ptr[0] = b0; - return a; - } - else version(GNU) - { - pragma(inline, false) - __m128 GDCworkaround() nothrow @nogc @trusted - { - uint old = _MM_GET_ROUNDING_MODE(); - _MM_SET_ROUNDING_MODE((rounding & 3) << 13); - - // Convert to 32-bit integer - int b0 = _mm_cvtss_si32(b); - a.ptr[0] = b0; - - // Convert back to double to achieve the rounding - // The problem is that a 32-bit float can't represent all the values - // a 32-bit integer can (and vice-versa). 
So this function won't work for - // large values. (MAYDO: what range exactly?) - _MM_SET_ROUNDING_MODE(old); - return a; - } - return GDCworkaround(); - } - else - { - uint old = _MM_GET_ROUNDING_MODE(); - _MM_SET_ROUNDING_MODE((rounding & 3) << 13); - - // Convert to 32-bit integer - int b0 = _mm_cvtss_si32(b); - a.ptr[0] = b0; - - // Convert back to double to achieve the rounding - // The problem is that a 32-bit float can't represent all the values - // a 32-bit integer can (and vice-versa). So this function won't work for - // large values. (MAYDO: what range exactly?) - _MM_SET_ROUNDING_MODE(old); - return a; - } - } -} -unittest -{ - // tested in other intrinsics -} - - -/// Load 128-bits of integer data from memory using a non-temporal memory hint. -/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection -/// exception may be generated. -__m128i _mm_stream_load_si128 (void* mem_addr) pure @trusted -{ - // PERF DMD D_SIMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_movntdqa(cast(long2*)mem_addr); - } - else static if (LDC_with_InlineIREx && LDC_with_optimizations) - { - enum prefix = `!0 = !{ i32 1 }`; - enum ir = ` - %r = load <4 x i32>, <4 x i32>* %0, !nontemporal !0 - ret <4 x i32> %r`; - return cast(__m128i) LDCInlineIREx!(prefix, ir, "", int4, int4*)(cast(__m128i*)mem_addr); - } - else - { - return *cast(__m128i*)mem_addr; // regular move instead - } -} -unittest -{ - align(16) static immutable int[4] correct = [1, 2, 3, 4]; - __m128i A = _mm_stream_load_si128(cast(__m128i*)(correct.ptr)); - _mm_mfence(); - assert(A.array == correct); -} - -/// Return 1 if all bits in `a` are all 1's. Else return 0. -int _mm_test_all_ones (__m128i a) @safe -{ - return _mm_testc_si128(a, _mm_set1_epi32(-1)); -} -unittest -{ - __m128i A = _mm_set1_epi32(-1); - __m128i B = _mm_set_epi32(-1, -2, -1, -1); - assert(_mm_test_all_ones(A) == 1); - assert(_mm_test_all_ones(B) == 0); -} - -/// Return 1 if all bits in `a` are all 0's. Else return 0. -// This is a #BONUS since it was lacking in Intel Intrinsics API. -int _mm_test_all_zeros (__m128i a) @safe -{ - return _mm_testz_si128(a, _mm_set1_epi32(-1)); -} -unittest -{ - __m128i A = _mm_set1_epi32(0); - __m128i B = _mm_set_epi32(0, 8, 0, 0); - assert(_mm_test_all_zeros(A) == 1); - assert(_mm_test_all_zeros(B) == 0); -} - -/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, -/// and return 1 if the result is zero, otherwise return 0. -int _mm_test_all_zeros (__m128i a, __m128i mask) @safe -{ - return _mm_testz_si128(a, mask); // it's really the same, but with a good name -} - -/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and mask, and set ZF to 1 -/// if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with -/// mask, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and -/// CF values are zero, otherwise return 0. -int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted -{ - return _mm_testnzc_si128(a, mask); -} - -/// Compute the bitwise NOT of a and then AND with b, and return 1 if the -/// result is zero, otherwise return 0. -/// In other words, test if all bits masked by `b` are 1 in `a`. 
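-/// Taking one byte as an illustration: a = 0xFF, b = 0x0F gives ~a & b == 0, so the
-/// result is 1; a = 0x0F, b = 0xFF leaves bits 4..7 of `b` unmatched in `a`, giving 0.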
-int _mm_testc_si128 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b); - } - else static if (LDC_with_ARM64) - { - // Acceptable since LDC 1.8 -02 - long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a); - return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); - } - else - { - __m128i c = ~a & b; - int[4] zero = [0, 0, 0, 0]; - return c.array == zero; - } -} -unittest -{ - __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8); - __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00); - __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00); - assert(_mm_testc_si128(A, A) == 1); - assert(_mm_testc_si128(A, M1) == 0); - assert(_mm_testc_si128(A, M2) == 1); -} - -/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`, -/// and set ZF to 1 if the result is zero, otherwise set ZF to 0. -/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the -/// result is zero, otherwise set CF to 0. -/// Return 1 if both the ZF and CF values are zero, otherwise return 0. -int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b); - } - else static if (LDC_with_ARM64) - { - long2 s640 = vandq_s64(cast(long2)b, cast(long2)a); - long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a); - - return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)) - | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) ); - } - else - { - __m128i c = a & b; - __m128i d = ~a & b; - int[4] zero = [0, 0, 0, 0]; - return !( (c.array == zero) || (d.array == zero)); - } -} -unittest -{ - __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8); - __m128i M = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00); - __m128i Z = _mm_setzero_si128(); - assert(_mm_testnzc_si128(A, Z) == 0); - assert(_mm_testnzc_si128(A, M) == 1); - assert(_mm_testnzc_si128(A, A) == 0); -} - -/// Compute the bitwise AND of 128 bits (representing integer data) in a and b, -/// and return 1 if the result is zero, otherwise return 0. -/// In other words, test if all bits masked by `b` are 0 in `a`. -int _mm_testz_si128 (__m128i a, __m128i b) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b); - } - else static if (LDC_with_ARM64) - { - // Acceptable since LDC 1.8 -02 - long2 s64 = vandq_s64(cast(long2)a, cast(long2)b); - return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); - } - else - { - __m128i c = a & b; - int[4] zero = [0, 0, 0, 0]; - return c.array == zero; - } -} -unittest -{ - __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8); - __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07); - __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00); - assert(_mm_testz_si128(A, A) == 0); - assert(_mm_testz_si128(A, M1) == 1); - assert(_mm_testz_si128(A, M2) == 0); -} - +/** +* SSE4.1 intrinsics. +* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1 +* +* Copyright: Guillaume Piolat 2021. +* Johan Engelen 2021. +* cet 2024. 
+* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) +*/ +module inteli.smmintrin; + +// SSE4.1 instructions +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1 +// Note: this header will work whether you have SSE4.1 enabled or not. +// With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively +// generate SSE4.1 instructions. +// With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions. + +public import inteli.types; +import inteli.internals; + +// smmintrin pulls in all previous instruction set intrinsics. +public import inteli.tmmintrin; + +nothrow @nogc: + +enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes +enum int _MM_FROUND_TO_NEG_INF = 0x01; /// ditto +enum int _MM_FROUND_TO_POS_INF = 0x02; /// ditto +enum int _MM_FROUND_TO_ZERO = 0x03; /// ditto +enum int _MM_FROUND_CUR_DIRECTION = 0x04; /// ditto +enum int _MM_FROUND_RAISE_EXC = 0x00; /// ditto +enum int _MM_FROUND_NO_EXC = 0x08; /// ditto + +enum int _MM_FROUND_NINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT); +enum int _MM_FROUND_FLOOR = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF); +enum int _MM_FROUND_CEIL = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF); +enum int _MM_FROUND_TRUNC = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO); +enum int _MM_FROUND_RINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION); +enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION); + +/// Add packed signed 32-bit integers in `a` and `b` using saturation. +/// #BONUS +__m128i _mm_adds_epi32(__m128i a, __m128i b) pure +{ + // PERF: ARM64 should use 2x vqadd_s32 + static if (LDC_with_saturated_intrinsics) + return cast(__m128i)inteli_llvm_adds!int4(cast(int4)a, cast(int4)b); + else + { + __m128i int_max = _mm_set1_epi32(0x7FFFFFFF); + __m128i res = _mm_add_epi32(a, b); + __m128i sign_bit = _mm_srli_epi32(a, 31); + __m128i sign_xor = _mm_xor_si128(a, b); + __m128i overflow = _mm_andnot_si128(sign_xor, _mm_xor_si128(a, res)); + __m128i saturated = _mm_add_epi32(int_max, sign_bit); + return cast(__m128i) _mm_blendv_ps(cast(__m128)res, + cast(__m128)saturated, + cast(__m128)overflow); + } +} +unittest +{ + __m128i a = _mm_setr_epi32(int.max, 1, 2, int.min); + __m128i b = _mm_setr_epi32(1, 2, 3, -4); + assert(_mm_adds_epi32(a, b).array == [int.max, 3, 5, int.min]); +} + +/// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results. +// Note: changed signature, GDC needs a compile-time value for imm8. +__m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + pragma(inline, true); // else wouldn't inline in _mm256_blend_epi16 + return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8); + } + else + { + // LDC x86 This generates pblendw since LDC 1.1 and -O2 + short8 r; + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + for (int n = 0; n < 8; ++n) + { + r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n]; + } + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); + short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011 + short[8] correct = [8, 9, 2, 3, 12, 5, 6, 15]; + assert(C.array == correct); +} + + +/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`. 
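+/// Bit n of `imm8` (n = 0 or 1) selects lane n from `b` when set, from `a` when clear;
+/// e.g. `_mm_blend_pd!2(a, b)` yields `[a[0], b[1]]`.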
+// Note: changed signature, GDC needs a compile-time value for `imm8`. +__m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted +{ + static assert(imm8 >= 0 && imm8 < 4); + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8); + } + else + { + // LDC x86: blendpd since LDC 1.1 -02, uses blendps after LDC 1.12 + double2 r; + for (int n = 0; n < 2; ++n) + { + r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n]; + } + return cast(__m128d)r; + } +} +unittest +{ + __m128d A = _mm_setr_pd(0, 1); + __m128d B = _mm_setr_pd(8, 9); + double2 C = _mm_blend_pd!2(A, B); + double[2] correct = [0, 9]; + assert(C.array == correct); +} + + +/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control +/// mask `imm8`. +// Note: changed signature, GDC needs a compile-time value for imm8. +__m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) pure @trusted +{ + // PERF DMD + static assert(imm8 >= 0 && imm8 < 16); + static if (GDC_with_SSE41) + { + return __builtin_ia32_blendps(a, b, imm8); + } + else version(LDC) + { + // LDC x86: generates blendps since LDC 1.1 -O2 + // arm64: pretty good, two instructions worst case + return shufflevectorLDC!(float4, (imm8 & 1) ? 4 : 0, + (imm8 & 2) ? 5 : 1, + (imm8 & 4) ? 6 : 2, + (imm8 & 8) ? 7 : 3)(a, b); + } + else + { + // PERF GDC without SSE4.1 is quite bad + __m128 r; + for (int n = 0; n < 4; ++n) + { + r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n]; + } + return r; + } +} +unittest +{ + __m128 A = _mm_setr_ps(0, 1, 2, 3); + __m128 B = _mm_setr_ps(8, 9, 10, 11); + float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101 + float[4] correct = [8, 1, 10, 11]; + assert(C.array == correct); +} + +/// Blend packed 8-bit integers from `a` and `b` using `mask`. +/// Select from `b` if the high-order bit of the corresponding 8-bit element in `mask` is set, else select from `a`. +__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) pure @trusted +{ + // PERF DMD + /*static if (GDC_with_SSE41) + { + // This intrinsic do nothing in GDC 12. + // TODO report to GDC. No problem in GCC. + return cast(__m128i) __builtin_ia32_pblendvb128 (cast(ubyte16)a, cast(ubyte16)b, cast(ubyte16)mask); + } + else*/ + static if (LDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask); + } + else static if (LDC_with_ARM64) + { + // LDC arm64: two instructions since LDC 1.12 -O2 + byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7); + return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a); + } + else + { + __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask); + return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b); + } +} +unittest +{ + __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); + __m128i M = _mm_setr_epi8( 1, -1, 1, 1, -4, 1, -8, 127, + 1, 1, -1, -1, 4, 1, 8, -128); + byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M); + byte[16] correct = [ 0, 17, 2, 3, 20, 5, 22, 7, + 8, 9, 26, 27, 12, 13, 14, 31 ]; + assert(R.array == correct); +} + + +/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`. +__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted +{ + // PERF DMD + static if (GDC_with_SSE42) + { + // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction + // with -msse4.2 but not -msse4.1. 
+        // Not sure what the reason is, and there is a replacement sequence.
+        // Sounds like a bug.
+        return __builtin_ia32_blendvpd(a, b, mask);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_blendvpd(a, b, mask);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        long2 shift;
+        shift = 63;
+        long2 lmask = cast(long2)mask >> shift;
+        return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a);
+    }
+    else
+    {
+        __m128d r; // PERF =void;
+        long2 lmask = cast(long2)mask;
+        for (int n = 0; n < 2; ++n)
+        {
+            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
+        }
+        return r;
+    }
+}
+unittest
+{
+    __m128d A = _mm_setr_pd(1.0, 2.0);
+    __m128d B = _mm_setr_pd(3.0, 4.0);
+    __m128d M1 = _mm_setr_pd(-3.0, 2.0);
+    __m128d R1 = _mm_blendv_pd(A, B, M1);
+    double[2] correct1 = [3.0, 2.0];
+    assert(R1.array == correct1);
+
+    // Note: wouldn't work with -double.nan, since in some AArch64 archs the NaN sign bit is lost
+    // See Issue #78
+    __m128d M2 = _mm_setr_pd(double.nan, double.infinity);
+    __m128d R2 = _mm_blendv_pd(A, B, M2);
+    double[2] correct2 = [1.0, 2.0];
+    assert(R2.array == correct2);
+}
+
+
+/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
+__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return __builtin_ia32_blendvps(a, b, mask);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_blendvps(a, b, mask);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        int4 shift;
+        shift = 31;
+        int4 lmask = cast(int4)mask >> shift;
+        return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a);
+    }
+    else
+    {
+        // LDC x86_64: compiles to 5 instructions since LDC 1.27 -O2.
+        // When optimized, LLVM lowers this loop to essentially the classic
+        // sign-mask select sequence (shift of the mask sign bits, then
+        // and/andnot/or of the two inputs).
+        __m128 r;
+        int4 lmask = cast(int4)mask;
+        for (int n = 0; n < 4; ++n)
+        {
+            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
+        }
+        return r;
+    }
+}
+unittest
+{
+    __m128 A = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f);
+    __m128 B = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f);
+    __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f);
+    __m128 M2 = _mm_setr_ps(float.nan, float.nan, -0.0f, +0.0f);
+    __m128 R1 = _mm_blendv_ps(A, B, M1);
+    __m128 R2 = _mm_blendv_ps(A, B, M2);
+    float[4] correct1 = [ 4.0f, 1.0f, 2.0f, 7.0f];
+    float[4] correct2 = [ 0.0f, 1.0f, 6.0f, 3.0f];
+    assert(R1.array == correct1);
+
+    // Note: wouldn't work with -float.nan, since in some AArch64 archs the NaN sign bit is lost
+    // See Issue #78
+    assert(R2.array == correct2);
+}
+
+/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value,
+/// and store the results as packed double-precision floating-point elements.
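+/// e.g. [1.3, -2.7] rounds up to [2.0, -2.0].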
+__m128d _mm_ceil_pd (__m128d a) @trusted +{ + static if (LDC_with_ARM64) + { + // LDC arm64 acceptable since 1.8 -O2 + // Unfortunately x86 intrinsics force a round-trip back to double2 + // ARM neon semantics wouldn't have that + long2 l = vcvtpq_s64_f64(a); + double2 r; + r.ptr[0] = l.array[0]; + r.ptr[1] = l.array[1]; + return r; + } + else + { + return _mm_round_pd!2(a); + } +} +unittest +{ + __m128d A = _mm_setr_pd(1.3f, -2.12f); + __m128d B = _mm_setr_pd(53.6f, -2.7f); + A = _mm_ceil_pd(A); + B = _mm_ceil_pd(B); + double[2] correctA = [2.0, -2.0]; + double[2] correctB = [54.0, -2.0]; + assert(A.array == correctA); + assert(B.array == correctB); +} + +/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value, +/// and store the results as packed single-precision floating-point elements. +__m128 _mm_ceil_ps (__m128 a) @trusted +{ + static if (LDC_with_ARM64) + { + // LDC arm64 acceptable since 1.8 -O1 + int4 l = vcvtpq_s32_f32(a); + float4 r; + r.ptr[0] = l.array[0]; + r.ptr[1] = l.array[1]; + r.ptr[2] = l.array[2]; + r.ptr[3] = l.array[3]; + return r; + } + else + { + return _mm_round_ps!2(a); + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f); + __m128 C = _mm_ceil_ps(A); + float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f]; + assert(C.array == correct); +} + +/// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value, +/// store the result as a double-precision floating-point element in the lower element of result, +/// and copy the upper element from `a` to the upper element of dst. +__m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted +{ + static if (LDC_with_ARM64) + { + a[0] = vcvtps_s64_f64(b[0]); + return a; + } + else + { + return _mm_round_sd!2(a, b); + } +} +unittest +{ + __m128d A = _mm_setr_pd(1.3, -2.12); + __m128d B = _mm_setr_pd(53.6, -3.7); + __m128d C = _mm_ceil_sd(A, B); + double[2] correct = [54.0, -2.12]; + assert(C.array == correct); +} + +/// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value, +/// store the result as a single-precision floating-point element in the lower element of result, +/// and copy the upper 3 packed elements from `a` to the upper elements of result. +__m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted +{ + static if (LDC_with_ARM64) + { + a[0] = vcvtps_s32_f32(b[0]); + return a; + } + else + { + return _mm_round_ss!2(a, b); + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f); + __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f); + __m128 C = _mm_ceil_ss(A, B); + float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f]; + assert(C.array == correct); +} + +/// Compare packed 64-bit integers in `a` and `b` for equality. +__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted +{ + static if (SIMD_COMPARISON_MASKS_16B) + { + version(DigitalMars) + { + // DMD doesn't recognize long2 == long2 + long2 la = cast(long2)a; + long2 lb = cast(long2)b; + long2 res; + res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0; + res.ptr[1] = (la.array[1] == lb.array[1]) ? 
-1 : 0;
+            return cast(__m128i)res;
+        }
+        else
+        {
+            return cast(__m128i)(cast(long2)a == cast(long2)b);
+        }
+    }
+    else static if (GDC_with_SSE41)
+    {
+        return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
+    }
+    else version(LDC)
+    {
+        // LDC x86: generates pcmpeqq since LDC 1.1 -O1
+        //    arm64: generates cmeq since LDC 1.8 -O1
+        return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b);
+    }
+    else
+    {
+        // Clever pcmpeqd + pand use with LDC 1.24 -O2
+        long2 la = cast(long2)a;
+        long2 lb = cast(long2)b;
+        long2 res;
+        res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
+        res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
+        return cast(__m128i)res;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi64(-1, -2);
+    __m128i B = _mm_setr_epi64(-3, -2);
+    __m128i C = _mm_setr_epi64(-1, -4);
+    long2 AB = cast(long2) _mm_cmpeq_epi64(A, B);
+    long2 AC = cast(long2) _mm_cmpeq_epi64(A, C);
+    long[2] correct1 = [0, -1];
+    long[2] correct2 = [-1, 0];
+    assert(AB.array == correct1);
+    assert(AC.array == correct2);
+}
+
+
+/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
+__m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
+        enum ir = `
+            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+            %r = sext <4 x i16> %v to <4 x i32>
+            ret <4 x i32> %r`;
+        return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
+    }
+    else
+    {
+        short8 sa = cast(short8)a;
+        int4 r;
+        r.ptr[0] = sa.array[0];
+        r.ptr[1] = sa.array[1];
+        r.ptr[2] = sa.array[2];
+        r.ptr[3] = sa.array[3];
+        return r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
+    int4 C = cast(int4) _mm_cvtepi16_epi32(A);
+    int[4] correct = [-1, 0, -32768, 32767];
+    assert(C.array == correct);
+}
+
+/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
+__m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
+        enum ir = `
+            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
+            %r = sext <2 x i16> %v to <2 x i64>
+            ret <2 x i64> %r`;
+        return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
+    }
+    else
+    {
+        short8 sa = cast(short8)a;
+        long2 r;
+        r.ptr[0] = sa.array[0];
+        r.ptr[1] = sa.array[1];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
+    long2 C = cast(long2) _mm_cvtepi16_epi64(A);
+    long[2] correct = [-32768, 32767];
+    assert(C.array == correct);
+}
+
+/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
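+/// Only the low two 32-bit lanes of `a` are consumed; e.g. [-4, 42, 7, 8] widens to [-4, 42].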
+__m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
+        enum ir = `
+            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
+            %r = sext <2 x i32> %v to <2 x i64>
+            ret <2 x i64> %r`;
+        return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a);
+    }
+    else
+    {
+        int4 sa = cast(int4)a;
+        long2 r;
+        r.ptr[0] = sa.array[0];
+        r.ptr[1] = sa.array[1];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(-4, 42, 0, 0);
+    long2 C = cast(long2) _mm_cvtepi32_epi64(A);
+    long[2] correct = [-4, 42];
+    assert(C.array == correct);
+}
+
+
+/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
+__m128i _mm_cvtepi8_epi16 (__m128i a) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        alias ubyte16 = __vector(ubyte[16]);
+        return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0
+        // LDC ARM64: sshll generated since LDC 1.8.0 -O1
+        enum ir = `
+            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+            %r = sext <8 x i8> %v to <8 x i16>
+            ret <8 x i16> %r`;
+        return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
+    }
+    else
+    {
+        byte16 sa = cast(byte16)a;
+        short8 r;
+        foreach(n; 0..8)
+            r.ptr[n] = sa.array[n];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
+    short8 C = cast(short8) _mm_cvtepi8_epi16(A);
+    short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8];
+    assert(C.array == correct);
+}
+
+
+/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
+__m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        alias ubyte16 = __vector(ubyte[16]);
+        return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a);
+    }
+    else static if (LDC_with_SSE41 && LDC_with_optimizations)
+    {
+        // LDC x86: Generates pmovsxbd since LDC 1.1 -O0
+        enum ir = `
+            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+            %r = sext <4 x i8> %v to <4 x i32>
+            ret <4 x i32> %r`;
+        return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
+    }
+    else
+    {
+        // LDC ARM64: this gives the same codegen as a vmovl_s16/vmovl_s8 sequence would
+        byte16 sa = cast(byte16)a;
+        int4 r;
+        r.ptr[0] = sa.array[0];
+        r.ptr[1] = sa.array[1];
+        r.ptr[2] = sa.array[2];
+        r.ptr[3] = sa.array[3];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
+    int4 C = cast(int4) _mm_cvtepi8_epi32(A);
+    int[4] correct = [127, -128, 1, -1];
+    assert(C.array == correct);
+}
+
+
+/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
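+/// Only the two lowest bytes are actually consumed; e.g. bytes [127, -128, ...] widen to [127, -128].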
+__m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        alias ubyte16 = __vector(ubyte[16]);
+        return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        // LDC x86: Generates pmovsxbq since LDC 1.1 -O0,
+        // LDC arm64: it's ok since LDC 1.8 -O1
+        enum ir = `
+            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
+            %r = sext <2 x i8> %v to <2 x i64>
+            ret <2 x i64> %r`;
+        return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
+    }
+    else
+    {
+        byte16 sa = cast(byte16)a;
+        long2 r;
+        foreach(n; 0..2)
+            r.ptr[n] = sa.array[n];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
+    long2 C = cast(long2) _mm_cvtepi8_epi64(A);
+    long[2] correct = [127, -128];
+    assert(C.array == correct);
+}
+
+
+/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
+__m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
+    }
+    else
+    {
+        // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
+        //    arm64: ushll since LDC 1.12 -O1
+        short8 sa = cast(short8)a;
+        int4 r;
+        r.ptr[0] = cast(ushort)sa.array[0];
+        r.ptr[1] = cast(ushort)sa.array[1];
+        r.ptr[2] = cast(ushort)sa.array[2];
+        r.ptr[3] = cast(ushort)sa.array[3];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
+    int4 C = cast(int4) _mm_cvtepu16_epi32(A);
+    int[4] correct = [65535, 0, 32768, 32767];
+    assert(C.array == correct);
+}
+
+
+/// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
+__m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        // LDC arm64: a bit shorter than below, in -O2
+        short8 sa = cast(short8)a;
+        long2 r;
+        for(int n = 0; n < 2; ++n)
+            r.ptr[n] = cast(ushort)sa.array[n];
+        return cast(__m128i)r;
+    }
+    else
+    {
+        // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
+        short8 sa = cast(short8)a;
+        long2 r;
+        r.ptr[0] = cast(ushort)sa.array[0];
+        r.ptr[1] = cast(ushort)sa.array[1];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
+    long2 C = cast(long2) _mm_cvtepu16_epi64(A);
+    long[2] correct = [65535, 0];
+    assert(C.array == correct);
+}
+
+
+/// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
+__m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
+    }
+    else
+    {
+        // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1
+        //    arm64: generates ushll since LDC 1.12 -O1
+        int4 sa = cast(int4)a;
+        long2 r;
+        r.ptr[0] = cast(uint)sa.array[0];
+        r.ptr[1] = cast(uint)sa.array[1];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
+    long2 C = cast(long2) _mm_cvtepu32_epi64(A);
+    long[2] correct = [4294967295, 42];
+    assert(C.array == correct);
+}
+
+
+/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
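+/// The extension is unsigned, so byte -1 (0xFF) widens to 255, never to -1.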
+__m128i _mm_cvtepu8_epi16 (__m128i a) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        enum ir = `
+            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+            %r = zext <8 x i8> %v to <8 x i16>
+            ret <8 x i16> %r`;
+        return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
+    }
+    else
+    {
+        return _mm_unpacklo_epi8(a, _mm_setzero_si128());
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
+    short8 C = cast(short8) _mm_cvtepu8_epi16(A);
+    short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
+    assert(C.array == correct);
+}
+
+
+/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
+__m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        alias ubyte16 = __vector(ubyte[16]);
+        return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        // LDC arm64: a bit better than below in -O2
+        byte16 sa = cast(byte16)a;
+        int4 r;
+        for(int n = 0; n < 4; ++n)
+            r.ptr[n] = cast(ubyte)sa.array[n];
+        return cast(__m128i)r;
+    }
+    else
+    {
+        // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1
+        // PERF: catastrophic with GDC without SSE4.1
+        byte16 sa = cast(byte16)a;
+        int4 r;
+        r.ptr[0] = cast(ubyte)sa.array[0];
+        r.ptr[1] = cast(ubyte)sa.array[1];
+        r.ptr[2] = cast(ubyte)sa.array[2];
+        r.ptr[3] = cast(ubyte)sa.array[3];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
+    int4 C = cast(int4) _mm_cvtepu8_epi32(A);
+    int[4] correct = [127, 128, 1, 255];
+    assert(C.array == correct);
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
+__m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        alias ubyte16 = __vector(ubyte[16]);
+        return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        // LDC arm64: this optimizes better than the loop below
+        byte16 sa = cast(byte16)a;
+        long2 r;
+        for (int n = 0; n < 2; ++n)
+            r.ptr[n] = cast(ubyte)sa.array[n];
+        return cast(__m128i)r;
+    }
+    else
+    {
+        // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
+        byte16 sa = cast(byte16)a;
+        long2 r;
+        r.ptr[0] = cast(ubyte)sa.array[0];
+        r.ptr[1] = cast(ubyte)sa.array[1];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
+    long2 C = cast(long2) _mm_cvtepu8_epi64(A);
+    long[2] correct = [127, 254];
+    assert(C.array == correct);
+}
+
+/// Conditionally multiply the packed double-precision (64-bit) floating-point elements
+/// in `a` and `b` using the high 4 bits in `imm8`, sum the two products, and conditionally
+/// store the sum in dst using the low 4 bits of `imm8`.
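+/// A scalar model of the semantics (illustrative sketch, not the implementation;
+/// `lane0`/`lane1` stand for the two result lanes):
+/// ---
+/// double sum = 0;
+/// if (imm8 & 0x10) sum += a.array[0] * b.array[0];
+/// if (imm8 & 0x20) sum += a.array[1] * b.array[1];
+/// double lane0 = (imm8 & 1) ? sum : 0.0;
+/// double lane1 = (imm8 & 2) ? sum : 0.0;
+/// ---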
+__m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return __builtin_ia32_dppd(a, b, imm8 & 0x33); + } + else static if (LDC_with_SSE41) + { + return __builtin_ia32_dppd(a, b, imm8 & 0x33); + } + else + { + __m128d zero = _mm_setzero_pd(); + __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b); + double sum = temp.array[0] + temp.array[1]; + return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum)); + } +} +unittest +{ + __m128d A = _mm_setr_pd(1.0, 2.0); + __m128d B = _mm_setr_pd(4.0, 8.0); + double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B); + double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B); + double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B); + double[2] correct1 = [ 4.0, 4.0]; + double[2] correct2 = [16.0, 0.0]; + double[2] correct3 = [ 0.0, 20.0]; + assert(R1.array == correct1); + assert(R2.array == correct2); + assert(R3.array == correct3); +} + +/// Conditionally multiply the packed single-precision (32-bit) floating-point elements +/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, +/// and conditionally store the sum in result using the low 4 bits of `imm8`. +__m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return __builtin_ia32_dpps(a, b, cast(ubyte)imm8); + } + else static if (LDC_with_SSE41) + { + return __builtin_ia32_dpps(a, b, cast(byte)imm8); + } + else + { + __m128 zero = _mm_setzero_ps(); + __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b); + float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3]; + return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum)); + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f); + __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f); + float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B); + float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B); + float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B); + float[4] correct1 = [67.0f, 67.0f, 67.0f, 67.0f]; + float[4] correct2 = [23.0f, 0.0f, 23.0f, 0.0f]; + float[4] correct3 = [0.0f, 29.0f, 0.0f, 29.0f]; + assert(R1.array == correct1); + assert(R2.array == correct2); + assert(R3.array == correct3); +} + + +/// Extract a 32-bit integer from `a`, selected with `imm8`. +int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted +{ + return (cast(int4)a).array[imm8 & 3]; +} +unittest +{ + __m128i A = _mm_setr_epi32(1, 2, 3, 4); + assert(_mm_extract_epi32(A, 0) == 1); + assert(_mm_extract_epi32(A, 1 + 8) == 2); + assert(_mm_extract_epi32(A, 3 + 4) == 4); +} + +/// Extract a 64-bit integer from `a`, selected with `imm8`. +long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted +{ + long2 la = cast(long2)a; + return la.array[imm8 & 1]; +} +unittest +{ + __m128i A = _mm_setr_epi64(45, -67); + assert(_mm_extract_epi64(A, 0) == 45); + assert(_mm_extract_epi64(A, 1) == -67); + assert(_mm_extract_epi64(A, 2) == 45); +} + +/// Extract an 8-bit integer from `a`, selected with `imm8`. +/// Warning: the returned value is zero-extended to 32-bits. +int _mm_extract_epi8 (__m128i a, const int imm8) @trusted +{ + byte16 ba = cast(byte16)a; + return cast(ubyte) ba.array[imm8 & 15]; +} +unittest +{ + __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15); + assert(_mm_extract_epi8(A, 7) == 7); + assert(_mm_extract_epi8(A, 13) == 255); + assert(_mm_extract_epi8(A, 7 + 16) == 7); +} + +/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`. +/// Note: returns a 32-bit $(I integer). 
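+/// To recover the value as a float, reinterpret the bits rather than convert, e.g.
+/// `int r = _mm_extract_ps(a, 2); float f = *cast(float*)&r;` (a bit copy, not a
+/// numeric conversion).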
+int _mm_extract_ps (__m128 a, const int imm8) @trusted
+{
+    return (cast(int4)a).array[imm8 & 3];
+}
+unittest
+{
+    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
+    assert(_mm_extract_ps(A, 0) == 0x3f800000);
+    assert(_mm_extract_ps(A, 1 + 8) == 0x40000000);
+    assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000);
+}
+
+
+/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an
+/// integer value, and store the results as packed double-precision floating-point elements.
+__m128d _mm_floor_pd (__m128d a) @trusted
+{
+    static if (LDC_with_ARM64)
+    {
+        // LDC arm64 acceptable since 1.8 -O2
+        long2 l = vcvtmq_s64_f64(a);
+        double2 r;
+        r.ptr[0] = l.array[0];
+        r.ptr[1] = l.array[1];
+        return r;
+    }
+    else
+    {
+        return _mm_round_pd!1(a);
+    }
+}
+unittest
+{
+    __m128d A = _mm_setr_pd(1.3, -2.12);
+    __m128d B = _mm_setr_pd(53.6, -2.7);
+    A = _mm_floor_pd(A);
+    B = _mm_floor_pd(B);
+    double[2] correctA = [1.0, -3.0];
+    double[2] correctB = [53.0, -3.0];
+    assert(A.array == correctA);
+    assert(B.array == correctB);
+}
+
+/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an
+/// integer value, and store the results as packed single-precision floating-point elements.
+__m128 _mm_floor_ps (__m128 a) @trusted
+{
+    static if (LDC_with_ARM64)
+    {
+        // LDC arm64 acceptable since 1.8 -O1
+        int4 l = vcvtmq_s32_f32(a);
+        float4 r;
+        r.ptr[0] = l.array[0];
+        r.ptr[1] = l.array[1];
+        r.ptr[2] = l.array[2];
+        r.ptr[3] = l.array[3];
+        return r;
+    }
+    else
+    {
+        return _mm_round_ps!1(a);
+    }
+}
+unittest
+{
+    __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
+    __m128 C = _mm_floor_ps(A);
+    float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f];
+    assert(C.array == correct);
+}
+
+/// Round the lower double-precision (64-bit) floating-point element in `b` down to an
+/// integer value, store the result as a double-precision floating-point element in the
+/// lower element, and copy the upper element from `a` to the upper element.
+__m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
+{
+    static if (LDC_with_ARM64)
+    {
+        a[0] = vcvtms_s64_f64(b[0]);
+        return a;
+    }
+    else
+    {
+        return _mm_round_sd!1(a, b);
+    }
+}
+unittest
+{
+    __m128d A = _mm_setr_pd(1.3, -2.12);
+    __m128d B = _mm_setr_pd(-53.1, -3.7);
+    __m128d C = _mm_floor_sd(A, B);
+    double[2] correct = [-54.0, -2.12];
+    assert(C.array == correct);
+}
+
+/// Round the lower single-precision (32-bit) floating-point element in `b` down to an
+/// integer value, store the result as a single-precision floating-point element in the
+/// lower element, and copy the upper 3 packed elements from `a` to the upper elements.
+__m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
+{
+    static if (LDC_with_ARM64)
+    {
+        a[0] = vcvtms_s32_f32(b[0]);
+        return a;
+    }
+    else
+    {
+        return _mm_round_ss!1(a, b);
+    }
+}
+unittest
+{
+    __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
+    __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f);
+    __m128 C = _mm_floor_ss(A, B);
+    float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f];
+    assert(C.array == correct);
+}
+
+/// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`.
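+///
+/// A minimal illustrative sketch (hand-written; the values follow from the
+/// semantics above):
+/// ---
+/// __m128i v = _mm_setr_epi32(10, 20, 30, 40);
+/// v = _mm_insert_epi32(v, 99, 1); // lane 1 becomes 99
+/// int[4] expected = [10, 99, 30, 40];
+/// assert((cast(int4)v).array == expected);
+/// ---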
+__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted
+{
+    // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1
+    // LDC x86: pinsrd since LDC 1.1 -O2 with -mattr=+sse4.1
+    // LDC arm64: ins.s since LDC 1.8 -O2
+    int4 ia = cast(int4)a;
+    ia.ptr[imm8 & 3] = i;
+    return cast(__m128i)ia;
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
+    int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4);
+    int[4] result = [1, 2, 5, 4];
+    assert(C.array == result);
+}
+
+/// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`.
+__m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
+{
+    // GDC: nothing special to do, pinsrq generated with -O1 -msse4.1
+    // LDC x86: always does something sensible.
+    long2 la = cast(long2)a;
+    la.ptr[imm8 & 1] = i;
+    return cast(__m128i)la;
+}
+unittest
+{
+    __m128i A = _mm_setr_epi64(1, 2);
+    long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2);
+    long[2] result = [1, 5];
+    assert(C.array == result);
+}
+
+/// Insert the lower 8-bit integer from `i` into `a` at the location specified by `imm8[3:0]`.
+__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
+{
+    // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1
+    // LDC x86: doesn't generate pinsrb, maybe it's slower. ARM64 also spills to memory.
+    byte16 ba = cast(byte16)a;
+    ba.ptr[imm8 & 15] = cast(byte)i;
+    return cast(__m128i)ba;
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+    byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16);
+    byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
+    assert(C.array == result);
+}
+
+
+/// Warning: despite the name, this does something quite different from `_mm_insert_epi32`!
+/// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b`
+/// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]`
+/// (elements are zeroed out when the corresponding bit is set).
+__m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return __builtin_ia32_insertps128(a, b, cast(ubyte)imm8);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
+    }
+    else
+    {
+        float4 tmp2 = a;
+        float tmp1 = b.array[(imm8 >> 6) & 3];
+        tmp2.ptr[(imm8 >> 4) & 3] = tmp1;
+        return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps());
+    }
+}
+unittest
+{
+    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
+    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
+    __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B);
+    float[4] correct = [1.0f, 2.0f, 0.0f, 7.0f];
+    assert(C.array == correct);
+}
+
+
+/// Compare packed signed 32-bit integers in `a` and `b`, and return packed maximum values.
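+///
+/// The portable fallbacks below select lanes branchlessly with a comparison mask of
+/// all-ones or all-zeros; a scalar sketch of the same idea (illustrative only):
+/// ---
+/// int selectMax(int a, int b)
+/// {
+///     int greater = (a > b) ? -1 : 0;        // all-ones mask when a wins
+///     return (greater & a) | (~greater & b); // picks a or b without a branch
+/// }
+/// ---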
+__m128i _mm_max_epi32 (__m128i a, __m128i b) pure @trusted
+{
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
+    }
+    else version(LDC)
+    {
+        // x86: pmaxsd since LDC 1.1 -O1
+        // ARM: smax.4s since LDC 1.8 -O1
+        int4 sa = cast(int4)a;
+        int4 sb = cast(int4)b;
+        static if (SIMD_COMPARISON_MASKS_16B)
+            int4 greater = sa > sb;
+        else
+            int4 greater = greaterMask!int4(sa, sb);
+        return cast(__m128i)( (greater & sa) | (~greater & sb) );
+    }
+    else
+    {
+        __m128i higher = _mm_cmpgt_epi32(a, b);
+        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
+        __m128i mask = _mm_and_si128(aTob, higher);
+        return _mm_xor_si128(b, mask);
+    }
+}
+unittest
+{
+    int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff,  1, -4,  7),
+                                      _mm_setr_epi32(        -4, -8,  9, -8));
+    int[4] correct = [0x7fffffff, 1, 9, 7];
+    assert(R.array == correct);
+}
+
+/// Compare packed signed 8-bit integers in `a` and `b`,
+/// and return packed maximum values.
+__m128i _mm_max_epi8 (__m128i a, __m128i b) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b);
+    }
+    else version(LDC)
+    {
+        // x86: pmaxsb since LDC 1.1 -O1
+        // ARM64: smax.16b since LDC 1.8.0 -O1
+        byte16 sa = cast(byte16)a;
+        byte16 sb = cast(byte16)b;
+        static if (SIMD_COMPARISON_MASKS_16B)
+            byte16 greater = sa > sb;
+        else
+            byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
+        return cast(__m128i)( (greater & sa) | (~greater & sb) );
+    }
+    else
+    {
+        __m128i greater = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b otherwise
+        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
+        __m128i mask = _mm_and_si128(aTob, greater);
+        return _mm_xor_si128(b, mask);
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
+    __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
+    byte16 R = cast(byte16) _mm_max_epi8(A, B);
+    byte[16] correct =       [127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0];
+    assert(R.array == correct);
+}
+
+/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed maximum values.
+__m128i _mm_max_epu16 (__m128i a, __m128i b) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b);
+    }
+    else version(LDC)
+    {
+        // x86: pmaxuw since LDC 1.1 -O1
+        // ARM64: umax.8h since LDC 1.8.0 -O1
+        // PERF: without sse4.1, LLVM 12 produces a very interesting
+        //     psubusw xmm0, xmm1
+        //     paddw   xmm0, xmm1
+        // sequence that could perhaps be used in the other min/max intrinsics too
+        ushort8 sa = cast(ushort8)a;
+        ushort8 sb = cast(ushort8)b;
+        static if (SIMD_COMPARISON_MASKS_16B)
+        {
+            // Note: doesn't work well with GDC, which prefers the builtin.
+            ushort8 greater = sa > sb;
+        }
+        else
+            ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb);
+        return cast(__m128i)( (greater & sa) | (~greater & sb) );
+    }
+    else
+    {
+        b = _mm_subs_epu16(b, a);
+        b = _mm_add_epi16(b, a);
+        return b;
+    }
+}
+unittest
+{
+    short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,      7, 0, 57),
+                                          _mm_setr_epi16(   -4, -8,  9, -7, 0, -32768, 0,  0));
+    short[8] correct =                                  [   -4, -8, -4, -7, 9, -32768, 0, 57];
+    assert(R.array == correct);
+}
+
+/// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed maximum values.
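+///
+/// Without SSE4.1 there is no unsigned 32-bit vector compare, so the fallback below
+/// flips the sign bit of both operands and uses the signed compare; a scalar sketch
+/// of that mapping (illustrative only):
+/// ---
+/// bool unsignedGreater(uint a, uint b)
+/// {
+///     // x => x + 0x8000_0000 maps unsigned order onto signed order
+///     return cast(int)(a + 0x8000_0000) > cast(int)(b + 0x8000_0000);
+/// }
+/// ---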
+__m128i _mm_max_epu32 (__m128i a, __m128i b) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
+    }
+    else version(LDC)
+    {
+        // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
+        // ARM64: umax.4s since LDC 1.8.0 -O1
+        uint4 sa = cast(uint4)a;
+        uint4 sb = cast(uint4)b;
+        static if (SIMD_COMPARISON_MASKS_16B)
+            uint4 greater = sa > sb;
+        else
+            uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
+        return cast(__m128i)( (greater & sa) | (~greater & sb) );
+    }
+    else
+    {
+        // PERF: LLVM suggests replacing the _mm_add_epi32 with _mm_xor_si128, and the last xor with _mm_or_si128
+        /+
+            movdqa xmm2, xmmword ptr [-0x80000000, -0x80000000, -0x80000000, -0x80000000]
+            movdqa xmm3, xmm1
+            pxor xmm3, xmm2
+            pxor xmm2, xmm0
+            pcmpgtd xmm2, xmm3
+            pand xmm0, xmm2
+            pandn xmm2, xmm1
+            por xmm0, xmm2
+        +/
+        __m128i valueShift = _mm_set1_epi32(-0x80000000);
+        __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
+        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
+        __m128i mask = _mm_and_si128(aTob, higher);
+        return _mm_xor_si128(b, mask);
+    }
+}
+unittest
+{
+    int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff,  1, 4, -7),
+                                      _mm_setr_epi32(        -4, -8, 9, -8));
+    int[4] correct = [-4, -8, 9, -7];
+    assert(R.array == correct);
+}
+
+/// Compare packed signed 32-bit integers in `a` and `b`, and return packed minimum values.
+__m128i _mm_min_epi32 (__m128i a, __m128i b) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
+    }
+    else version(LDC)
+    {
+        // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
+        // ARM: smin.4s since LDC 1.8 -O1
+        int4 sa = cast(int4)a;
+        int4 sb = cast(int4)b;
+        static if (SIMD_COMPARISON_MASKS_16B)
+            int4 greater = sa > sb;
+        else
+            int4 greater = greaterMask!int4(sa, sb);
+        return cast(__m128i)( (~greater & sa) | (greater & sb) );
+    }
+    else
+    {
+        __m128i lower = _mm_cmplt_epi32(a, b); // ones where a should be selected, b otherwise
+        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
+        __m128i mask = _mm_and_si128(aTob, lower);
+        return _mm_xor_si128(b, mask);
+    }
+}
+unittest
+{
+    int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff,  1, -4,  7),
+                                      _mm_setr_epi32(        -4, -8,  9, -8));
+    int[4] correct = [-4, -8, -4, -8];
+    assert(R.array == correct);
+}
+
+/// Compare packed signed 8-bit integers in `a` and `b`,
+/// and return packed minimum values.
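+///
+/// A minimal illustrative sketch (hand-written; expected values follow from the
+/// semantics above):
+/// ---
+/// __m128i a = _mm_setr_epi8(5, -3, 0,  9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+/// __m128i b = _mm_setr_epi8(4, -2, 1, -9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+/// byte16 m = cast(byte16) _mm_min_epi8(a, b);
+/// assert(m.array[0] == 4 && m.array[3] == -9);
+/// ---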
+__m128i _mm_min_epi8 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b); + } + else version(LDC) + { + // x86: pminsb since LDC 1.1 -O1 + // ARM64: smin.16b since LDC 1.8.0 -O1 + byte16 sa = cast(byte16)a; + byte16 sb = cast(byte16)b; + static if (SIMD_COMPARISON_MASKS_16B) + byte16 greater = sa > sb; + else + byte16 greater = cast(byte16) greaterMask!byte16(sa, sb); + return cast(__m128i)( (~greater & sa) | (greater & sb) ); + } + else + { + __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else + __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b + __m128i mask = _mm_and_si128(aTob, lower); + return _mm_xor_si128(b, mask); + } +} +unittest +{ + __m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); + __m128i B = _mm_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + byte16 R = cast(byte16) _mm_min_epi8(A, B); + byte[16] correct = [ 4, -8, -4, -8, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + assert(R.array == correct); +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst. +__m128i _mm_min_epu16 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b); + } + else version(LDC) + { + // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1 + // ARM64: umin.8h since LDC 1.8.0 -O1 + ushort8 sa = cast(ushort8)a; + ushort8 sb = cast(ushort8)b; + static if (SIMD_COMPARISON_MASKS_16B) + ushort8 greater = (sb > sa); + else + ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa); + return cast(__m128i)( (greater & sa) | (~greater & sb) ); + } + else + { + __m128i c = _mm_subs_epu16(b, a); + b = _mm_sub_epi16(b, c); + return b; + } +} +unittest +{ + short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57), + _mm_setr_epi16( -4, -8, 9, -7, 0,-32768, 0, 0)); + short[8] correct = [32767, 1, 9, -8, 0, 7, 0, 0]; + assert(R.array == correct); +} + +/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst. +__m128i _mm_min_epu32 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b); + } + else version(LDC) + { + // x86: pminud since LDC 1.1 -O1, also good without sse4.1 + // ARM64: umin.4s since LDC 1.8.0 -O1 + uint4 sa = cast(uint4)a; + uint4 sb = cast(uint4)b; + static if (SIMD_COMPARISON_MASKS_16B) + uint4 greater = sa > sb; + else + uint4 greater = cast(uint4) greaterMask!uint4(sa, sb); + return cast(__m128i)( (~greater & sa) | (greater & sb) ); + } + else + { + // PERF: same remark as in _mm_max_epu32 + __m128i valueShift = _mm_set1_epi32(-0x80000000); + __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift)); + __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b + __m128i mask = _mm_and_si128(aTob, higher); + return _mm_xor_si128(b, mask); + } +} +unittest +{ + int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7), + _mm_setr_epi32( -4,-8, 9, -8)); + int[4] correct = [0x7fffffff, 1, 4, -8]; + assert(R.array == correct); +} + +/// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`, +/// store the minimum and index in return value, and zero the remaining bits. 
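+///
+/// A minimal illustrative sketch (hand-written): ties are broken in favor of the
+/// lowest index:
+/// ---
+/// __m128i v = _mm_setr_epi16(9, 4, 7, 4, 8, 8, 8, 8);
+/// short8 r = cast(short8) _mm_minpos_epu16(v);
+/// assert(r.array[0] == 4); // the minimum value
+/// assert(r.array[1] == 1); // its lowest index
+/// ---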
+__m128i _mm_minpos_epu16 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        __m128i combinedLo = _mm_unpacklo_epi16(indices, a);
+        __m128i combinedHi = _mm_unpackhi_epi16(indices, a);
+        __m128i best = _mm_min_epu32(combinedLo, combinedHi);
+        best = _mm_min_epu32(best, _mm_srli_si128!8(best));
+        best = _mm_min_epu32(best, _mm_srli_si128!4(best));
+        short8 sbest = cast(short8)best;
+        short8 r = 0;
+        // Each 32-bit lane of best is (value << 16) | index, so the unsigned minimum
+        // naturally prioritizes the lower index in case of a tie; the two halves are
+        // swapped here to produce the [min, index, 0, ...] result layout.
+        r.ptr[0] = sbest.array[1];
+        r.ptr[1] = sbest.array[0];
+        return cast(__m128i)r;
+    }
+    else
+    {
+        short8 sa = cast(short8)a;
+        ushort min = 0xffff;
+        int index = 0;
+        for(int n = 0; n < 8; ++n)
+        {
+            ushort c = sa.array[n];
+            if (c < min)
+            {
+                min = c;
+                index = n;
+            }
+        }
+        short8 r;
+        r.ptr[0] = min;
+        r.ptr[1] = cast(short)index;
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
+    __m128i B = _mm_setr_epi16(14,  4, 4, 2, -3, 2, 5, 6);
+    short8 R1 = cast(short8) _mm_minpos_epu16(A);
+    short8 R2 = cast(short8) _mm_minpos_epu16(B);
+    short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0];
+    short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0];
+    assert(R1.array == correct1);
+    assert(R2.array == correct2);
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers
+/// in `a` compared to those in `b`, and store the 16-bit results in dst.
+/// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`.
+/// One quadruplet is selected from `b` starting at the offset specified in `imm8[1:0]`.
+/// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting
+/// at the offset specified in `imm8[2]`.
+__m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(ubyte16)a, cast(ubyte16)b, cast(ubyte)imm8);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
+    }
+    else
+    {
+        int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high-order quadruplets are unaddressable...
+        byte16 ba = cast(byte16)a;
+        short8 r;
+
+        // The quadruplet selected from b (imm8[1:0]), broadcast to both 64-bit SAD lanes.
+        __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0);
+
+        for (int j = 0; j < 8; j += 2)
+        {
+            int k = a_offset + j;
+            __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3],
+                                           0, 0, 0, 0,
+                                           ba[k+1], ba[k+2], ba[k+3], ba[k+4],
+                                           0, 0, 0, 0);
+            short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing _mm_sad_epu8 saves instructions on both x86 and arm64
+            r.ptr[j]   = diffs.array[0];
+            r.ptr[j+1] = diffs.array[4];
+        }
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+    __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5, 5, 5, 12, 13, 14, 15);
+    short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23];
+    short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749];
+    short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35];
+    short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741];
+    short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4];
+    short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
+    short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B);
+    short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B);
+    short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B);
+    short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B);
+    assert(r1.array == correct1);
+    assert(r4.array == correct4);
+    assert(r5.array == correct5);
+    assert(r7.array == correct7);
+    assert(r8.array == correct0);
+}
+
+/// Multiply the low signed 32-bit integers from each packed 64-bit element in `a` and `b`,
+/// and store the signed 64-bit results in dst.
+__m128i _mm_mul_epi32 (__m128i a, __m128i b) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b);
+    }
+    else static if (LDC_with_SSE41 && LDC_with_optimizations)
+    {
+        // For some reason, clang has the builtin but it's not in IntrinsicsX86.td
+        // Use IR instead.
+        // This generates pmuldq since LDC 1.2.0 -O0
+        enum ir = `
+            %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2>
+            %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2>
+            %la = sext <2 x i32> %ia to <2 x i64>
+            %lb = sext <2 x i32> %ib to <2 x i64>
+            %r = mul <2 x i64> %la, %lb
+            ret <2 x i64> %r`;
+        return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        // 3 instructions since LDC 1.8 -O2
+        // But vmull_s32 had to be made a builtin, otherwise it wouldn't optimize to smull
+        int2 a_lo = vmovn_s64(cast(long2)a);
+        int2 b_lo = vmovn_s64(cast(long2)b);
+        return cast(__m128i) vmull_s32(a_lo, b_lo);
+    }
+    else
+    {
+        int4 ia = cast(int4)a;
+        int4 ib = cast(int4)b;
+        long2 r;
+        r.ptr[0] = cast(long)ia.array[0] * ib.array[0];
+        r.ptr[1] = cast(long)ia.array[2] * ib.array[2];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
+    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
+    long2 R = cast(long2) _mm_mul_epi32(A, B);
+    long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144];
+    assert(R.array == correct);
+}
+
+/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers,
+/// and return the low 32 bits of the intermediate integers.
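+///
+/// A minimal illustrative sketch (hand-written) of the low-32-bit truncation:
+/// ---
+/// __m128i a = _mm_set1_epi32(65536);          // 2^16 in each lane
+/// int4 r = cast(int4) _mm_mullo_epi32(a, a);  // 2^32 truncates to 0 in each lane
+/// assert(r.array[0] == 0);
+/// ---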
+__m128i _mm_mullo_epi32 (__m128i a, __m128i b) pure @trusted
+{
+    // PERF DMD
+    // PERF GDC without SSE4.1 could be better
+    static if (GDC_with_SSE41)
+    {
+        int4 ia = cast(int4)a;
+        int4 ib = cast(int4)b;
+        // Note: older GDC doesn't have that op, but older GDC
+        // also has no support for -msse4.1 detection
+        return cast(__m128i)(ia * ib);
+    }
+    else version(LDC)
+    {
+        int4 ia = cast(int4)a;
+        int4 ib = cast(int4)b;
+        return cast(__m128i)(ia * ib);
+    }
+    else
+    {
+        // DMD doesn't accept the above
+        int4 ia = cast(int4)a;
+        int4 ib = cast(int4)b;
+        int4 r;
+        r.ptr[0] = ia.array[0] * ib.array[0];
+        r.ptr[1] = ia.array[1] * ib.array[1];
+        r.ptr[2] = ia.array[2] * ib.array[2];
+        r.ptr[3] = ia.array[3] * ib.array[3];
+        return r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
+    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
+    int4 R = cast(int4) _mm_mullo_epi32(A, B);
+    int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0];
+    assert(R.array == correct);
+}
+
+
+/// Convert packed signed 32-bit integers from `a` and `b`
+/// to packed 16-bit integers using unsigned saturation.
+__m128i _mm_packus_epi32 (__m128i a, __m128i b) pure @trusted
+{
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        int4 z = 0;
+        return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)),
+                                          vqmovn_u32(vmaxq_s32(z, cast(int4)b)));
+    }
+    else
+    {
+        // Shift into signed range, use signed saturation, then shift back.
+        __m128i i32768 = _mm_set1_epi32(32768);
+        __m128i s32768 = _mm_set1_epi16(-32768);
+        a = _mm_sub_epi32(a, i32768);
+        b = _mm_sub_epi32(b, i32768);
+        __m128i clampedSigned = _mm_packs_epi32(a, b);
+        return _mm_add_epi16(clampedSigned, s32768);
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
+    short8 R = cast(short8) _mm_packus_epi32(A, A);
+    short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0];
+    assert(R.array == correct);
+}
+
+
+/// Round the packed double-precision (64-bit) floating-point elements in `a` using the
+/// rounding parameter, and store the results as packed double-precision floating-point elements.
+/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+__m128d _mm_round_pd(int rounding)(__m128d a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return __builtin_ia32_roundpd(a, rounding);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_roundpd(a, rounding);
+    }
+    else
+    {
+        static if (rounding & _MM_FROUND_CUR_DIRECTION)
+        {
+            // Convert to 64-bit integers
+            long lo = _mm_cvtsd_si64(a);
+            a.ptr[0] = a.array[1];
+            long hi = _mm_cvtsd_si64(a);
+            return _mm_setr_pd(lo, hi);
+        }
+        else
+        {
+            version(GNU) pragma(inline, false); // otherwise unittests fail with optimizations
+
+            uint old = _MM_GET_ROUNDING_MODE();
+            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
+
+            // Convert to 64-bit integers
+            long lo = _mm_cvtsd_si64(a);
+            a.ptr[0] = a.array[1];
+            long hi = _mm_cvtsd_si64(a);
+
+            // Convert back to double to achieve the rounding
+            // The problem is that a 64-bit double can't represent all the values
+            // a 64-bit integer can (and vice-versa). So this function won't work for
+            // large values. (MAYDO: what range exactly?)
+            _MM_SET_ROUNDING_MODE(old);
+            return _mm_setr_pd(lo, hi);
+        }
+    }
+}
+unittest
+{
+    // tested in other intrinsics
+}
+
+/// Round the packed single-precision (32-bit) floating-point elements in `a` using the
+/// rounding parameter, and store the results as packed single-precision floating-point elements.
+/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+__m128 _mm_round_ps(int rounding)(__m128 a) @trusted
+{
+    // PERF ARM64: there is duplication because this isn't optimal for ARM64, so it is avoided externally
+    static if (GDC_or_LDC_with_SSE41)
+    {
+        return __builtin_ia32_roundps(a, rounding);
+    }
+    else
+    {
+        static if (rounding & _MM_FROUND_CUR_DIRECTION)
+        {
+            __m128i integers = _mm_cvtps_epi32(a);
+            return _mm_cvtepi32_ps(integers);
+        }
+        else
+        {
+            version(LDC) pragma(inline, false); // otherwise _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 get reordered
+            uint old = _MM_GET_ROUNDING_MODE();
+            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
+            scope(exit) _MM_SET_ROUNDING_MODE(old);
+
+            // Convert to 32-bit integers
+            __m128i integers = _mm_cvtps_epi32(a);
+
+            // Convert back to float to achieve the rounding
+            // The problem is that a 32-bit float can't represent all the values
+            // a 32-bit integer can (and vice-versa). So this function won't work for
+            // large values. (MAYDO: what range exactly?)
+            __m128 result = _mm_cvtepi32_ps(integers);
+
+            return result;
+        }
+    }
+}
+unittest
+{
+    // tested in other intrinsics
+}
+
+
+/// Round the lower double-precision (64-bit) floating-point element in `b` using the
+/// rounding parameter, store the result as a double-precision floating-point element
+/// in the lower element of result, and copy the upper element from `a` to the upper element of result.
+/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+__m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
+{
+    static if (GDC_with_SSE41)
+    {
+        return __builtin_ia32_roundsd(a, b, rounding);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_roundsd(a, b, rounding);
+    }
+    else
+    {
+        static if (rounding & _MM_FROUND_CUR_DIRECTION)
+        {
+            // Convert to 64-bit integer
+            long b0 = _mm_cvtsd_si64(b);
+            a.ptr[0] = b0;
+            return a;
+        }
+        else
+        {
+            version(GNU) pragma(inline, false); // otherwise unittests fail with optimizations
+
+            uint old = _MM_GET_ROUNDING_MODE();
+            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
+
+            // Convert to 64-bit integer
+            long b0 = _mm_cvtsd_si64(b);
+            a.ptr[0] = b0;
+
+            // Convert back to double to achieve the rounding
+            // The problem is that a 64-bit double can't represent all the values
+            // a 64-bit integer can (and vice-versa). So this function won't work for
+            // large values. (MAYDO: what range exactly?)
+            _MM_SET_ROUNDING_MODE(old);
+            return a;
+        }
+    }
+}
+unittest
+{
+    // tested in other intrinsics
+}
+
+
+/// Round the lower single-precision (32-bit) floating-point element in `b` using the
+/// rounding parameter, store the result as a single-precision floating-point element
+/// in the lower element of result, and copy the upper 3 packed elements from `a`
+/// to the upper elements of result.
+/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+__m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
+{
+    static if (GDC_with_SSE41)
+    {
+        return __builtin_ia32_roundss(a, b, rounding);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_roundss(a, b, rounding);
+    }
+    else
+    {
+        static if (rounding & _MM_FROUND_CUR_DIRECTION)
+        {
+            int b0 = _mm_cvtss_si32(b);
+            a.ptr[0] = b0;
+            return a;
+        }
+        else version(GNU)
+        {
+            pragma(inline, false)
+            __m128 GDCworkaround() nothrow @nogc @trusted
+            {
+                uint old = _MM_GET_ROUNDING_MODE();
+                _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
+
+                // Convert to 32-bit integer
+                int b0 = _mm_cvtss_si32(b);
+                a.ptr[0] = b0;
+
+                // Convert back to float to achieve the rounding
+                // The problem is that a 32-bit float can't represent all the values
+                // a 32-bit integer can (and vice-versa). So this function won't work for
+                // large values. (MAYDO: what range exactly?)
+                _MM_SET_ROUNDING_MODE(old);
+                return a;
+            }
+            return GDCworkaround();
+        }
+        else
+        {
+            uint old = _MM_GET_ROUNDING_MODE();
+            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
+
+            // Convert to 32-bit integer
+            int b0 = _mm_cvtss_si32(b);
+            a.ptr[0] = b0;
+
+            // Convert back to float to achieve the rounding
+            // The problem is that a 32-bit float can't represent all the values
+            // a 32-bit integer can (and vice-versa). So this function won't work for
+            // large values. (MAYDO: what range exactly?)
+            _MM_SET_ROUNDING_MODE(old);
+            return a;
+        }
+    }
+}
+unittest
+{
+    // tested in other intrinsics
+}
+
+
+/// Load 128 bits of integer data from memory using a non-temporal memory hint.
+/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+__m128i _mm_stream_load_si128 (void* mem_addr) pure @trusted
+{
+    // PERF DMD D_SIMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_movntdqa(cast(long2*)mem_addr);
+    }
+    else static if (LDC_with_InlineIREx && LDC_with_optimizations)
+    {
+        enum prefix = `!0 = !{ i32 1 }`;
+        enum ir = `
+            %r = load <4 x i32>, <4 x i32>* %0, !nontemporal !0
+            ret <4 x i32> %r`;
+        return cast(__m128i) LDCInlineIREx!(prefix, ir, "", int4, int4*)(cast(__m128i*)mem_addr);
+    }
+    else
+    {
+        return *cast(__m128i*)mem_addr; // regular move instead
+    }
+}
+unittest
+{
+    align(16) static immutable int[4] correct = [1, 2, 3, 4];
+    __m128i A = _mm_stream_load_si128(cast(__m128i*)(correct.ptr));
+    _mm_mfence();
+    assert(A.array == correct);
+}
+
+/// Return 1 if all bits in `a` are all 1's. Else return 0.
+int _mm_test_all_ones (__m128i a) @safe
+{
+    return _mm_testc_si128(a, _mm_set1_epi32(-1));
+}
+unittest
+{
+    __m128i A = _mm_set1_epi32(-1);
+    __m128i B = _mm_set_epi32(-1, -2, -1, -1);
+    assert(_mm_test_all_ones(A) == 1);
+    assert(_mm_test_all_ones(B) == 0);
+}
+
+/// Return 1 if all bits in `a` are all 0's. Else return 0.
+// This is a #BONUS since it was lacking in the Intel Intrinsics API.
+int _mm_test_all_zeros (__m128i a) @safe
+{
+    return _mm_testz_si128(a, _mm_set1_epi32(-1));
+}
+unittest
+{
+    __m128i A = _mm_set1_epi32(0);
+    __m128i B = _mm_set_epi32(0, 8, 0, 0);
+    assert(_mm_test_all_zeros(A) == 1);
+    assert(_mm_test_all_zeros(B) == 0);
+}
+
+/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`,
+/// and return 1 if the result is zero, otherwise return 0.
+int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
+{
+    return _mm_testz_si128(a, mask); // it's really the same, but with a good name
+}
+
+/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, and set ZF to 1
+/// if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of `a` and then AND with
+/// `mask`, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and
+/// CF values are zero, otherwise return 0.
+int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
+{
+    return _mm_testnzc_si128(a, mask);
+}
+
+/// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the
+/// result is zero, otherwise return 0.
+/// In other words, test if all bits masked by `b` are 1 in `a`.
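+///
+/// A minimal illustrative sketch (hand-written):
+/// ---
+/// __m128i val  = _mm_setr_epi32(0xff, 0, 0, 0);
+/// __m128i mask = _mm_setr_epi32(0x0f, 0, 0, 0);
+/// assert(_mm_testc_si128(val, mask) == 1); // every bit of mask is set in val
+/// ---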
+int _mm_testc_si128 (__m128i a, __m128i b) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        // Acceptable since LDC 1.8 -O2
+        long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a);
+        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+    }
+    else
+    {
+        __m128i c = ~a & b;
+        int[4] zero = [0, 0, 0, 0];
+        return c.array == zero;
+    }
+}
+unittest
+{
+    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
+    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00);
+    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
+    assert(_mm_testc_si128(A, A) == 1);
+    assert(_mm_testc_si128(A, M1) == 0);
+    assert(_mm_testc_si128(A, M2) == 1);
+}
+
+/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
+/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
+/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the
+/// result is zero, otherwise set CF to 0.
+/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
+int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        long2 s640 = vandq_s64(cast(long2)b, cast(long2)a);
+        long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a);
+
+        return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1))
+                | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) );
+    }
+    else
+    {
+        __m128i c = a & b;
+        __m128i d = ~a & b;
+        int[4] zero = [0, 0, 0, 0];
+        return !( (c.array == zero) || (d.array == zero));
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
+    __m128i M = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
+    __m128i Z = _mm_setzero_si128();
+    assert(_mm_testnzc_si128(A, Z) == 0);
+    assert(_mm_testnzc_si128(A, M) == 1);
+    assert(_mm_testnzc_si128(A, A) == 0);
+}
+
+/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
+/// and return 1 if the result is zero, otherwise return 0.
+/// In other words, test if all bits masked by `b` are 0 in `a`.
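+///
+/// A minimal illustrative sketch (hand-written):
+/// ---
+/// __m128i val  = _mm_setr_epi32(0xf0, 0, 0, 0);
+/// __m128i mask = _mm_setr_epi32(0x0f, 0, 0, 0);
+/// assert(_mm_testz_si128(val, mask) == 1); // val & mask is all zero
+/// ---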
+int _mm_testz_si128 (__m128i a, __m128i b) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        // Acceptable since LDC 1.8 -O2
+        long2 s64 = vandq_s64(cast(long2)a, cast(long2)b);
+        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+    }
+    else
+    {
+        __m128i c = a & b;
+        int[4] zero = [0, 0, 0, 0];
+        return c.array == zero;
+    }
+}
+unittest
+{
+    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
+    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07);
+    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
+    assert(_mm_testz_si128(A, A) == 0);
+    assert(_mm_testz_si128(A, M1) == 1);
+    assert(_mm_testz_si128(A, M2) == 0);
+}
+
diff --git a/src/VulkanRenderer b/src/VulkanRenderer
index d46741a..c42238a 160000
--- a/src/VulkanRenderer
+++ b/src/VulkanRenderer
@@ -1 +1 @@
-Subproject commit d46741a48033b5136fa189c1b80a574986e68f64
+Subproject commit c42238a456f5048c7d1b2d5ebd71ecf13bb10ece
diff --git a/src/dlib b/src/dlib
index c83ffab..493c17c 160000
--- a/src/dlib
+++ b/src/dlib
@@ -1 +1 @@
-Subproject commit c83ffabce69071a3e7a0af3f26aa420082eeda1f
+Subproject commit 493c17cba26952861ae4c5402f1676013b8317c6
diff --git a/src/gears/game.d b/src/gears/game.d
index 2b08381..5985462 100644
--- a/src/gears/game.d
+++ b/src/gears/game.d
@@ -1,4 +1,5 @@
 import dlib;
+/*
 public import vulkan : PlatformHandles;
 import vulkan : Destroy;
 import vulkan;
@@ -414,6 +415,7 @@ DrawRect(Game* g, f32 p0_x, f32 p0_y, f32 p1_x, f32 p1_y, Vec4 col)
     AddUIIndices(g);
 }
 
+/*
 // TODO: integrate this with vulkan again
 Model
 LoadModel(Game* g, string name)
@@ -697,3 +699,4 @@ ReadModel(Game* g, string name)
         const(char)[] mat_name = m3d.material[i].name[0 .. 
strlen(m3d.material[i].name)]; } } +*/ diff --git a/src/gears/game2.d b/src/gears/game2.d index cec3b92..7a80349 100644 --- a/src/gears/game2.d +++ b/src/gears/game2.d @@ -10,8 +10,6 @@ ImageView[256] TEXTURES; Buffer[256] MATERIALS; Buffer[256] MODEL_STATES; -DescIndices[DESC_SET_MAX] DESC_INDICES]; - struct GameState { RenderState rds; @@ -29,6 +27,7 @@ struct RenderState Pipeline[PID.Max] pipelines; DescSetLayout desc_layout_globals; DescSetLayout desc_layout_resources; + DescSetLayout desc_layout_state; DescSet[2] desc_set_globals; PipelineLayout pipeline_layout_pbr; @@ -41,13 +40,6 @@ struct RenderState Buffer globals_buffer; } -struct DescIndices -{ - u32 tex; - u32 mat; - u32 state; -} - struct ShaderGlobals { Vec4 ambient; @@ -57,41 +49,6 @@ struct ShaderGlobals f32 alpha = 0.0; } -struct MeshPart -{ - u32 mat; - u32 offset; - u32 length; - PushConst pc; - - alias pc this; -} - -struct ModelData -{ - MeshPart[] parts; - ModelState state; - Vertex[] v; - u32[] idx; - TextureData[] tex; -} - -struct TextureData -{ - u8[] name; - u8[] data; - u32 width; - u32 height; - u32 ch; -} - -struct Model -{ - Buffer v_buf; - Buffer i_buf; - MeshPart[] parts; -} - struct ModelRenderInfo { PushConst pc; @@ -105,15 +62,6 @@ struct ModelState Mat4 matrix; } -struct Material -{ - Vec4 ambient; - Vec4 diffuse; - Vec4 specular; - float shininess; - float alpha; -} - enum PBRMod : u32 { AlbedoValue = 0x0001, @@ -182,17 +130,14 @@ struct PushConst } } -struct Vertex -{ - Vec4 col; - Vec4 tangent; - Vec3 pos; - Vec3 normal; - Vec2 uv; -} - ModelData g_box; +void +RunCycle(GameState* g) +{ + +} + GameState InitGame(PlatformWindow* window) { @@ -225,7 +170,7 @@ Init(RenderState* rds, PlatformWindow* window) { binding: 2, descriptorType: DT.StorageImage, descriptorCount: 1, stageFlags: SS.All }, ]; - DescLayoutBinding[3] resource_bindings = [ + DescLayoutBinding[6] resource_bindings = [ { binding: 0, descriptorType: DT.Image, descriptorCount: 1, stageFlags: SS.All }, { binding: 1, descriptorType: DT.Image, descriptorCount: 1, stageFlags: SS.All }, { binding: 2, descriptorType: DT.Image, descriptorCount: 1, stageFlags: SS.All }, @@ -234,8 +179,12 @@ Init(RenderState* rds, PlatformWindow* window) { binding: 5, descriptorType: DT.Uniform, descriptorCount: 1, stageFlags: SS.All }, ]; + DescLayoutBinding[1] state_bindings = [ + { binding: 0, descriptorType: DT.Uniform, descriptorCount: 1, stageFlags: SS.All }, + ]; + Attribute[5] attributes = [ - { binding: 0, location: 0, format: FMT.RGBA_F32, offset: Vertex.col.offsetof }, + { binding: 0, location: 0, format: FMT.RGBA_F32, offset: Vertex.color.offsetof }, { binding: 0, location: 1, format: FMT.RGBA_F32, offset: Vertex.tangent.offsetof }, { binding: 0, location: 2, format: FMT.RGB_F32, offset: Vertex.pos.offsetof }, { binding: 0, location: 3, format: FMT.RGB_F32, offset: Vertex.normal.offsetof }, @@ -251,8 +200,9 @@ Init(RenderState* rds, PlatformWindow* window) rds.desc_layout_globals = CreateDescSetLayout(&rds.rd, global_bindings); rds.desc_layout_resources = CreateDescSetLayout(&rds.rd, resource_bindings); + rds.desc_layout_state = CreateDescSetLayout(&rds.rd, state_bindings); - rds.pipeline_layout_pbr = CreatePipelineLayout(&rds.rd, [rds.desc_layout_globals, rds.desc_layout_resources], PushConst.sizeof); + rds.pipeline_layout_pbr = CreatePipelineLayout(&rds.rd, [rds.desc_layout_globals, rds.desc_layout_resources, rds.desc_layout_state], PushConst.sizeof); foreach(i; 0 .. 
2) { @@ -273,8 +223,8 @@ Init(RenderState* rds, PlatformWindow* window) }; GfxPipelineInfo pbr_info = { - vertex_shader: LoadAssetData(&rds.frame_arenas[0], "shaders/pbr.vert.spv"), - frag_shader: LoadAssetData(&rds.frame_arenas[0], "shaders/pbr.frag.spv"), + vertex_shader: LoadFile(&rds.frame_arenas[0], "assets/shaders/pbr.vert.spv"), + frag_shader: LoadFile(&rds.frame_arenas[0], "assets/shaders/pbr.frag.spv"), input_rate: IR.Vertex, input_rate_stride: Vertex.sizeof, layout: rds.pipeline_layout_pbr, @@ -315,7 +265,7 @@ Init(RenderState* rds, PlatformWindow* window) CreateBuffer(&rds.rd, &rds.globals_buffer, BT.Uniform, ShaderGlobals.sizeof, false); - g_box = MakeBox + ModelData md = LoadGLTF(&rds.frame_arenas[0], "assets/models/DamagedHelmet.glb"); } PipelineID @@ -381,6 +331,7 @@ GetPBRMod(bool albedo = false, bool ambient = false, bool specular = false, bool } } +/* ModelData MakeBox(RenderState* rds, f32 width, f32 height, Vec4 col) { @@ -428,9 +379,9 @@ Model Upload(RenderState* rds, ModelData* data) { Model model; - u32[] tex_idx = Alloc!(&rds.frame_arenas[0], data.text.length); - u32[] mat_idx = Alloc!(&rds.frame_arenas[0], data.materials.length); - u32[] state_idx = Alloc!(&rds.frame_arenas[0], data.model_states.length); + u32[] tex_idx = Alloc!(u32)(&rds.frame_arenas[0], data.text.length); + u32[] mat_idx = Alloc!(u32)(&rds.frame_arenas[0], data.materials.length); + u32[] state_idx = Alloc!(u32)(&rds.frame_arenas[0], data.model_states.length); bool result = true; @@ -450,20 +401,18 @@ Upload(RenderState* rds, ModelData* data) { Buffer* buf = &rds.materials[rds.imat++]; CreateBuffer(&rds.rd, buf, BT.Uniform, Material.sizeof); - result = Transfer(&rds.rd, buf, ) + //result = Transfer(&rds.rd, buf, ) } for(u64 i = 0; i < data.model_states.length; i += 1) { Buffer* buf = &rds.model_states[rds.istate++]; - CreateBuffer(&rds) + //CreateBuffer(&rds) } model.parts = data.parts; } - -DescIndices* -GetDescIndices() +*/ unittest { diff --git a/src/shaders/structures.layout b/src/shaders/structures.layout index 5e18426..fafab1a 100644 --- a/src/shaders/structures.layout +++ b/src/shaders/structures.layout @@ -44,7 +44,8 @@ layout (set = 2, binding = 4) uniform MaterialData { float shininess; float alpha; } Material; -layout (set = 2, binding = 5) uniform ModelState { + +layout (set = 3, binding = 0) uniform ModelState { mat4 model_matrix; } State;