diff --git a/assets/models/DamagedHelmet.glb b/assets/models/DamagedHelmet.glb new file mode 100644 index 0000000..2cee76d Binary files /dev/null and b/assets/models/DamagedHelmet.glb differ diff --git a/assets/shaders/gradient.comp.spv b/assets/shaders/gradient.comp.spv index 6b76056..6fdb516 100644 Binary files a/assets/shaders/gradient.comp.spv and b/assets/shaders/gradient.comp.spv differ diff --git a/assets/shaders/pbr.frag.spv b/assets/shaders/pbr.frag.spv index 4e609a2..f91b9e3 100644 Binary files a/assets/shaders/pbr.frag.spv and b/assets/shaders/pbr.frag.spv differ diff --git a/assets/shaders/pbr.vert.spv b/assets/shaders/pbr.vert.spv index 12d75f6..1a9aa13 100644 Binary files a/assets/shaders/pbr.vert.spv and b/assets/shaders/pbr.vert.spv differ diff --git a/dub.json b/dub.json index 516ed99..eee67ee 100644 --- a/dub.json +++ b/dub.json @@ -7,7 +7,7 @@ "targetType": "executable", "targetName": "Gears", "targetPath": "build", - "sourceFiles-linux": ["build/libvma.a", "build/libstb.a", "build/libm3d.a", "build/libcglm.a"], + "sourceFiles-linux": ["build/libvma.a", "build/libstb.a", "build/libm3d.a", "build/libcglm.a", "build/libcgltf.a"], "sourceFiles-windows": [], "importPaths": ["src/gears", "src/dlib", "src/dlib/external/xxhash", "src/VulkanRenderer"], "sourcePaths": ["src/gears", "src/dlib", "src/dlib/external/xxhash", "src/VulkanRenderer"], @@ -17,7 +17,7 @@ "preGenerateCommands-linux": ["./build.sh"], "preGenerateCommands-windows": [], "dflags": ["-Xcc=-mno-sse", "-P-I/usr/include/freetype2", "-Jbuild", "-Jassets/fonts"], - "dflags-dmd": ["-P=-DSTBI_NO_SIMD"] + "dflags-dmd": [] }, { "name": "packer", diff --git a/external/inteli/bmi2intrin.d b/external/inteli/bmi2intrin.d index 50bcde7..644a23c 100644 --- a/external/inteli/bmi2intrin.d +++ b/external/inteli/bmi2intrin.d @@ -1,363 +1,363 @@ -/** -* BMI2 intrinsics. -* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2 -* -* Copyright: Copyright Johan Engelen 2021. -* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) -*/ -module inteli.bmi2intrin; - -import inteli.internals; - -nothrow @nogc pure @safe: - -/// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index. -uint _bzhi_u32 (uint a, uint index) -{ - static if (GDC_or_LDC_with_BMI2) - { - if (!__ctfe) - return __builtin_ia32_bzhi_si(a, index); - else - return bzhi!uint(a, index); - } - else - { - return bzhi!uint(a, index); - } -} -unittest -{ - static assert (_bzhi_u32(0x1234_5678, 5) == 0x18); - assert (_bzhi_u32(0x1234_5678, 5) == 0x18); - static assert (_bzhi_u32(0x1234_5678, 10) == 0x278); - assert (_bzhi_u32(0x1234_5678, 10) == 0x278); - static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678); - assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678); -} - -/// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index. -ulong _bzhi_u64 (ulong a, uint index) -{ - static if (GDC_or_LDC_with_BMI2) - { - if (!__ctfe) - { - version(X86_64) - { - // This instruction not available in 32-bit x86. 
- return __builtin_ia32_bzhi_di(a, index); - } - else - return bzhi!ulong(a, index); - } - else - return bzhi!ulong(a, index); - } - else - { - return bzhi!ulong(a, index); - } -} -unittest -{ - static assert (_bzhi_u64(0x1234_5678, 5) == 0x18); - assert (_bzhi_u64(0x1234_5678, 5) == 0x18); - static assert (_bzhi_u64(0x1234_5678, 10) == 0x278); - assert (_bzhi_u64(0x1234_5678, 10) == 0x278); - static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678); - assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678); - static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678); - assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678); -} - -// Helper function for BZHI -private T bzhi(T)(T a, uint index) -{ - /+ - n := index[7:0] - dst := a - IF (n < number of bits) - dst[MSB:n] := 0 - FI - +/ - enum numbits = T.sizeof*8; - T dst = a; - if (index < numbits) - { - T mask = (T(1) << index) - 1; - dst &= mask; - } - return dst; -} - -/// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst, -/// and store the high 32-bits in `hi`. This does not read or write arithmetic flags. -/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say. -/// But, those particular semantics don't exist at the level of intrinsics. -uint _mulx_u32 (uint a, uint b, uint* hi) -{ - // Note: that does NOT generate mulx with LDC, and there seems to be no way to do that for - // some reason, even with LLVM IR. - // Also same with GDC. - ulong result = cast(ulong) a * b; - *hi = cast(uint) (result >>> 32); - return cast(uint)result; -} -@system unittest -{ - uint hi; - assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840); - assert (hi == 0x014B_66DC); -} - -/// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and -/// store the high 64-bits in `hi`. This does not read or write arithmetic flags. -/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say. -/// But, those particular semantics don't exist at the level of intrinsics. -ulong _mulx_u64 (ulong a, ulong b, ulong* hi) -{ - /+ - dst[63:0] := (a * b)[63:0] - MEM[hi+63:hi] := (a * b)[127:64] - +/ - - static if (LDC_with_optimizations) - { - static if (__VERSION__ >= 2094) - enum bool withLDCIR = true; - else - enum bool withLDCIR = false; - } - else - { - enum bool withLDCIR = false; - } - - static if (withLDCIR) - { - // LDC x86: Generates mulx from -O0 - enum ir = ` - %4 = zext i64 %0 to i128 - %5 = zext i64 %1 to i128 - %6 = mul nuw i128 %5, %4 - %7 = lshr i128 %6, 64 - %8 = trunc i128 %7 to i64 - store i64 %8, i64* %2, align 8 - %9 = trunc i128 %6 to i64 - ret i64 %9`; - return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi); - } - else - { - /+ Straight-forward implementation with `ucent`: - ucent result = cast(ucent) a * b; - *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF); - return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF); - +/ - - /+ - Implementation using 64bit math is more complex... - a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low) - = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low* (b_high << 32) + a_low*b_low - = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low - = c2 << 64 + c11 << 32 + c12 << 32 + c0 - = z1 << 64 + z0 - // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). 
We can do that - // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that - // intermediate result is then the 'carry' that we need to add when calculating z1's sum. - z0 = (c0 & 0xFFFF_FFFF) + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) << 32 - The carry part from z0's sum = (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32 - z1 = c2 + (c11 >> 32 + c12 >> 32 + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32 - +/ - - const ulong a_low = a & 0xFFFF_FFFF; - const ulong a_high = a >>> 32; - const ulong b_low = b & 0xFFFF_FFFF; - const ulong b_high = b >>> 32; - - const ulong c2 = a_high*b_high; - const ulong c11 = a_high*b_low; - const ulong c12 = a_low*b_high; - const ulong c0 = a_low*b_low; - - const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF); - const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32); - const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32); - - *hi = z1; - return z0; - } -} -@system unittest -{ - ulong hi; - // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100 - assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100); - assert (hi == 0x14b_66dc_33f6_acdc); -} - -/// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero. -uint _pdep_u32 (uint a, uint mask) -{ - static if (GDC_or_LDC_with_BMI2) - { - if (!__ctfe) - return __builtin_ia32_pdep_si(a, mask); - else - return pdep!uint(a, mask); - } - else - { - return pdep!uint(a, mask); - } -} -unittest -{ - static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708); - assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708); -} - -/// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero. -ulong _pdep_u64 (ulong a, ulong mask) -{ - static if (GDC_or_LDC_with_BMI2) - { - if (!__ctfe) - { - version(X86_64) - { - // This instruction not available in 32-bit x86. - return __builtin_ia32_pdep_di(a, mask); - } - else - return pdep!ulong(a, mask); - } - else - return pdep!ulong(a, mask); - } - else - { - return pdep!ulong(a, mask); - } -} -unittest -{ - static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201); - assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201); -} - -// Helper function for PDEP -private T pdep(T)(T a, T mask) -{ - /+ - tmp := a - dst := 0 - m := 0 - k := 0 - DO WHILE m < 32 - IF mask[m] == 1 - dst[m] := tmp[k] - k := k + 1 - FI - m := m + 1 - OD - +/ - T dst; - T k_bitpos = 1; - T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed - foreach (m; 0..T.sizeof*8) - { - if (mask & m_bitpos) - { - dst |= (a & k_bitpos) ? m_bitpos : 0; - k_bitpos <<= 1; - } - m_bitpos <<= 1; - } - return dst; -} - - -/// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by -/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero. 
-uint _pext_u32 (uint a, uint mask) -{ - static if (GDC_or_LDC_with_BMI2) - { - if (!__ctfe) - return __builtin_ia32_pext_si(a, mask); - else - return pext!uint(a, mask); - } - else - { - return pext!uint(a, mask); - } -} -unittest -{ - static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468); - assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468); -} - -/// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by -/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero. -ulong _pext_u64 (ulong a, ulong mask) -{ - static if (GDC_or_LDC_with_BMI2) - { - if (!__ctfe) - { - version(X86_64) - { - // This instruction not available in 32-bit x86. - return __builtin_ia32_pext_di(a, mask); - } - else - return pext!ulong(a, mask); - } - else - return pext!ulong(a, mask); - } - else - { - return pext!ulong(a, mask); - } -} -unittest -{ - static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531); - assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531); -} - -// Helper function for PEXT -private T pext(T)(T a, T mask) -{ - /+ - tmp := a - dst := 0 - m := 0 - k := 0 - DO WHILE m < number of bits in T - IF mask[m] == 1 - dst[k] := tmp[m] - k := k + 1 - FI - m := m + 1 - OD - +/ - T dst; - T k_bitpos = 1; - T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed - foreach (m; 0..T.sizeof*8) - { - if (mask & m_bitpos) - { - dst |= (a & m_bitpos) ? k_bitpos : 0; - k_bitpos <<= 1; - } - m_bitpos <<= 1; - } - return dst; -} +/** +* BMI2 intrinsics. +* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=BMI2 +* +* Copyright: Copyright Johan Engelen 2021. +* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) +*/ +module inteli.bmi2intrin; + +import inteli.internals; + +nothrow @nogc pure @safe: + +/// Copy all bits from unsigned 32-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index. +uint _bzhi_u32 (uint a, uint index) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + return __builtin_ia32_bzhi_si(a, index); + else + return bzhi!uint(a, index); + } + else + { + return bzhi!uint(a, index); + } +} +unittest +{ + static assert (_bzhi_u32(0x1234_5678, 5) == 0x18); + assert (_bzhi_u32(0x1234_5678, 5) == 0x18); + static assert (_bzhi_u32(0x1234_5678, 10) == 0x278); + assert (_bzhi_u32(0x1234_5678, 10) == 0x278); + static assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678); + assert (_bzhi_u32(0x1234_5678, 21) == 0x14_5678); +} + +/// Copy all bits from unsigned 64-bit integer `a` to dst, and reset (set to 0) the high bits in dst starting at index. +ulong _bzhi_u64 (ulong a, uint index) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + { + version(X86_64) + { + // This instruction not available in 32-bit x86. 
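+                // (the 64-bit operand form of bzhi only exists in 64-bit mode,
+                // so 32-bit targets fall back to the portable helper below)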
+ return __builtin_ia32_bzhi_di(a, index); + } + else + return bzhi!ulong(a, index); + } + else + return bzhi!ulong(a, index); + } + else + { + return bzhi!ulong(a, index); + } +} +unittest +{ + static assert (_bzhi_u64(0x1234_5678, 5) == 0x18); + assert (_bzhi_u64(0x1234_5678, 5) == 0x18); + static assert (_bzhi_u64(0x1234_5678, 10) == 0x278); + assert (_bzhi_u64(0x1234_5678, 10) == 0x278); + static assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678); + assert (_bzhi_u64(0x1234_5678, 21) == 0x14_5678); + static assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678); + assert (_bzhi_u64(0x8765_4321_1234_5678, 54) == 0x0025_4321_1234_5678); +} + +// Helper function for BZHI +private T bzhi(T)(T a, uint index) +{ + /+ + n := index[7:0] + dst := a + IF (n < number of bits) + dst[MSB:n] := 0 + FI + +/ + enum numbits = T.sizeof*8; + T dst = a; + if (index < numbits) + { + T mask = (T(1) << index) - 1; + dst &= mask; + } + return dst; +} + +/// Multiply unsigned 32-bit integers `a` and `b`, store the low 32-bits of the result in dst, +/// and store the high 32-bits in `hi`. This does not read or write arithmetic flags. +/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say. +/// But, those particular semantics don't exist at the level of intrinsics. +uint _mulx_u32 (uint a, uint b, uint* hi) +{ + // Note: that does NOT generate mulx with LDC, and there seems to be no way to do that for + // some reason, even with LLVM IR. + // Also same with GDC. + ulong result = cast(ulong) a * b; + *hi = cast(uint) (result >>> 32); + return cast(uint)result; +} +@system unittest +{ + uint hi; + assert (_mulx_u32(0x1234_5678, 0x1234_5678, &hi) == 0x1DF4_D840); + assert (hi == 0x014B_66DC); +} + +/// Multiply unsigned 64-bit integers `a` and `b`, store the low 64-bits of the result in dst, and +/// store the high 64-bits in `hi`. This does not read or write arithmetic flags. +/// Note: the implementation _does_ set arithmetic flags, unlike the instruction semantics say. +/// But, those particular semantics don't exist at the level of intrinsics. +ulong _mulx_u64 (ulong a, ulong b, ulong* hi) +{ + /+ + dst[63:0] := (a * b)[63:0] + MEM[hi+63:hi] := (a * b)[127:64] + +/ + + static if (LDC_with_optimizations) + { + static if (__VERSION__ >= 2094) + enum bool withLDCIR = true; + else + enum bool withLDCIR = false; + } + else + { + enum bool withLDCIR = false; + } + + static if (withLDCIR) + { + // LDC x86: Generates mulx from -O0 + enum ir = ` + %4 = zext i64 %0 to i128 + %5 = zext i64 %1 to i128 + %6 = mul nuw i128 %5, %4 + %7 = lshr i128 %6, 64 + %8 = trunc i128 %7 to i64 + store i64 %8, i64* %2, align 8 + %9 = trunc i128 %6 to i64 + ret i64 %9`; + return LDCInlineIR!(ir, ulong, ulong, ulong, ulong*)(a, b, hi); + } + else + { + /+ Straight-forward implementation with `ucent`: + ucent result = cast(ucent) a * b; + *hi = cast(ulong) ((result >>> 64) & 0xFFFF_FFFF_FFFF_FFFF); + return cast(ulong) (result & 0xFFFF_FFFF_FFFF_FFFF); + +/ + + /+ + Implementation using 64bit math is more complex... + a * b = (a_high << 32 + a_low) * (b_high << 32 + b_low) + = (a_high << 32)*(b_high << 32) + (a_high << 32)*b_low + a_low* (b_high << 32) + a_low*b_low + = (a_high*b_high) << 64 + (a_high*b_low) << 32 + (a_low*b_high) << 32 + a_low*b_low + = c2 << 64 + c11 << 32 + c12 << 32 + c0 + = z1 << 64 + z0 + // The sums may overflow, so we need to carry the carry (from low 64bits to high 64bits). 
We can do that + // by separately creating the sum to get the high 32 bits of z0 using 64bit math. The high 32 bits of that + // intermediate result is then the 'carry' that we need to add when calculating z1's sum. + z0 = (c0 & 0xFFFF_FFFF) + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) << 32 + The carry part from z0's sum = (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32 + z1 = c2 + (c11 >> 32 + c12 >> 32 + (c0 >> 32 + c11 & 0xFFFF_FFFF + c12 & 0xFFFF_FFFF ) >> 32 + +/ + + const ulong a_low = a & 0xFFFF_FFFF; + const ulong a_high = a >>> 32; + const ulong b_low = b & 0xFFFF_FFFF; + const ulong b_high = b >>> 32; + + const ulong c2 = a_high*b_high; + const ulong c11 = a_high*b_low; + const ulong c12 = a_low*b_high; + const ulong c0 = a_low*b_low; + + const ulong common_term = (c0 >> 32) + (c11 & 0xFFFF_FFFF) + (c12 & 0xFFFF_FFFF); + const ulong z0 = (c0 & 0xFFFF_FFFF) + (common_term << 32); + const ulong z1 = c2 + (c11 >> 32) + (c12 >> 32) + (common_term >> 32); + + *hi = z1; + return z0; + } +} +@system unittest +{ + ulong hi; + // 0x1234_5678_9ABC_DEF0 * 0x1234_5678_9ABC_DEF0 == 0x14b_66dc_33f6_acdc_a5e2_0890_f2a5_2100 + assert (_mulx_u64(0x1234_5678_9ABC_DEF0, 0x1234_5678_9ABC_DEF0, &hi) == 0xa5e2_0890_f2a5_2100); + assert (hi == 0x14b_66dc_33f6_acdc); +} + +/// Deposit contiguous low bits from unsigned 32-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero. +uint _pdep_u32 (uint a, uint mask) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + return __builtin_ia32_pdep_si(a, mask); + else + return pdep!uint(a, mask); + } + else + { + return pdep!uint(a, mask); + } +} +unittest +{ + static assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708); + assert (_pdep_u32(0x1234_5678, 0x0F0F_0F0F) == 0x0506_0708); +} + +/// Deposit contiguous low bits from unsigned 64-bit integer `a` to dst at the corresponding bit locations specified by `mask`; all other bits in dst are set to zero. +ulong _pdep_u64 (ulong a, ulong mask) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + { + version(X86_64) + { + // This instruction not available in 32-bit x86. + return __builtin_ia32_pdep_di(a, mask); + } + else + return pdep!ulong(a, mask); + } + else + return pdep!ulong(a, mask); + } + else + { + return pdep!ulong(a, mask); + } +} +unittest +{ + static assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201); + assert (_pdep_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x0807_0605_0403_0201); +} + +// Helper function for PDEP +private T pdep(T)(T a, T mask) +{ + /+ + tmp := a + dst := 0 + m := 0 + k := 0 + DO WHILE m < 32 + IF mask[m] == 1 + dst[m] := tmp[k] + k := k + 1 + FI + m := m + 1 + OD + +/ + T dst; + T k_bitpos = 1; + T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed + foreach (m; 0..T.sizeof*8) + { + if (mask & m_bitpos) + { + dst |= (a & k_bitpos) ? m_bitpos : 0; + k_bitpos <<= 1; + } + m_bitpos <<= 1; + } + return dst; +} + + +/// Extract bits from unsigned 32-bit integer `a` at the corresponding bit locations specified by +/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero. 
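+/// Example: `_pext_u32(0b1101_0111, 0b0101_0101)` gathers bits 0, 2, 4 and 6 of `a`
+/// (all set here) into the low bits, yielding 0b1111.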
+uint _pext_u32 (uint a, uint mask) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + return __builtin_ia32_pext_si(a, mask); + else + return pext!uint(a, mask); + } + else + { + return pext!uint(a, mask); + } +} +unittest +{ + static assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468); + assert (_pext_u32(0x1234_5678, 0x0F0F_0F0F) == 0x2468); +} + +/// Extract bits from unsigned 64-bit integer `a` at the corresponding bit locations specified by +/// `mask` to contiguous low bits in dst; the remaining upper bits in dst are set to zero. +ulong _pext_u64 (ulong a, ulong mask) +{ + static if (GDC_or_LDC_with_BMI2) + { + if (!__ctfe) + { + version(X86_64) + { + // This instruction not available in 32-bit x86. + return __builtin_ia32_pext_di(a, mask); + } + else + return pext!ulong(a, mask); + } + else + return pext!ulong(a, mask); + } + else + { + return pext!ulong(a, mask); + } +} +unittest +{ + static assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531); + assert (_pext_u64(0x1234_5678_8765_4321, 0x0F0F_0F0F_0F0F_0F0F) == 0x2468_7531); +} + +// Helper function for PEXT +private T pext(T)(T a, T mask) +{ + /+ + tmp := a + dst := 0 + m := 0 + k := 0 + DO WHILE m < number of bits in T + IF mask[m] == 1 + dst[k] := tmp[m] + k := k + 1 + FI + m := m + 1 + OD + +/ + T dst; + T k_bitpos = 1; + T m_bitpos = 1; // for each iteration, this has one bit set to 1 in the position probed + foreach (m; 0..T.sizeof*8) + { + if (mask & m_bitpos) + { + dst |= (a & m_bitpos) ? k_bitpos : 0; + k_bitpos <<= 1; + } + m_bitpos <<= 1; + } + return dst; +} diff --git a/external/inteli/smmintrin.d b/external/inteli/smmintrin.d index 16fdb47..bf2f517 100644 --- a/external/inteli/smmintrin.d +++ b/external/inteli/smmintrin.d @@ -1,2215 +1,2215 @@ -/** -* SSE4.1 intrinsics. -* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1 -* -* Copyright: Guillaume Piolat 2021. -* Johan Engelen 2021. -* cet 2024. -* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) -*/ -module inteli.smmintrin; - -// SSE4.1 instructions -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1 -// Note: this header will work whether you have SSE4.1 enabled or not. -// With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively -// generate SSE4.1 instructions. -// With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions. - -public import inteli.types; -import inteli.internals; - -// smmintrin pulls in all previous instruction set intrinsics. -public import inteli.tmmintrin; - -nothrow @nogc: - -enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes -enum int _MM_FROUND_TO_NEG_INF = 0x01; /// ditto -enum int _MM_FROUND_TO_POS_INF = 0x02; /// ditto -enum int _MM_FROUND_TO_ZERO = 0x03; /// ditto -enum int _MM_FROUND_CUR_DIRECTION = 0x04; /// ditto -enum int _MM_FROUND_RAISE_EXC = 0x00; /// ditto -enum int _MM_FROUND_NO_EXC = 0x08; /// ditto - -enum int _MM_FROUND_NINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT); -enum int _MM_FROUND_FLOOR = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF); -enum int _MM_FROUND_CEIL = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF); -enum int _MM_FROUND_TRUNC = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO); -enum int _MM_FROUND_RINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION); -enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION); - -/// Add packed signed 32-bit integers in `a` and `b` using saturation. 
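-/// For instance, `int.max + 1` saturates to `int.max` instead of wrapping around to `int.min`.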
-/// #BONUS -__m128i _mm_adds_epi32(__m128i a, __m128i b) pure -{ - // PERF: ARM64 should use 2x vqadd_s32 - static if (LDC_with_saturated_intrinsics) - return cast(__m128i)inteli_llvm_adds!int4(cast(int4)a, cast(int4)b); - else - { - __m128i int_max = _mm_set1_epi32(0x7FFFFFFF); - __m128i res = _mm_add_epi32(a, b); - __m128i sign_bit = _mm_srli_epi32(a, 31); - __m128i sign_xor = _mm_xor_si128(a, b); - __m128i overflow = _mm_andnot_si128(sign_xor, _mm_xor_si128(a, res)); - __m128i saturated = _mm_add_epi32(int_max, sign_bit); - return cast(__m128i) _mm_blendv_ps(cast(__m128)res, - cast(__m128)saturated, - cast(__m128)overflow); - } -} -unittest -{ - __m128i a = _mm_setr_epi32(int.max, 1, 2, int.min); - __m128i b = _mm_setr_epi32(1, 2, 3, -4); - assert(_mm_adds_epi32(a, b).array == [int.max, 3, 5, int.min]); -} - -/// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results. -// Note: changed signature, GDC needs a compile-time value for imm8. -__m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - pragma(inline, true); // else wouldn't inline in _mm256_blend_epi16 - return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8); - } - else - { - // LDC x86 This generates pblendw since LDC 1.1 and -O2 - short8 r; - short8 sa = cast(short8)a; - short8 sb = cast(short8)b; - for (int n = 0; n < 8; ++n) - { - r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n]; - } - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); - __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); - short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011 - short[8] correct = [8, 9, 2, 3, 12, 5, 6, 15]; - assert(C.array == correct); -} - - -/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`. -// Note: changed signature, GDC needs a compile-time value for `imm8`. -__m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted -{ - static assert(imm8 >= 0 && imm8 < 4); - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8); - } - else - { - // LDC x86: blendpd since LDC 1.1 -02, uses blendps after LDC 1.12 - double2 r; - for (int n = 0; n < 2; ++n) - { - r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n]; - } - return cast(__m128d)r; - } -} -unittest -{ - __m128d A = _mm_setr_pd(0, 1); - __m128d B = _mm_setr_pd(8, 9); - double2 C = _mm_blend_pd!2(A, B); - double[2] correct = [0, 9]; - assert(C.array == correct); -} - - -/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control -/// mask `imm8`. -// Note: changed signature, GDC needs a compile-time value for imm8. -__m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) pure @trusted -{ - // PERF DMD - static assert(imm8 >= 0 && imm8 < 16); - static if (GDC_with_SSE41) - { - return __builtin_ia32_blendps(a, b, imm8); - } - else version(LDC) - { - // LDC x86: generates blendps since LDC 1.1 -O2 - // arm64: pretty good, two instructions worst case - return shufflevectorLDC!(float4, (imm8 & 1) ? 4 : 0, - (imm8 & 2) ? 5 : 1, - (imm8 & 4) ? 6 : 2, - (imm8 & 8) ? 7 : 3)(a, b); - } - else - { - // PERF GDC without SSE4.1 is quite bad - __m128 r; - for (int n = 0; n < 4; ++n) - { - r.ptr[n] = (imm8 & (1 << n)) ? 
b.array[n] : a.array[n]; - } - return r; - } -} -unittest -{ - __m128 A = _mm_setr_ps(0, 1, 2, 3); - __m128 B = _mm_setr_ps(8, 9, 10, 11); - float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101 - float[4] correct = [8, 1, 10, 11]; - assert(C.array == correct); -} - -/// Blend packed 8-bit integers from `a` and `b` using `mask`. -/// Select from `b` if the high-order bit of the corresponding 8-bit element in `mask` is set, else select from `a`. -__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) pure @trusted -{ - // PERF DMD - /*static if (GDC_with_SSE41) - { - // This intrinsic do nothing in GDC 12. - // TODO report to GDC. No problem in GCC. - return cast(__m128i) __builtin_ia32_pblendvb128 (cast(ubyte16)a, cast(ubyte16)b, cast(ubyte16)mask); - } - else*/ - static if (LDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask); - } - else static if (LDC_with_ARM64) - { - // LDC arm64: two instructions since LDC 1.12 -O2 - byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7); - return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a); - } - else - { - __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask); - return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b); - } -} -unittest -{ - __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15); - __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31); - __m128i M = _mm_setr_epi8( 1, -1, 1, 1, -4, 1, -8, 127, - 1, 1, -1, -1, 4, 1, 8, -128); - byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M); - byte[16] correct = [ 0, 17, 2, 3, 20, 5, 22, 7, - 8, 9, 26, 27, 12, 13, 14, 31 ]; - assert(R.array == correct); -} - - -/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`. -__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted -{ - // PERF DMD - static if (GDC_with_SSE42) - { - // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction - // with -msse4.2 but not -msse4.1. - // Not sure what is the reason, and there is a replacement sequence. - // Sounds like a bug. - return __builtin_ia32_blendvpd(a, b, mask); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_blendvpd(a, b, mask); - } - else static if (LDC_with_ARM64) - { - long2 shift; - shift = 63; - long2 lmask = cast(long2)mask >> shift; - return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a); - } - else - { - __m128d r; // PERF =void; - long2 lmask = cast(long2)mask; - for (int n = 0; n < 2; ++n) - { - r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n]; - } - return r; - } -} -unittest -{ - __m128d A = _mm_setr_pd(1.0, 2.0); - __m128d B = _mm_setr_pd(3.0, 4.0); - __m128d M1 = _mm_setr_pd(-3.0, 2.0); - __m128d R1 = _mm_blendv_pd(A, B, M1); - double[2] correct1 = [3.0, 2.0]; - assert(R1.array == correct1); - - // Note: wouldn't work with -double.nan, since in some AArch64 archs the NaN sign bit is lost - // See Issue #78 - __m128d M2 = _mm_setr_pd(double.nan, double.infinity); - __m128d R2 = _mm_blendv_pd(A, B, M2); - double[2] correct2 = [1.0, 2.0]; - assert(R2.array == correct2); -} - - -/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`. 
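-/// Each 32-bit lane takes `b` when the sign bit of the corresponding lane of `mask` is set, else `a`.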
-__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return __builtin_ia32_blendvps(a, b, mask); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_blendvps(a, b, mask); - } - else static if (LDC_with_ARM64) - { - int4 shift; - shift = 31; - int4 lmask = cast(int4)mask >> shift; - return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a); - } - else - { - // LDC x86_64: Compiles to 5 instr since LDC 1.27 -O2 - // If lack of optimization, consider replacing by: - // __m128i overflow_mask = _mm_srai_epi32(overflow, 31); - // return _mm_or_si128( - // _mm_and_si128(overflow_mask, saturated), - // _mm_andnot_si128(overflow_mask, res) - // LLVM makes almost the same sequence when optimized. - __m128 r; - int4 lmask = cast(int4)mask; - for (int n = 0; n < 4; ++n) - { - r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n]; - } - return r; - } -} -unittest -{ - __m128 A = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f); - __m128 B = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f); - __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f); - __m128 M2 = _mm_setr_ps(float.nan, float.nan, -0.0f, +0.0f); - __m128 R1 = _mm_blendv_ps(A, B, M1); - __m128 R2 = _mm_blendv_ps(A, B, M2); - float[4] correct1 = [ 4.0f, 1.0f, 2.0f, 7.0f]; - float[4] correct2 = [ 0.0f, 1.0f, 6.0f, 3.0f]; - assert(R1.array == correct1); - - // Note: wouldn't work with -float.nan, since in some AArch64 archs the NaN sign bit is lost - // See Issue #78 - assert(R2.array == correct2); -} - -/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value, -/// and store the results as packed double-precision floating-point elements. -__m128d _mm_ceil_pd (__m128d a) @trusted -{ - static if (LDC_with_ARM64) - { - // LDC arm64 acceptable since 1.8 -O2 - // Unfortunately x86 intrinsics force a round-trip back to double2 - // ARM neon semantics wouldn't have that - long2 l = vcvtpq_s64_f64(a); - double2 r; - r.ptr[0] = l.array[0]; - r.ptr[1] = l.array[1]; - return r; - } - else - { - return _mm_round_pd!2(a); - } -} -unittest -{ - __m128d A = _mm_setr_pd(1.3f, -2.12f); - __m128d B = _mm_setr_pd(53.6f, -2.7f); - A = _mm_ceil_pd(A); - B = _mm_ceil_pd(B); - double[2] correctA = [2.0, -2.0]; - double[2] correctB = [54.0, -2.0]; - assert(A.array == correctA); - assert(B.array == correctB); -} - -/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value, -/// and store the results as packed single-precision floating-point elements. -__m128 _mm_ceil_ps (__m128 a) @trusted -{ - static if (LDC_with_ARM64) - { - // LDC arm64 acceptable since 1.8 -O1 - int4 l = vcvtpq_s32_f32(a); - float4 r; - r.ptr[0] = l.array[0]; - r.ptr[1] = l.array[1]; - r.ptr[2] = l.array[2]; - r.ptr[3] = l.array[3]; - return r; - } - else - { - return _mm_round_ps!2(a); - } -} -unittest -{ - __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f); - __m128 C = _mm_ceil_ps(A); - float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f]; - assert(C.array == correct); -} - -/// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value, -/// store the result as a double-precision floating-point element in the lower element of result, -/// and copy the upper element from `a` to the upper element of dst. 
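-/// For instance, `_mm_ceil_sd([1.3, -2.12], [53.6, -3.7])` gives `[54.0, -2.12]`.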
-__m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted -{ - static if (LDC_with_ARM64) - { - a[0] = vcvtps_s64_f64(b[0]); - return a; - } - else - { - return _mm_round_sd!2(a, b); - } -} -unittest -{ - __m128d A = _mm_setr_pd(1.3, -2.12); - __m128d B = _mm_setr_pd(53.6, -3.7); - __m128d C = _mm_ceil_sd(A, B); - double[2] correct = [54.0, -2.12]; - assert(C.array == correct); -} - -/// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value, -/// store the result as a single-precision floating-point element in the lower element of result, -/// and copy the upper 3 packed elements from `a` to the upper elements of result. -__m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted -{ - static if (LDC_with_ARM64) - { - a[0] = vcvtps_s32_f32(b[0]); - return a; - } - else - { - return _mm_round_ss!2(a, b); - } -} -unittest -{ - __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f); - __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f); - __m128 C = _mm_ceil_ss(A, B); - float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f]; - assert(C.array == correct); -} - -/// Compare packed 64-bit integers in `a` and `b` for equality. -__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted -{ - static if (SIMD_COMPARISON_MASKS_16B) - { - version(DigitalMars) - { - // DMD doesn't recognize long2 == long2 - long2 la = cast(long2)a; - long2 lb = cast(long2)b; - long2 res; - res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0; - res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0; - return cast(__m128i)res; - } - else - { - return cast(__m128i)(cast(long2)a == cast(long2)b); - } - } - else static if (GDC_with_SSE41) - { - return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b); - } - else version(LDC) - { - // LDC x86: generates pcmpeqq since LDC 1.1 -O1 - // arm64: generates cmeq since LDC 1.8 -O1 - return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b); - } - else - { - // Clever pcmpeqd + pand use with LDC 1.24 -O2 - long2 la = cast(long2)a; - long2 lb = cast(long2)b; - long2 res; - res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0; - res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0; - return cast(__m128i)res; - } -} -unittest -{ - __m128i A = _mm_setr_epi64(-1, -2); - __m128i B = _mm_setr_epi64(-3, -2); - __m128i C = _mm_setr_epi64(-1, -4); - long2 AB = cast(long2) _mm_cmpeq_epi64(A, B); - long2 AC = cast(long2) _mm_cmpeq_epi64(A, C); - long[2] correct1 = [0, -1]; - long[2] correct2 = [-1, 0]; - assert(AB.array == correct1); - assert(AC.array == correct2); -} - - -/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers. -__m128i _mm_cvtepi16_epi32 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a); - } - else static if (LDC_with_optimizations) - { - // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64 - enum ir = ` - %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> - %r = sext <4 x i16> %v to <4 x i32> - ret <4 x i32> %r`; - return cast(__m128d) LDCInlineIR!(ir, int4, short8)(cast(short8)a); - } - else - { - short8 sa = cast(short8)a; - int4 r; - r.ptr[0] = sa.array[0]; - r.ptr[1] = sa.array[1]; - r.ptr[2] = sa.array[2]; - r.ptr[3] = sa.array[3]; - return r; - } -} -unittest -{ - __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0); - int4 C = cast(int4) _mm_cvtepi16_epi32(A); - int[4] correct = [-1, 0, -32768, 32767]; - assert(C.array == correct); -} - -/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers. 
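-/// Only the low two 16-bit lanes of `a` participate.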
-__m128i _mm_cvtepi16_epi64 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a); - } - else static if (LDC_with_optimizations) - { - // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64 - enum ir = ` - %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> - %r = sext <2 x i16> %v to <2 x i64> - ret <2 x i64> %r`; - return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a); - } - else - { - short8 sa = cast(short8)a; - long2 r; - r.ptr[0] = sa.array[0]; - r.ptr[1] = sa.array[1]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0); - long2 C = cast(long2) _mm_cvtepi16_epi64(A); - long[2] correct = [-32768, 32767]; - assert(C.array == correct); -} - -/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers. -__m128i _mm_cvtepi32_epi64 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a); - } - else static if (LDC_with_optimizations) - { - // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64 - enum ir = ` - %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> - %r = sext <2 x i32> %v to <2 x i64> - ret <2 x i64> %r`; - return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a); - } - else - { - int4 sa = cast(int4)a; - long2 r; - r.ptr[0] = sa.array[0]; - r.ptr[1] = sa.array[1]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi32(-4, 42, 0, 0); - long2 C = cast(long2) _mm_cvtepi32_epi64(A); - long[2] correct = [-4, 42]; - assert(C.array == correct); -} - - -/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers. -__m128i _mm_cvtepi8_epi16 (__m128i a) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - alias ubyte16 = __vector(ubyte[16]); - return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a); - } - else static if (LDC_with_optimizations) - { - // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0 - // LDC ARM64: sshll generated since LDC 1.8.0 -O1 - enum ir = ` - %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> - %r = sext <8 x i8> %v to <8 x i16> - ret <8 x i16> %r`; - return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a); - } - else - { - byte16 sa = cast(byte16)a; - short8 r; - foreach(n; 0..8) - r.ptr[n] = sa.array[n]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); - short8 C = cast(short8) _mm_cvtepi8_epi16(A); - short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8]; - assert(C.array == correct); -} - - -/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers. 
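-/// Only the low four 8-bit lanes of `a` participate.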
-__m128i _mm_cvtepi8_epi32 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - alias ubyte16 = __vector(ubyte[16]); - return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a); - } - else static if (LDC_with_SSE41 && LDC_with_optimizations) - { - // LDC x86: Generates pmovsxbd since LDC 1.1 -O0 - enum ir = ` - %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> - %r = sext <4 x i8> %v to <4 x i32> - ret <4 x i32> %r`; - return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a); - } - else - { - // LDC ARM64: this gives the same codegen than a vmovl_s16/vmovl_s8 sequence would - byte16 sa = cast(byte16)a; - int4 r; - r.ptr[0] = sa.array[0]; - r.ptr[1] = sa.array[1]; - r.ptr[2] = sa.array[2]; - r.ptr[3] = sa.array[3]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); - int4 C = cast(int4) _mm_cvtepi8_epi32(A); - int[4] correct = [127, -128, 1, -1]; - assert(C.array == correct); -} - - -/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers. -__m128i _mm_cvtepi8_epi64 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - alias ubyte16 = __vector(ubyte[16]); - return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a); - } - else static if (LDC_with_optimizations) - { - // LDC x86: Generates pmovsxbq since LDC 1.1 -O0, - // LDC arm64: it's ok since LDC 1.8 -O1 - enum ir = ` - %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> - %r = sext <2 x i8> %v to <2 x i64> - ret <2 x i64> %r`; - return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a); - } - else - { - byte16 sa = cast(byte16)a; - long2 r; - foreach(n; 0..2) - r.ptr[n] = sa.array[n]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); - long2 C = cast(long2) _mm_cvtepi8_epi64(A); - long[2] correct = [127, -128]; - assert(C.array == correct); -} - - -/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers. -__m128i _mm_cvtepu16_epi32 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a); - } - else - { - // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1 - // arm64: ushll since LDC 1.12 -O1 - short8 sa = cast(short8)a; - int4 r; - r.ptr[0] = cast(ushort)sa.array[0]; - r.ptr[1] = cast(ushort)sa.array[1]; - r.ptr[2] = cast(ushort)sa.array[2]; - r.ptr[3] = cast(ushort)sa.array[3]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0); - int4 C = cast(int4) _mm_cvtepu16_epi32(A); - int[4] correct = [65535, 0, 32768, 32767]; - assert(C.array == correct); -} - - -/// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers. 
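-/// Only the low two 16-bit lanes of `a` participate; e.g. a lane holding 0xFFFF widens to 65535, not -1.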
-__m128i _mm_cvtepu16_epi64 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a); - } - else static if (LDC_with_ARM64) - { - // LDC arm64: a bit shorter than below, in -O2 - short8 sa = cast(short8)a; - long2 r; - for(int n = 0; n < 2; ++n) - r.ptr[n] = cast(ushort)sa.array[n]; - return cast(__m128i)r; - } - else - { - // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1 - short8 sa = cast(short8)a; - long2 r; - r.ptr[0] = cast(ushort)sa.array[0]; - r.ptr[1] = cast(ushort)sa.array[1]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0); - long2 C = cast(long2) _mm_cvtepu16_epi64(A); - long[2] correct = [65535, 0]; - assert(C.array == correct); -} - - -/// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers. -__m128i _mm_cvtepu32_epi64 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(short8)a); - } - else - { - // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1 - // arm64: generates ushll since LDC 1.12 -O1 - int4 sa = cast(int4)a; - long2 r; - r.ptr[0] = cast(uint)sa.array[0]; - r.ptr[1] = cast(uint)sa.array[1]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi32(-1, 42, 0, 0); - long2 C = cast(long2) _mm_cvtepu32_epi64(A); - long[2] correct = [4294967295, 42]; - assert(C.array == correct); -} - - -/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers. -__m128i _mm_cvtepu8_epi16 (__m128i a) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a); - } - else static if (LDC_with_optimizations) - { - enum ir = ` - %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> - %r = zext <8 x i8> %v to <8 x i16> - ret <8 x i16> %r`; - return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a); - } - else - { - return _mm_unpacklo_epi8(a, _mm_setzero_si128()); - } -} -unittest -{ - __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); - short8 C = cast(short8) _mm_cvtepu8_epi16(A); - short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248]; - assert(C.array == correct); -} - - -/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers. -__m128i _mm_cvtepu8_epi32 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - alias ubyte16 = __vector(ubyte[16]); - return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a); - } - else static if (LDC_with_ARM64) - { - // LDC arm64: a bit better than below in -O2 - byte16 sa = cast(byte16)a; - int4 r; - for(int n = 0; n < 4; ++n) - r.ptr[n] = cast(ubyte)sa.array[n]; - return cast(__m128i)r; - } - else - { - // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1 - // PERF: catastrophic with GDC without SSE4.1 - byte16 sa = cast(byte16)a; - int4 r; - r.ptr[0] = cast(ubyte)sa.array[0]; - r.ptr[1] = cast(ubyte)sa.array[1]; - r.ptr[2] = cast(ubyte)sa.array[2]; - r.ptr[3] = cast(ubyte)sa.array[3]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); - int4 C = cast(int4) _mm_cvtepu8_epi32(A); - int[4] correct = [127, 128, 1, 255]; - assert(C.array == correct); -} - -/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers. 
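-/// In practice only the low two bytes contribute; e.g. the bytes `[127, 0xFE]` widen to `[127, 254]`.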
-__m128i _mm_cvtepu8_epi64 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - alias ubyte16 = __vector(ubyte[16]); - return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a); - } - else static if (LDC_with_ARM64) - { - // LDC arm64: this optimizes better than the loop below - byte16 sa = cast(byte16)a; - long2 r; - for (int n = 0; n < 2; ++n) - r.ptr[n] = cast(ubyte)sa.array[n]; - return cast(__m128i)r; - } - else - { - // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1 - byte16 sa = cast(byte16)a; - long2 r; - r.ptr[0] = cast(ubyte)sa.array[0]; - r.ptr[1] = cast(ubyte)sa.array[1]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0); - long2 C = cast(long2) _mm_cvtepu8_epi64(A); - long[2] correct = [127, 254]; - assert(C.array == correct); -} - -/// Conditionally multiply the packed double-precision (64-bit) floating-point elements -/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, and conditionally -/// store the sum in dst using the low 4 bits of `imm8`. -__m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return __builtin_ia32_dppd(a, b, imm8 & 0x33); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_dppd(a, b, imm8 & 0x33); - } - else - { - __m128d zero = _mm_setzero_pd(); - __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b); - double sum = temp.array[0] + temp.array[1]; - return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum)); - } -} -unittest -{ - __m128d A = _mm_setr_pd(1.0, 2.0); - __m128d B = _mm_setr_pd(4.0, 8.0); - double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B); - double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B); - double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B); - double[2] correct1 = [ 4.0, 4.0]; - double[2] correct2 = [16.0, 0.0]; - double[2] correct3 = [ 0.0, 20.0]; - assert(R1.array == correct1); - assert(R2.array == correct2); - assert(R3.array == correct3); -} - -/// Conditionally multiply the packed single-precision (32-bit) floating-point elements -/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, -/// and conditionally store the sum in result using the low 4 bits of `imm8`. -__m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return __builtin_ia32_dpps(a, b, cast(ubyte)imm8); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_dpps(a, b, cast(byte)imm8); - } - else - { - __m128 zero = _mm_setzero_ps(); - __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b); - float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3]; - return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum)); - } -} -unittest -{ - __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f); - __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f); - float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B); - float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B); - float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B); - float[4] correct1 = [67.0f, 67.0f, 67.0f, 67.0f]; - float[4] correct2 = [23.0f, 0.0f, 23.0f, 0.0f]; - float[4] correct3 = [0.0f, 29.0f, 0.0f, 29.0f]; - assert(R1.array == correct1); - assert(R2.array == correct2); - assert(R3.array == correct3); -} - - -/// Extract a 32-bit integer from `a`, selected with `imm8`. 
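-/// Only `imm8[1:0]` is honoured, so the index effectively wraps modulo 4.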
-int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted -{ - return (cast(int4)a).array[imm8 & 3]; -} -unittest -{ - __m128i A = _mm_setr_epi32(1, 2, 3, 4); - assert(_mm_extract_epi32(A, 0) == 1); - assert(_mm_extract_epi32(A, 1 + 8) == 2); - assert(_mm_extract_epi32(A, 3 + 4) == 4); -} - -/// Extract a 64-bit integer from `a`, selected with `imm8`. -long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted -{ - long2 la = cast(long2)a; - return la.array[imm8 & 1]; -} -unittest -{ - __m128i A = _mm_setr_epi64(45, -67); - assert(_mm_extract_epi64(A, 0) == 45); - assert(_mm_extract_epi64(A, 1) == -67); - assert(_mm_extract_epi64(A, 2) == 45); -} - -/// Extract an 8-bit integer from `a`, selected with `imm8`. -/// Warning: the returned value is zero-extended to 32-bits. -int _mm_extract_epi8 (__m128i a, const int imm8) @trusted -{ - byte16 ba = cast(byte16)a; - return cast(ubyte) ba.array[imm8 & 15]; -} -unittest -{ - __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15); - assert(_mm_extract_epi8(A, 7) == 7); - assert(_mm_extract_epi8(A, 13) == 255); - assert(_mm_extract_epi8(A, 7 + 16) == 7); -} - -/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`. -/// Note: returns a 32-bit $(I integer). -int _mm_extract_ps (__m128 a, const int imm8) @trusted -{ - return (cast(int4)a).array[imm8 & 3]; -} -unittest -{ - __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f); - assert(_mm_extract_ps(A, 0) == 0x3f800000); - assert(_mm_extract_ps(A, 1 + 8) == 0x40000000); - assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000); -} - - - -/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an -/// integer value, and store the results as packed double-precision floating-point elements. -__m128d _mm_floor_pd (__m128d a) @trusted -{ - static if (LDC_with_ARM64) - { - // LDC arm64 acceptable since 1.8 -O2 - long2 l = vcvtmq_s64_f64(a); - double2 r; - r.ptr[0] = l.array[0]; - r.ptr[1] = l.array[1]; - return r; - } - else - { - return _mm_round_pd!1(a); - } -} -unittest -{ - __m128d A = _mm_setr_pd(1.3f, -2.12f); - __m128d B = _mm_setr_pd(53.6f, -2.7f); - A = _mm_floor_pd(A); - B = _mm_floor_pd(B); - double[2] correctA = [1.0, -3.0]; - double[2] correctB = [53.0, -3.0]; - assert(A.array == correctA); - assert(B.array == correctB); -} - -/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an -/// integer value, and store the results as packed single-precision floating-point elements. -__m128 _mm_floor_ps (__m128 a) @trusted -{ - static if (LDC_with_ARM64) - { - // LDC arm64 acceptable since 1.8 -O1 - int4 l = vcvtmq_s32_f32(a); - float4 r; - r.ptr[0] = l.array[0]; - r.ptr[1] = l.array[1]; - r.ptr[2] = l.array[2]; - r.ptr[3] = l.array[3]; - return r; - } - else - { - return _mm_round_ps!1(a); - } -} -unittest -{ - __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f); - __m128 C = _mm_floor_ps(A); - float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f]; - assert(C.array == correct); -} - -/// Round the lower double-precision (64-bit) floating-point element in `b` down to an -/// integer value, store the result as a double-precision floating-point element in the -/// lower element, and copy the upper element from `a` to the upper element. 
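-/// For instance, `_mm_floor_sd([1.3, -2.12], [-53.1, -3.7])` gives `[-54.0, -2.12]`.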
-__m128d _mm_floor_sd (__m128d a, __m128d b) @trusted -{ - static if (LDC_with_ARM64) - { - a[0] = vcvtms_s64_f64(b[0]); - return a; - } - else - { - return _mm_round_sd!1(a, b); - } -} -unittest -{ - __m128d A = _mm_setr_pd(1.3, -2.12); - __m128d B = _mm_setr_pd(-53.1, -3.7); - __m128d C = _mm_floor_sd(A, B); - double[2] correct = [-54.0, -2.12]; - assert(C.array == correct); -} - -/// Round the lower single-precision (32-bit) floating-point element in `b` down to an -/// integer value, store the result as a single-precision floating-point element in the -/// lower element, and copy the upper 3 packed elements from `a` to the upper elements. -__m128 _mm_floor_ss (__m128 a, __m128 b) @trusted -{ - static if (LDC_with_ARM64) - { - a[0] = vcvtms_s32_f32(b[0]); - return a; - } - else - { - return _mm_round_ss!1(a, b); - } -} -unittest -{ - __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f); - __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f); - __m128 C = _mm_floor_ss(A, B); - float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f]; - assert(C.array == correct); -} - -/// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`. -__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted -{ - // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1 - // LDC x86: psinrd since LDC 1.1 -O2 with -mattr=+sse4.1 - // LDC arm64: ins.s since LDC 1.8 -O2 - int4 ia = cast(int4)a; - ia.ptr[imm8 & 3] = i; - return cast(__m128i)ia; -} -unittest -{ - __m128i A = _mm_setr_epi32(1, 2, 3, 4); - int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4); - int[4] result = [1, 2, 5, 4]; - assert(C.array == result); -} - -/// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`. -__m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted -{ - // GDC: nothing special to do, psinrq generated with -O1 -msse4.1 - // LDC x86: always do something sensible. - long2 la = cast(long2)a; - la.ptr[imm8 & 1] = i; - return cast(__m128i)la; -} -unittest -{ - __m128i A = _mm_setr_epi64(1, 2); - long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2); - long[2] result = [1, 5]; - assert(C.array == result); -} - -/// Insert the 8-bit integer `i` into `a` at the location specified by `imm8[2:0]`. -/// Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8. -__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted -{ - // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1 - // LDC x86: doesn't do pinsrb, maybe it's slower. arm64 also spills to memory. - byte16 ba = cast(byte16)a; - ba.ptr[imm8 & 15] = cast(byte)i; - return cast(__m128i)ba; -} -unittest -{ - __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16); - byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; - assert(C.array == result); -} - - -/// Warning: of course it does something totally different from `_mm_insert_epi32`! -/// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b` -/// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]` -/// (elements are zeroed out when the corresponding bit is set). 
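-/// `imm8[7:6]` selects the source lane of `b`, `imm8[5:4]` the destination lane in `tmp`,
-/// and each bit set in `imm8[3:0]` zeroes the corresponding result lane.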
-__m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return __builtin_ia32_insertps128(a, b, cast(ubyte)imm8); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_insertps128(a, b, cast(byte)imm8); - } - else - { - float4 tmp2 = a; - float tmp1 = b.array[(imm8 >> 6) & 3]; - tmp2.ptr[(imm8 >> 4) & 3] = tmp1; - return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps()); - } -} -unittest -{ - __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); - __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f); - __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B); - float[4] correct = [1.0f, 2.0f, 0.0f, 7.0f]; - assert(C.array == correct); -} - - -/// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values. -__m128i _mm_max_epi32 (__m128i a, __m128i b) pure @trusted -{ - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b); - } - else version(LDC) - { - // x86: pmaxsd since LDC 1.1 -O1 - // ARM: smax.4s since LDC 1.8 -01 - int4 sa = cast(int4)a; - int4 sb = cast(int4)b; - static if (SIMD_COMPARISON_MASKS_16B) - int4 greater = sa > sb; - else - int4 greater = greaterMask!int4(sa, sb); - return cast(__m128i)( (greater & sa) | (~greater & sb) ); - } - else - { - __m128i higher = _mm_cmpgt_epi32(a, b); - __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b - __m128i mask = _mm_and_si128(aTob, higher); - return _mm_xor_si128(b, mask); - } -} -unittest -{ - int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7), - _mm_setr_epi32( -4,-8, 9, -8)); - int[4] correct = [0x7fffffff, 1, 9, 7]; - assert(R.array == correct); -} - -/// Compare packed signed 8-bit integers in `a` and `b`, -/// and return packed maximum values. -__m128i _mm_max_epi8 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b); - } - else version(LDC) - { - // x86: pmaxsb since LDC 1.1 -O1 - // ARM64: smax.16b since LDC 1.8.0 -O1 - byte16 sa = cast(byte16)a; - byte16 sb = cast(byte16)b; - static if (SIMD_COMPARISON_MASKS_16B) - byte16 greater = sa > sb; - else - byte16 greater = cast(byte16) greaterMask!byte16(sa, sb); - return cast(__m128i)( (greater & sa) | (~greater & sb) ); - } - else - { - __m128i lower = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b else - __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b - __m128i mask = _mm_and_si128(aTob, lower); - return _mm_xor_si128(b, mask); - } -} -unittest -{ - __m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); - __m128i B = _mm_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - byte16 R = cast(byte16) _mm_max_epi8(A, B); - byte[16] correct = [127, 1, 9, -7, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0]; - assert(R.array == correct); -} - -/// Compare packed unsigned 16-bit integers in `a` and `b`, returns packed maximum values. -__m128i _mm_max_epu16 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b); - } - else version(LDC) - { - // x86: pmaxuw since LDC 1.1 -O1 - // ARM64: umax.8h since LDC 1.8.0 -O1 - // PERF: without sse4.1, LLVM 12 produces a very interesting - // psubusw xmm0, xmm1 - // paddw xmm0, xmm1 - // sequence that maybe should go in other min/max intrinsics? 
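-        // (that sequence works because, with unsigned saturating subtraction,
-        // (a - b) + b == max(a, b))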
- ushort8 sa = cast(ushort8)a; - ushort8 sb = cast(ushort8)b; - static if (SIMD_COMPARISON_MASKS_16B) - { - // Note: doesn't work well with GDC, which prefers the builtin. - ushort8 greater = sa > sb; - } - else - ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb); - return cast(__m128i)( (greater & sa) | (~greater & sb) ); - } - else - { - b = _mm_subs_epu16(b, a); - b = _mm_add_epi16(b, a); - return b; - } -} -unittest -{ - short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57), - _mm_setr_epi16( -4, -8, 9, -7, 0,-32768, 0, 0)); - short[8] correct = [ -4, -8, -4, -7, 9,-32768, 0, 57]; - assert(R.array == correct); -} - -/// Compare packed unsigned 32-bit integers in `a` and `b`, returns packed maximum values. -__m128i _mm_max_epu32 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b); - } - else version(LDC) - { - // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1 - // ARM64: umax.4s since LDC 1.8.0 -O1 - uint4 sa = cast(uint4)a; - uint4 sb = cast(uint4)b; - static if (SIMD_COMPARISON_MASKS_16B) - uint4 greater = sa > sb; - else - uint4 greater = cast(uint4) greaterMask!uint4(sa, sb); - return cast(__m128i)( (greater & sa) | (~greater & sb) ); - } - else - { - // PERF: LLVM suggests to replace the _mm_add_epi32 by _mm_xor_si128, and the last xor by an "_mm_or_si128" - /+ - movdqa xmm2, xmmword ptr [-0x80000000, -0x80000000, -0x80000000, -0x80000000] - movdqa xmm3, xmm1 - pxor xmm3, xmm2 - pxor xmm2, xmm0 - pcmpgtd xmm2, xmm3 - pand xmm0, xmm2 - pandn xmm2, xmm1 - por xmm0, xmm2 - +/ - __m128i valueShift = _mm_set1_epi32(-0x80000000); - __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift)); - __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b - __m128i mask = _mm_and_si128(aTob, higher); - return _mm_xor_si128(b, mask); - } -} -unittest -{ - int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7), - _mm_setr_epi32( -4,-8, 9, -8)); - int[4] correct = [ -4,-8, 9, -7]; - assert(R.array == correct); -} - -/// Compare packed signed 32-bit integers in `a` and `b`, returns packed maximum values. -__m128i _mm_min_epi32 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b); - } - else version(LDC) - { - // x86: pminsd since LDC 1.1 -O1, also good without sse4.1 - // ARM: smin.4s since LDC 1.8 -01 - int4 sa = cast(int4)a; - int4 sb = cast(int4)b; - static if (SIMD_COMPARISON_MASKS_16B) - int4 greater = sa > sb; - else - int4 greater = greaterMask!int4(sa, sb); - return cast(__m128i)( (~greater & sa) | (greater & sb) ); - } - else - { - __m128i higher = _mm_cmplt_epi32(a, b); - __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b - __m128i mask = _mm_and_si128(aTob, higher); - return _mm_xor_si128(b, mask); - } -} -unittest -{ - int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff, 1, -4, 7), - _mm_setr_epi32( -4, -8, 9, -8)); - int[4] correct = [ -4, -8, -4, -8]; - assert(R.array == correct); -} - -/// Compare packed signed 8-bit integers in `a` and `b`, -/// and return packed minimum values. 
-__m128i _mm_min_epi8 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b); - } - else version(LDC) - { - // x86: pminsb since LDC 1.1 -O1 - // ARM64: smin.16b since LDC 1.8.0 -O1 - byte16 sa = cast(byte16)a; - byte16 sb = cast(byte16)b; - static if (SIMD_COMPARISON_MASKS_16B) - byte16 greater = sa > sb; - else - byte16 greater = cast(byte16) greaterMask!byte16(sa, sb); - return cast(__m128i)( (~greater & sa) | (greater & sb) ); - } - else - { - __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else - __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b - __m128i mask = _mm_and_si128(aTob, lower); - return _mm_xor_si128(b, mask); - } -} -unittest -{ - __m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); - __m128i B = _mm_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - byte16 R = cast(byte16) _mm_min_epi8(A, B); - byte[16] correct = [ 4, -8, -4, -8, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; - assert(R.array == correct); -} - -/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst. -__m128i _mm_min_epu16 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b); - } - else version(LDC) - { - // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1 - // ARM64: umin.8h since LDC 1.8.0 -O1 - ushort8 sa = cast(ushort8)a; - ushort8 sb = cast(ushort8)b; - static if (SIMD_COMPARISON_MASKS_16B) - ushort8 greater = (sb > sa); - else - ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa); - return cast(__m128i)( (greater & sa) | (~greater & sb) ); - } - else - { - __m128i c = _mm_subs_epu16(b, a); - b = _mm_sub_epi16(b, c); - return b; - } -} -unittest -{ - short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57), - _mm_setr_epi16( -4, -8, 9, -7, 0,-32768, 0, 0)); - short[8] correct = [32767, 1, 9, -8, 0, 7, 0, 0]; - assert(R.array == correct); -} - -/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst. -__m128i _mm_min_epu32 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b); - } - else version(LDC) - { - // x86: pminud since LDC 1.1 -O1, also good without sse4.1 - // ARM64: umin.4s since LDC 1.8.0 -O1 - uint4 sa = cast(uint4)a; - uint4 sb = cast(uint4)b; - static if (SIMD_COMPARISON_MASKS_16B) - uint4 greater = sa > sb; - else - uint4 greater = cast(uint4) greaterMask!uint4(sa, sb); - return cast(__m128i)( (~greater & sa) | (greater & sb) ); - } - else - { - // PERF: same remark as in _mm_max_epu32 - __m128i valueShift = _mm_set1_epi32(-0x80000000); - __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift)); - __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b - __m128i mask = _mm_and_si128(aTob, higher); - return _mm_xor_si128(b, mask); - } -} -unittest -{ - int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7), - _mm_setr_epi32( -4,-8, 9, -8)); - int[4] correct = [0x7fffffff, 1, 4, -8]; - assert(R.array == correct); -} - -/// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`, -/// store the minimum and index in return value, and zero the remaining bits. 
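-/// e.g. for lanes [14, 15, 1, 2, 65533, 4, 5, 6] the minimum is 1, found at index 2,
-/// so lane 0 of the result holds 1, lane 1 holds 2, and lanes 2..7 are zero.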
-__m128i _mm_minpos_epu16 (__m128i a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a); - } - else static if (LDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a); - } - else static if (LDC_with_ARM64) - { - __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); - __m128i combinedLo = _mm_unpacklo_epi16(indices, a); - __m128i combinedHi = _mm_unpackhi_epi16(indices, a); - __m128i best = _mm_min_epu32(combinedLo, combinedHi); - best = _mm_min_epu32(best, _mm_srli_si128!8(best)); - best = _mm_min_epu32(best, _mm_srli_si128!4(best)); - short8 sbest = cast(short8)best; - short8 r; - r[0] = sbest[1]; - r[1] = sbest[0]; // Note: the search must have inverted index in order to prioritize lower index in case of tie - r[2] = 0; - r[3] = 0; - r[4] = 0; - r[5] = 0; - r[6] = 0; - r[7] = 0; - return cast(__m128i)r; - } - else - { - short8 sa = cast(short8)a; - ushort min = 0xffff; - int index = 0; - for(int n = 0; n < 8; ++n) - { - ushort c = sa.array[n]; - if (c < min) - { - min = c; - index = n; - } - } - short8 r; - r.ptr[0] = min; - r.ptr[1] = cast(short)index; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6); - __m128i B = _mm_setr_epi16(14, 4, 4, 2, -3, 2, 5, 6); - short8 R1 = cast(short8) _mm_minpos_epu16(A); - short8 R2 = cast(short8) _mm_minpos_epu16(B); - short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0]; - short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0]; - assert(R1.array == correct1); - assert(R2.array == correct2); -} - -/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers -/// in `a` compared to those in `b`, and store the 16-bit results in dst. -/// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`. -/// One quadruplet is selected from `b` starting at on the offset specified in `imm8[1:0]`. -/// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting -/// at the offset specified in `imm8[2]`. -__m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_mpsadbw128(cast(ubyte16)a, cast(ubyte16)b, cast(ubyte)imm8); - } - else static if (LDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8); - } - else - { - int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high order quadruplet are unaddressable... 
- int b_offset = (imm8 & 3) * 4; - - byte16 ba = cast(byte16)a; - byte16 bb = cast(byte16)b; - short8 r; - - __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0); - - for (int j = 0; j < 8; j += 2) - { - int k = a_offset + j; - __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3], - 0, 0, 0, 0, - ba[k+1], ba[k+2], ba[k+3], ba[k+4], - 0, 0, 0, 0); - short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing this wins instructions in both x86 and arm64 - r.ptr[j] = diffs.array[0]; - r.ptr[j+1] = diffs.array[4]; - } - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5, 5, 5, 12, 13, 14, 15); - short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23]; - short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749]; - short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35]; - short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741]; - short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4]; - short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B); - short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B); - short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B); - short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B); - short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B); - assert(r1.array == correct1); - assert(r4.array == correct4); - assert(r5.array == correct5); - assert(r7.array == correct7); - assert(r8.array == correct0); -} - -/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst. -__m128i _mm_mul_epi32 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b); - } - else static if (LDC_with_SSE41 && LDC_with_optimizations) - { - // For some reason, clang has the builtin but it's not in IntrinsicsX86.td - // Use IR instead. - // This generates pmuldq with since LDC 1.2.0 -O0 - enum ir = ` - %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> - %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> - %la = sext <2 x i32> %ia to <2 x i64> - %lb = sext <2 x i32> %ib to <2 x i64> - %r = mul <2 x i64> %la, %lb - ret <2 x i64> %r`; - return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b); - } - else static if (LDC_with_ARM64) - { - // 3 instructions since LDC 1.8 -O2 - // But had to make vmull_s32 be a builtin else it wouldn't optimize to smull - int2 a_lo = vmovn_s64(cast(long2)a); - int2 b_lo = vmovn_s64(cast(long2)b); - return cast(__m128i) vmull_s32(a_lo, b_lo); - } - else - { - int4 ia = cast(int4)a; - int4 ib = cast(int4)b; - long2 r; - r.ptr[0] = cast(long)ia.array[0] * ib.array[0]; - r.ptr[1] = cast(long)ia.array[2] * ib.array[2]; - return cast(__m128i)r; - } -} -unittest -{ - __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3); - __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0); - long2 R = cast(long2) _mm_mul_epi32(A, B); - long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144]; - assert(R.array == correct); -} - -/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers, -/// return the low 32 bits of the intermediate integers. 
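-/// e.g. a lane computing 0x1_0000 * 0x1_0000 = 0x1_0000_0000 yields 0,
-/// since only the low 32 bits of the 64-bit product are kept.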
-__m128i _mm_mullo_epi32 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - // PERF GDC without SSE4.1 could be better - static if (GDC_with_SSE41) - { - int4 ia = cast(int4)a; - int4 ib = cast(int4)b; - // Note: older GDC doesn't have that op, but older GDC - // also has no support for -msse4.1 detection - return cast(__m128i)(a * b); - } - else version(LDC) - { - int4 ia = cast(int4)a; - int4 ib = cast(int4)b; - return cast(__m128i)(a * b); - } - else - { - // DMD doesn't take the above - int4 ia = cast(int4)a; - int4 ib = cast(int4)b; - int4 r; - r.ptr[0] = ia.array[0] * ib.array[0]; - r.ptr[1] = ia.array[1] * ib.array[1]; - r.ptr[2] = ia.array[2] * ib.array[2]; - r.ptr[3] = ia.array[3] * ib.array[3]; - return r; - } -} -unittest -{ - __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3); - __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0); - int4 R = cast(int4) _mm_mullo_epi32(A, B); - int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0]; - assert(R.array == correct); -} - - -/// Convert packed signed 32-bit integers from `a` and `b` -/// to packed 16-bit integers using unsigned saturation. -__m128i _mm_packus_epi32 (__m128i a, __m128i b) pure @trusted -{ - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b); - } - else static if (LDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b); - } - else static if (LDC_with_ARM64) - { - int4 z; - z = 0; - return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)), - vqmovn_u32(vmaxq_s32(z, cast(int4)b))); - } - else - { - __m128i i32768 = _mm_set1_epi32(32768); - __m128i s32768 = _mm_set1_epi16(-32768); - a = _mm_sub_epi32(a, i32768); - b = _mm_sub_epi32(b, i32768); - __m128i clampedSigned = _mm_packs_epi32(a, b); - return _mm_add_epi16(clampedSigned, s32768); - } -} -unittest -{ - __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0); - short8 R = cast(short8) _mm_packus_epi32(A, A); - short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0]; - assert(R.array == correct); -} - - -/// Round the packed double-precision (64-bit) floating-point elements in `a` using the -/// rounding parameter, and store the results as packed double-precision floating-point elements. 
-/// Rounding is done according to the rounding[3:0] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -__m128d _mm_round_pd(int rounding)(__m128d a) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return __builtin_ia32_roundpd(a, rounding); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_roundpd(a, rounding); - } - else - { - static if (rounding & _MM_FROUND_CUR_DIRECTION) - { - // Convert to 64-bit integers - long lo = _mm_cvtsd_si64(a); - a.ptr[0] = a.array[1]; - long hi = _mm_cvtsd_si64(a); - return _mm_setr_pd(lo, hi); - } - else - { - version(GNU) pragma(inline, false); // else fail unittest with optimizations - - uint old = _MM_GET_ROUNDING_MODE(); - _MM_SET_ROUNDING_MODE((rounding & 3) << 13); - - // Convert to 64-bit integers - long lo = _mm_cvtsd_si64(a); - a.ptr[0] = a.array[1]; - long hi = _mm_cvtsd_si64(a); - - // Convert back to double to achieve the rounding - // The problem is that a 64-bit double can't represent all the values - // a 64-bit integer can (and vice-versa). So this function won't work for - // large values. (MAYDO: what range exactly?) - _MM_SET_ROUNDING_MODE(old); - return _mm_setr_pd(lo, hi); - } - } -} -unittest -{ - // tested in other intrinsics -} - -/// Round the packed single-precision (32-bit) floating-point elements in `a` using the -/// rounding parameter, and store the results as packed single-precision floating-point elements. -/// Rounding is done according to the rounding[3:0] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -__m128 _mm_round_ps(int rounding)(__m128 a) @trusted -{ - // PERF ARM64: there is duplication because this isn't optimal for ARM64, so it is avoided externally - static if (GDC_or_LDC_with_SSE41) - { - return __builtin_ia32_roundps(a, rounding); - } - else - { - static if (rounding & _MM_FROUND_CUR_DIRECTION) - { - __m128i integers = _mm_cvtps_epi32(a); - return _mm_cvtepi32_ps(integers); - } - else - { - version(LDC) pragma(inline, false); // else _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 gets shuffled - uint old = _MM_GET_ROUNDING_MODE(); - _MM_SET_ROUNDING_MODE((rounding & 3) << 13); - scope(exit) _MM_SET_ROUNDING_MODE(old); - - // Convert to 64-bit integers - __m128i integers = _mm_cvtps_epi32(a); - - // Convert back to float to achieve the rounding - // The problem is that a 32-float can't represent all the values - // a 32-bit integer can (and vice-versa). So this function won't work for - // large values. (MAYDO: what range exactly?) 
- __m128 result = _mm_cvtepi32_ps(integers); - - return result; - } - } -} -unittest -{ - // tested in other intrinsics -} - - -/// Round the lower double-precision (64-bit) floating-point element in `b` using the -/// rounding parameter, store the result as a double-precision floating-point element -/// in the lower element of result, and copy the upper element from `a` to the upper element of result. -/// Rounding is done according to the rounding[3:0] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -__m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted -{ - static if (GDC_with_SSE41) - { - return __builtin_ia32_roundsd(a, b, rounding); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_roundsd(a, b, rounding); - } - else - { - static if (rounding & _MM_FROUND_CUR_DIRECTION) - { - // Convert to 64-bit integer - long b0 = _mm_cvtsd_si64(b); - a.ptr[0] = b0; - return a; - } - else - { - version(GNU) pragma(inline, false); // else fail unittest with optimizations - - uint old = _MM_GET_ROUNDING_MODE(); - _MM_SET_ROUNDING_MODE((rounding & 3) << 13); - - // Convert to 64-bit integer - long b0 = _mm_cvtsd_si64(b); - a.ptr[0] = b0; - - // Convert back to double to achieve the rounding - // The problem is that a 64-bit double can't represent all the values - // a 64-bit integer can (and vice-versa). So this function won't work for - // large values. (MAYDO: what range exactly?) - _MM_SET_ROUNDING_MODE(old); - return a; - } - } -} -unittest -{ - // tested in other intrinsics -} - - -/// Round the lower single-precision (32-bit) floating-point element in `b` using the -/// rounding parameter, store the result as a single-precision floating-point element -/// in the lower element of result, and copy the upper 3 packed elements from `a` -/// to the upper elements of result. -/// Rounding is done according to the rounding[3:0] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE -__m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted -{ - static if (GDC_with_SSE41) - { - return __builtin_ia32_roundss(a, b, rounding); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_roundss(a, b, rounding); - } - else - { - static if (rounding & _MM_FROUND_CUR_DIRECTION) - { - int b0 = _mm_cvtss_si32(b); - a.ptr[0] = b0; - return a; - } - else version(GNU) - { - pragma(inline, false) - __m128 GDCworkaround() nothrow @nogc @trusted - { - uint old = _MM_GET_ROUNDING_MODE(); - _MM_SET_ROUNDING_MODE((rounding & 3) << 13); - - // Convert to 32-bit integer - int b0 = _mm_cvtss_si32(b); - a.ptr[0] = b0; - - // Convert back to double to achieve the rounding - // The problem is that a 32-bit float can't represent all the values - // a 32-bit integer can (and vice-versa). 
So this function won't work for - // large values. (MAYDO: what range exactly?) - _MM_SET_ROUNDING_MODE(old); - return a; - } - return GDCworkaround(); - } - else - { - uint old = _MM_GET_ROUNDING_MODE(); - _MM_SET_ROUNDING_MODE((rounding & 3) << 13); - - // Convert to 32-bit integer - int b0 = _mm_cvtss_si32(b); - a.ptr[0] = b0; - - // Convert back to double to achieve the rounding - // The problem is that a 32-bit float can't represent all the values - // a 32-bit integer can (and vice-versa). So this function won't work for - // large values. (MAYDO: what range exactly?) - _MM_SET_ROUNDING_MODE(old); - return a; - } - } -} -unittest -{ - // tested in other intrinsics -} - - -/// Load 128-bits of integer data from memory using a non-temporal memory hint. -/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection -/// exception may be generated. -__m128i _mm_stream_load_si128 (void* mem_addr) pure @trusted -{ - // PERF DMD D_SIMD - static if (GDC_with_SSE41) - { - return cast(__m128i) __builtin_ia32_movntdqa(cast(long2*)mem_addr); - } - else static if (LDC_with_InlineIREx && LDC_with_optimizations) - { - enum prefix = `!0 = !{ i32 1 }`; - enum ir = ` - %r = load <4 x i32>, <4 x i32>* %0, !nontemporal !0 - ret <4 x i32> %r`; - return cast(__m128i) LDCInlineIREx!(prefix, ir, "", int4, int4*)(cast(__m128i*)mem_addr); - } - else - { - return *cast(__m128i*)mem_addr; // regular move instead - } -} -unittest -{ - align(16) static immutable int[4] correct = [1, 2, 3, 4]; - __m128i A = _mm_stream_load_si128(cast(__m128i*)(correct.ptr)); - _mm_mfence(); - assert(A.array == correct); -} - -/// Return 1 if all bits in `a` are all 1's. Else return 0. -int _mm_test_all_ones (__m128i a) @safe -{ - return _mm_testc_si128(a, _mm_set1_epi32(-1)); -} -unittest -{ - __m128i A = _mm_set1_epi32(-1); - __m128i B = _mm_set_epi32(-1, -2, -1, -1); - assert(_mm_test_all_ones(A) == 1); - assert(_mm_test_all_ones(B) == 0); -} - -/// Return 1 if all bits in `a` are all 0's. Else return 0. -// This is a #BONUS since it was lacking in Intel Intrinsics API. -int _mm_test_all_zeros (__m128i a) @safe -{ - return _mm_testz_si128(a, _mm_set1_epi32(-1)); -} -unittest -{ - __m128i A = _mm_set1_epi32(0); - __m128i B = _mm_set_epi32(0, 8, 0, 0); - assert(_mm_test_all_zeros(A) == 1); - assert(_mm_test_all_zeros(B) == 0); -} - -/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, -/// and return 1 if the result is zero, otherwise return 0. -int _mm_test_all_zeros (__m128i a, __m128i mask) @safe -{ - return _mm_testz_si128(a, mask); // it's really the same, but with a good name -} - -/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and mask, and set ZF to 1 -/// if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of a and then AND with -/// mask, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and -/// CF values are zero, otherwise return 0. -int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted -{ - return _mm_testnzc_si128(a, mask); -} - -/// Compute the bitwise NOT of a and then AND with b, and return 1 if the -/// result is zero, otherwise return 0. -/// In other words, test if all bits masked by `b` are 1 in `a`. 
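-/// Taking one byte as an illustration: a = 0xFF, b = 0x0F gives ~a & b == 0, so the
-/// result is 1; a = 0x0F, b = 0xFF leaves bits 4..7 of `b` unmatched in `a`, giving 0.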
-int _mm_testc_si128 (__m128i a, __m128i b) pure @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b); - } - else static if (LDC_with_ARM64) - { - // Acceptable since LDC 1.8 -02 - long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a); - return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); - } - else - { - __m128i c = ~a & b; - int[4] zero = [0, 0, 0, 0]; - return c.array == zero; - } -} -unittest -{ - __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8); - __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00); - __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00); - assert(_mm_testc_si128(A, A) == 1); - assert(_mm_testc_si128(A, M1) == 0); - assert(_mm_testc_si128(A, M2) == 1); -} - -/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`, -/// and set ZF to 1 if the result is zero, otherwise set ZF to 0. -/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the -/// result is zero, otherwise set CF to 0. -/// Return 1 if both the ZF and CF values are zero, otherwise return 0. -int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b); - } - else static if (LDC_with_ARM64) - { - long2 s640 = vandq_s64(cast(long2)b, cast(long2)a); - long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a); - - return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)) - | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) ); - } - else - { - __m128i c = a & b; - __m128i d = ~a & b; - int[4] zero = [0, 0, 0, 0]; - return !( (c.array == zero) || (d.array == zero)); - } -} -unittest -{ - __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8); - __m128i M = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00); - __m128i Z = _mm_setzero_si128(); - assert(_mm_testnzc_si128(A, Z) == 0); - assert(_mm_testnzc_si128(A, M) == 1); - assert(_mm_testnzc_si128(A, A) == 0); -} - -/// Compute the bitwise AND of 128 bits (representing integer data) in a and b, -/// and return 1 if the result is zero, otherwise return 0. -/// In other words, test if all bits masked by `b` are 0 in `a`. -int _mm_testz_si128 (__m128i a, __m128i b) @trusted -{ - // PERF DMD - static if (GDC_with_SSE41) - { - return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b); - } - else static if (LDC_with_SSE41) - { - return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b); - } - else static if (LDC_with_ARM64) - { - // Acceptable since LDC 1.8 -02 - long2 s64 = vandq_s64(cast(long2)a, cast(long2)b); - return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); - } - else - { - __m128i c = a & b; - int[4] zero = [0, 0, 0, 0]; - return c.array == zero; - } -} -unittest -{ - __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8); - __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07); - __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00); - assert(_mm_testz_si128(A, A) == 0); - assert(_mm_testz_si128(A, M1) == 1); - assert(_mm_testz_si128(A, M2) == 0); -} - +/** +* SSE4.1 intrinsics. +* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE4_1 +* +* Copyright: Guillaume Piolat 2021. +* Johan Engelen 2021. +* cet 2024. 
+* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) +*/ +module inteli.smmintrin; + +// SSE4.1 instructions +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=SSE4_1 +// Note: this header will work whether you have SSE4.1 enabled or not. +// With LDC, use "dflags-ldc": ["-mattr=+sse4.1"] or equivalent to actively +// generate SSE4.1 instructions. +// With GDC, use "dflags-gdc": ["-msse4.1"] or equivalent to generate SSE4.1 instructions. + +public import inteli.types; +import inteli.internals; + +// smmintrin pulls in all previous instruction set intrinsics. +public import inteli.tmmintrin; + +nothrow @nogc: + +enum int _MM_FROUND_TO_NEAREST_INT = 0x00; /// SSE4.1 rounding modes +enum int _MM_FROUND_TO_NEG_INF = 0x01; /// ditto +enum int _MM_FROUND_TO_POS_INF = 0x02; /// ditto +enum int _MM_FROUND_TO_ZERO = 0x03; /// ditto +enum int _MM_FROUND_CUR_DIRECTION = 0x04; /// ditto +enum int _MM_FROUND_RAISE_EXC = 0x00; /// ditto +enum int _MM_FROUND_NO_EXC = 0x08; /// ditto + +enum int _MM_FROUND_NINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT); +enum int _MM_FROUND_FLOOR = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF); +enum int _MM_FROUND_CEIL = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF); +enum int _MM_FROUND_TRUNC = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO); +enum int _MM_FROUND_RINT = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION); +enum int _MM_FROUND_NEARBYINT = (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION); + +/// Add packed signed 32-bit integers in `a` and `b` using saturation. +/// #BONUS +__m128i _mm_adds_epi32(__m128i a, __m128i b) pure +{ + // PERF: ARM64 should use 2x vqadd_s32 + static if (LDC_with_saturated_intrinsics) + return cast(__m128i)inteli_llvm_adds!int4(cast(int4)a, cast(int4)b); + else + { + __m128i int_max = _mm_set1_epi32(0x7FFFFFFF); + __m128i res = _mm_add_epi32(a, b); + __m128i sign_bit = _mm_srli_epi32(a, 31); + __m128i sign_xor = _mm_xor_si128(a, b); + __m128i overflow = _mm_andnot_si128(sign_xor, _mm_xor_si128(a, res)); + __m128i saturated = _mm_add_epi32(int_max, sign_bit); + return cast(__m128i) _mm_blendv_ps(cast(__m128)res, + cast(__m128)saturated, + cast(__m128)overflow); + } +} +unittest +{ + __m128i a = _mm_setr_epi32(int.max, 1, 2, int.min); + __m128i b = _mm_setr_epi32(1, 2, 3, -4); + assert(_mm_adds_epi32(a, b).array == [int.max, 3, 5, int.min]); +} + +/// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`, and store the results. +// Note: changed signature, GDC needs a compile-time value for imm8. +__m128i _mm_blend_epi16(int imm8)(__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + pragma(inline, true); // else wouldn't inline in _mm256_blend_epi16 + return cast(__m128i) __builtin_ia32_pblendw128(cast(short8)a, cast(short8)b, imm8); + } + else + { + // LDC x86 This generates pblendw since LDC 1.1 and -O2 + short8 r; + short8 sa = cast(short8)a; + short8 sb = cast(short8)b; + for (int n = 0; n < 8; ++n) + { + r.ptr[n] = (imm8 & (1 << n)) ? sb.array[n] : sa.array[n]; + } + return cast(__m128i)r; + } +} +unittest +{ + __m128i A = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + __m128i B = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); + short8 C = cast(short8) _mm_blend_epi16!147(A, B); // 10010011 + short[8] correct = [8, 9, 2, 3, 12, 5, 6, 15]; + assert(C.array == correct); +} + + +/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using control mask `imm8`. 
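+/// Bit n of `imm8` (n = 0 or 1) selects lane n from `b` when set, from `a` when clear;
+/// e.g. `_mm_blend_pd!2(a, b)` yields `[a[0], b[1]]`.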
+// Note: changed signature, GDC needs a compile-time value for `imm8`. +__m128d _mm_blend_pd(int imm8)(__m128d a, __m128d b) @trusted +{ + static assert(imm8 >= 0 && imm8 < 4); + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(double2) __builtin_ia32_blendpd(cast(double2)a, cast(double2)b, imm8); + } + else + { + // LDC x86: blendpd since LDC 1.1 -02, uses blendps after LDC 1.12 + double2 r; + for (int n = 0; n < 2; ++n) + { + r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n]; + } + return cast(__m128d)r; + } +} +unittest +{ + __m128d A = _mm_setr_pd(0, 1); + __m128d B = _mm_setr_pd(8, 9); + double2 C = _mm_blend_pd!2(A, B); + double[2] correct = [0, 9]; + assert(C.array == correct); +} + + +/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using control +/// mask `imm8`. +// Note: changed signature, GDC needs a compile-time value for imm8. +__m128 _mm_blend_ps(int imm8)(__m128 a, __m128 b) pure @trusted +{ + // PERF DMD + static assert(imm8 >= 0 && imm8 < 16); + static if (GDC_with_SSE41) + { + return __builtin_ia32_blendps(a, b, imm8); + } + else version(LDC) + { + // LDC x86: generates blendps since LDC 1.1 -O2 + // arm64: pretty good, two instructions worst case + return shufflevectorLDC!(float4, (imm8 & 1) ? 4 : 0, + (imm8 & 2) ? 5 : 1, + (imm8 & 4) ? 6 : 2, + (imm8 & 8) ? 7 : 3)(a, b); + } + else + { + // PERF GDC without SSE4.1 is quite bad + __m128 r; + for (int n = 0; n < 4; ++n) + { + r.ptr[n] = (imm8 & (1 << n)) ? b.array[n] : a.array[n]; + } + return r; + } +} +unittest +{ + __m128 A = _mm_setr_ps(0, 1, 2, 3); + __m128 B = _mm_setr_ps(8, 9, 10, 11); + float4 C = cast(float4) _mm_blend_ps!13(A, B); // 1101 + float[4] correct = [8, 1, 10, 11]; + assert(C.array == correct); +} + +/// Blend packed 8-bit integers from `a` and `b` using `mask`. +/// Select from `b` if the high-order bit of the corresponding 8-bit element in `mask` is set, else select from `a`. +__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask) pure @trusted +{ + // PERF DMD + /*static if (GDC_with_SSE41) + { + // This intrinsic do nothing in GDC 12. + // TODO report to GDC. No problem in GCC. + return cast(__m128i) __builtin_ia32_pblendvb128 (cast(ubyte16)a, cast(ubyte16)b, cast(ubyte16)mask); + } + else*/ + static if (LDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pblendvb(cast(byte16)a, cast(byte16)b, cast(byte16)mask); + } + else static if (LDC_with_ARM64) + { + // LDC arm64: two instructions since LDC 1.12 -O2 + byte16 maskSX = vshrq_n_s8(cast(byte16)mask, 7); + return cast(__m128i) vbslq_s8(maskSX, cast(byte16)b, cast(byte16)a); + } + else + { + __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask); + return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b); + } +} +unittest +{ + __m128i A = _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + __m128i B = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); + __m128i M = _mm_setr_epi8( 1, -1, 1, 1, -4, 1, -8, 127, + 1, 1, -1, -1, 4, 1, 8, -128); + byte16 R = cast(byte16) _mm_blendv_epi8(A, B, M); + byte[16] correct = [ 0, 17, 2, 3, 20, 5, 22, 7, + 8, 9, 26, 27, 12, 13, 14, 31 ]; + assert(R.array == correct); +} + + +/// Blend packed double-precision (64-bit) floating-point elements from `a` and `b` using `mask`. +__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask) @trusted +{ + // PERF DMD + static if (GDC_with_SSE42) + { + // PERF Amazingly enough, GCC/GDC generates the blendvpd instruction + // with -msse4.2 but not -msse4.1. 
+        // Not sure what the reason is, and there is a replacement sequence.
+        // Sounds like a bug.
+        return __builtin_ia32_blendvpd(a, b, mask);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_blendvpd(a, b, mask);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        long2 shift;
+        shift = 63;
+        long2 lmask = cast(long2)mask >> shift;
+        return cast(__m128d) vbslq_s64(lmask, cast(long2)b, cast(long2)a);
+    }
+    else
+    {
+        __m128d r; // PERF =void;
+        long2 lmask = cast(long2)mask;
+        for (int n = 0; n < 2; ++n)
+        {
+            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
+        }
+        return r;
+    }
+}
+unittest
+{
+    __m128d A = _mm_setr_pd(1.0, 2.0);
+    __m128d B = _mm_setr_pd(3.0, 4.0);
+    __m128d M1 = _mm_setr_pd(-3.0, 2.0);
+    __m128d R1 = _mm_blendv_pd(A, B, M1);
+    double[2] correct1 = [3.0, 2.0];
+    assert(R1.array == correct1);
+
+    // Note: wouldn't work with -double.nan, since in some AArch64 archs the NaN sign bit is lost
+    // See Issue #78
+    __m128d M2 = _mm_setr_pd(double.nan, double.infinity);
+    __m128d R2 = _mm_blendv_pd(A, B, M2);
+    double[2] correct2 = [1.0, 2.0];
+    assert(R2.array == correct2);
+}
+
+
+/// Blend packed single-precision (32-bit) floating-point elements from `a` and `b` using `mask`.
+__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return __builtin_ia32_blendvps(a, b, mask);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_blendvps(a, b, mask);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        int4 shift;
+        shift = 31;
+        int4 lmask = cast(int4)mask >> shift;
+        return cast(__m128) vbslq_s32(lmask, cast(int4)b, cast(int4)a);
+    }
+    else
+    {
+        // LDC x86_64: compiles to 5 instructions since LDC 1.27 -O2.
+        // When optimized, LLVM lowers this loop to essentially the classic
+        // sign-mask select sequence (shift of the mask sign bits, then
+        // and/andnot/or of the two inputs).
+        __m128 r;
+        int4 lmask = cast(int4)mask;
+        for (int n = 0; n < 4; ++n)
+        {
+            r.ptr[n] = (lmask.array[n] < 0) ? b.array[n] : a.array[n];
+        }
+        return r;
+    }
+}
+unittest
+{
+    __m128 A = _mm_setr_ps( 0.0f, 1.0f, 2.0f, 3.0f);
+    __m128 B = _mm_setr_ps( 4.0f, 5.0f, 6.0f, 7.0f);
+    __m128 M1 = _mm_setr_ps(-3.0f, 2.0f, 1.0f, -10000.0f);
+    __m128 M2 = _mm_setr_ps(float.nan, float.nan, -0.0f, +0.0f);
+    __m128 R1 = _mm_blendv_ps(A, B, M1);
+    __m128 R2 = _mm_blendv_ps(A, B, M2);
+    float[4] correct1 = [ 4.0f, 1.0f, 2.0f, 7.0f];
+    float[4] correct2 = [ 0.0f, 1.0f, 6.0f, 3.0f];
+    assert(R1.array == correct1);
+
+    // Note: wouldn't work with -float.nan, since in some AArch64 archs the NaN sign bit is lost
+    // See Issue #78
+    assert(R2.array == correct2);
+}
+
+/// Round the packed double-precision (64-bit) floating-point elements in `a` up to an integer value,
+/// and store the results as packed double-precision floating-point elements.
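+/// e.g. [1.3, -2.7] rounds up to [2.0, -2.0].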
+__m128d _mm_ceil_pd (__m128d a) @trusted +{ + static if (LDC_with_ARM64) + { + // LDC arm64 acceptable since 1.8 -O2 + // Unfortunately x86 intrinsics force a round-trip back to double2 + // ARM neon semantics wouldn't have that + long2 l = vcvtpq_s64_f64(a); + double2 r; + r.ptr[0] = l.array[0]; + r.ptr[1] = l.array[1]; + return r; + } + else + { + return _mm_round_pd!2(a); + } +} +unittest +{ + __m128d A = _mm_setr_pd(1.3f, -2.12f); + __m128d B = _mm_setr_pd(53.6f, -2.7f); + A = _mm_ceil_pd(A); + B = _mm_ceil_pd(B); + double[2] correctA = [2.0, -2.0]; + double[2] correctB = [54.0, -2.0]; + assert(A.array == correctA); + assert(B.array == correctB); +} + +/// Round the packed single-precision (32-bit) floating-point elements in `a` up to an integer value, +/// and store the results as packed single-precision floating-point elements. +__m128 _mm_ceil_ps (__m128 a) @trusted +{ + static if (LDC_with_ARM64) + { + // LDC arm64 acceptable since 1.8 -O1 + int4 l = vcvtpq_s32_f32(a); + float4 r; + r.ptr[0] = l.array[0]; + r.ptr[1] = l.array[1]; + r.ptr[2] = l.array[2]; + r.ptr[3] = l.array[3]; + return r; + } + else + { + return _mm_round_ps!2(a); + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f); + __m128 C = _mm_ceil_ps(A); + float[4] correct = [2.0f, -2.0f, 54.0f, -2.0f]; + assert(C.array == correct); +} + +/// Round the lower double-precision (64-bit) floating-point element in `b` up to an integer value, +/// store the result as a double-precision floating-point element in the lower element of result, +/// and copy the upper element from `a` to the upper element of dst. +__m128d _mm_ceil_sd (__m128d a, __m128d b) @trusted +{ + static if (LDC_with_ARM64) + { + a[0] = vcvtps_s64_f64(b[0]); + return a; + } + else + { + return _mm_round_sd!2(a, b); + } +} +unittest +{ + __m128d A = _mm_setr_pd(1.3, -2.12); + __m128d B = _mm_setr_pd(53.6, -3.7); + __m128d C = _mm_ceil_sd(A, B); + double[2] correct = [54.0, -2.12]; + assert(C.array == correct); +} + +/// Round the lower single-precision (32-bit) floating-point element in `b` up to an integer value, +/// store the result as a single-precision floating-point element in the lower element of result, +/// and copy the upper 3 packed elements from `a` to the upper elements of result. +__m128 _mm_ceil_ss (__m128 a, __m128 b) @trusted +{ + static if (LDC_with_ARM64) + { + a[0] = vcvtps_s32_f32(b[0]); + return a; + } + else + { + return _mm_round_ss!2(a, b); + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f); + __m128 B = _mm_setr_ps(53.6f, -3.7f, 8.0f, 7.0f); + __m128 C = _mm_ceil_ss(A, B); + float[4] correct = [54.0f, -2.12f, -4.5f, 1.1f]; + assert(C.array == correct); +} + +/// Compare packed 64-bit integers in `a` and `b` for equality. +__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b) @trusted +{ + static if (SIMD_COMPARISON_MASKS_16B) + { + version(DigitalMars) + { + // DMD doesn't recognize long2 == long2 + long2 la = cast(long2)a; + long2 lb = cast(long2)b; + long2 res; + res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0; + res.ptr[1] = (la.array[1] == lb.array[1]) ? 
-1 : 0;
+            return cast(__m128i)res;
+        }
+        else
+        {
+            return cast(__m128i)(cast(long2)a == cast(long2)b);
+        }
+    }
+    else static if (GDC_with_SSE41)
+    {
+        return cast(__m128i)__builtin_ia32_pcmpeqq(cast(long2)a, cast(long2)b);
+    }
+    else version(LDC)
+    {
+        // LDC x86: generates pcmpeqq since LDC 1.1 -O1
+        //    arm64: generates cmeq since LDC 1.8 -O1
+        return cast(__m128i) equalMask!long2(cast(long2)a, cast(long2)b);
+    }
+    else
+    {
+        // Clever pcmpeqd + pand use with LDC 1.24 -O2
+        long2 la = cast(long2)a;
+        long2 lb = cast(long2)b;
+        long2 res;
+        res.ptr[0] = (la.array[0] == lb.array[0]) ? -1 : 0;
+        res.ptr[1] = (la.array[1] == lb.array[1]) ? -1 : 0;
+        return cast(__m128i)res;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi64(-1, -2);
+    __m128i B = _mm_setr_epi64(-3, -2);
+    __m128i C = _mm_setr_epi64(-1, -4);
+    long2 AB = cast(long2) _mm_cmpeq_epi64(A, B);
+    long2 AC = cast(long2) _mm_cmpeq_epi64(A, C);
+    long[2] correct1 = [0, -1];
+    long[2] correct2 = [-1, 0];
+    assert(AB.array == correct1);
+    assert(AC.array == correct2);
+}
+
+
+/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers.
+__m128i _mm_cvtepi16_epi32 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i)__builtin_ia32_pmovsxwd128(cast(short8)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        // LDC x86: Generates pmovsxwd since LDC 1.1 -O0, also good in arm64
+        enum ir = `
+            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+            %r = sext <4 x i16> %v to <4 x i32>
+            ret <4 x i32> %r`;
+        return cast(__m128i) LDCInlineIR!(ir, int4, short8)(cast(short8)a);
+    }
+    else
+    {
+        short8 sa = cast(short8)a;
+        int4 r;
+        r.ptr[0] = sa.array[0];
+        r.ptr[1] = sa.array[1];
+        r.ptr[2] = sa.array[2];
+        r.ptr[3] = sa.array[3];
+        return r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
+    int4 C = cast(int4) _mm_cvtepi16_epi32(A);
+    int[4] correct = [-1, 0, -32768, 32767];
+    assert(C.array == correct);
+}
+
+/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers.
+__m128i _mm_cvtepi16_epi64 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i)__builtin_ia32_pmovsxwq128(cast(short8)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        // LDC x86: Generates pmovsxwq since LDC 1.1 -O0, also good in arm64
+        enum ir = `
+            %v = shufflevector <8 x i16> %0,<8 x i16> %0, <2 x i32> <i32 0, i32 1>
+            %r = sext <2 x i16> %v to <2 x i64>
+            ret <2 x i64> %r`;
+        return cast(__m128i) LDCInlineIR!(ir, long2, short8)(cast(short8)a);
+    }
+    else
+    {
+        short8 sa = cast(short8)a;
+        long2 r;
+        r.ptr[0] = sa.array[0];
+        r.ptr[1] = sa.array[1];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi16(-32768, 32767, 0, 0, 0, 0, 0, 0);
+    long2 C = cast(long2) _mm_cvtepi16_epi64(A);
+    long[2] correct = [-32768, 32767];
+    assert(C.array == correct);
+}
+
+/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers.
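+/// Only the low two 32-bit lanes of `a` are consumed; e.g. [-4, 42, 7, 8] widens to [-4, 42].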
+__m128i _mm_cvtepi32_epi64 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i)__builtin_ia32_pmovsxdq128(cast(int4)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        // LDC x86: Generates pmovsxdq since LDC 1.1 -O0, also good in arm64
+        enum ir = `
+            %v = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 1>
+            %r = sext <2 x i32> %v to <2 x i64>
+            ret <2 x i64> %r`;
+        return cast(__m128i) LDCInlineIR!(ir, long2, int4)(cast(int4)a);
+    }
+    else
+    {
+        int4 sa = cast(int4)a;
+        long2 r;
+        r.ptr[0] = sa.array[0];
+        r.ptr[1] = sa.array[1];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(-4, 42, 0, 0);
+    long2 C = cast(long2) _mm_cvtepi32_epi64(A);
+    long[2] correct = [-4, 42];
+    assert(C.array == correct);
+}
+
+
+/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers.
+__m128i _mm_cvtepi8_epi16 (__m128i a) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        alias ubyte16 = __vector(ubyte[16]);
+        return cast(__m128i)__builtin_ia32_pmovsxbw128(cast(ubyte16)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        // LDC x86: pmovsxbw generated since LDC 1.1.0 -O0
+        // LDC ARM64: sshll generated since LDC 1.8.0 -O1
+        enum ir = `
+            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+            %r = sext <8 x i8> %v to <8 x i16>
+            ret <8 x i16> %r`;
+        return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
+    }
+    else
+    {
+        byte16 sa = cast(byte16)a;
+        short8 r;
+        foreach(n; 0..8)
+            r.ptr[n] = sa.array[n];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
+    short8 C = cast(short8) _mm_cvtepi8_epi16(A);
+    short[8] correct = [127, -128, 1, -1, 0, 2, -4, -8];
+    assert(C.array == correct);
+}
+
+
+/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers.
+__m128i _mm_cvtepi8_epi32 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        alias ubyte16 = __vector(ubyte[16]);
+        return cast(__m128i)__builtin_ia32_pmovsxbd128(cast(ubyte16)a);
+    }
+    else static if (LDC_with_SSE41 && LDC_with_optimizations)
+    {
+        // LDC x86: Generates pmovsxbd since LDC 1.1 -O0
+        enum ir = `
+            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+            %r = sext <4 x i8> %v to <4 x i32>
+            ret <4 x i32> %r`;
+        return cast(__m128i) LDCInlineIR!(ir, int4, byte16)(cast(byte16)a);
+    }
+    else
+    {
+        // LDC ARM64: this gives the same codegen as a vmovl_s16/vmovl_s8 sequence would
+        byte16 sa = cast(byte16)a;
+        int4 r;
+        r.ptr[0] = sa.array[0];
+        r.ptr[1] = sa.array[1];
+        r.ptr[2] = sa.array[2];
+        r.ptr[3] = sa.array[3];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
+    int4 C = cast(int4) _mm_cvtepi8_epi32(A);
+    int[4] correct = [127, -128, 1, -1];
+    assert(C.array == correct);
+}
+
+
+/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
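+/// Only the two lowest bytes are actually consumed; e.g. bytes [127, -128, ...] widen to [127, -128].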
+__m128i _mm_cvtepi8_epi64 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        alias ubyte16 = __vector(ubyte[16]);
+        return cast(__m128i)__builtin_ia32_pmovsxbq128(cast(ubyte16)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        // LDC x86: Generates pmovsxbq since LDC 1.1 -O0,
+        // LDC arm64: it's ok since LDC 1.8 -O1
+        enum ir = `
+            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <2 x i32> <i32 0, i32 1>
+            %r = sext <2 x i8> %v to <2 x i64>
+            ret <2 x i64> %r`;
+        return cast(__m128i) LDCInlineIR!(ir, long2, byte16)(cast(byte16)a);
+    }
+    else
+    {
+        byte16 sa = cast(byte16)a;
+        long2 r;
+        foreach(n; 0..2)
+            r.ptr[n] = sa.array[n];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
+    long2 C = cast(long2) _mm_cvtepi8_epi64(A);
+    long[2] correct = [127, -128];
+    assert(C.array == correct);
+}
+
+
+/// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit integers.
+__m128i _mm_cvtepu16_epi32 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmovzxwd128(cast(short8)a);
+    }
+    else
+    {
+        // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
+        //    arm64: ushll since LDC 1.12 -O1
+        short8 sa = cast(short8)a;
+        int4 r;
+        r.ptr[0] = cast(ushort)sa.array[0];
+        r.ptr[1] = cast(ushort)sa.array[1];
+        r.ptr[2] = cast(ushort)sa.array[2];
+        r.ptr[3] = cast(ushort)sa.array[3];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
+    int4 C = cast(int4) _mm_cvtepu16_epi32(A);
+    int[4] correct = [65535, 0, 32768, 32767];
+    assert(C.array == correct);
+}
+
+
+/// Zero extend packed unsigned 16-bit integers in `a` to packed 64-bit integers.
+__m128i _mm_cvtepu16_epi64 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmovzxwq128(cast(short8)a);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        // LDC arm64: a bit shorter than below, in -O2
+        short8 sa = cast(short8)a;
+        long2 r;
+        for(int n = 0; n < 2; ++n)
+            r.ptr[n] = cast(ushort)sa.array[n];
+        return cast(__m128i)r;
+    }
+    else
+    {
+        // LDC x86: generates pmovzxwd since LDC 1.12 -O1 also good without SSE4.1
+        short8 sa = cast(short8)a;
+        long2 r;
+        r.ptr[0] = cast(ushort)sa.array[0];
+        r.ptr[1] = cast(ushort)sa.array[1];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi16(-1, 0, -32768, 32767, 0, 0, 0, 0);
+    long2 C = cast(long2) _mm_cvtepu16_epi64(A);
+    long[2] correct = [65535, 0];
+    assert(C.array == correct);
+}
+
+
+/// Zero extend packed unsigned 32-bit integers in `a` to packed 64-bit integers.
+__m128i _mm_cvtepu32_epi64 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmovzxdq128(cast(int4)a);
+    }
+    else
+    {
+        // LDC x86: generates pmovzxdq since LDC 1.12 -O1 also good without SSE4.1
+        //    arm64: generates ushll since LDC 1.12 -O1
+        int4 sa = cast(int4)a;
+        long2 r;
+        r.ptr[0] = cast(uint)sa.array[0];
+        r.ptr[1] = cast(uint)sa.array[1];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(-1, 42, 0, 0);
+    long2 C = cast(long2) _mm_cvtepu32_epi64(A);
+    long[2] correct = [4294967295, 42];
+    assert(C.array == correct);
+}
+
+
+/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers.
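+/// The extension is unsigned, so byte -1 (0xFF) widens to 255, never to -1.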
+__m128i _mm_cvtepu8_epi16 (__m128i a) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmovzxbw128(cast(ubyte16)a);
+    }
+    else static if (LDC_with_optimizations)
+    {
+        enum ir = `
+            %v = shufflevector <16 x i8> %0,<16 x i8> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+            %r = zext <8 x i8> %v to <8 x i16>
+            ret <8 x i16> %r`;
+        return cast(__m128i) LDCInlineIR!(ir, short8, byte16)(cast(byte16)a);
+    }
+    else
+    {
+        return _mm_unpacklo_epi8(a, _mm_setzero_si128());
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
+    short8 C = cast(short8) _mm_cvtepu8_epi16(A);
+    short[8] correct = [127, 128, 1, 255, 0, 2, 252, 248];
+    assert(C.array == correct);
+}
+
+
+/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers.
+__m128i _mm_cvtepu8_epi32 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        alias ubyte16 = __vector(ubyte[16]);
+        return cast(__m128i) __builtin_ia32_pmovzxbd128(cast(ubyte16)a);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        // LDC arm64: a bit better than below in -O2
+        byte16 sa = cast(byte16)a;
+        int4 r;
+        for(int n = 0; n < 4; ++n)
+            r.ptr[n] = cast(ubyte)sa.array[n];
+        return cast(__m128i)r;
+    }
+    else
+    {
+        // LDC x86: generates pmovzxbd since LDC 1.12 -O1 also good without SSE4.1
+        // PERF: catastrophic with GDC without SSE4.1
+        byte16 sa = cast(byte16)a;
+        int4 r;
+        r.ptr[0] = cast(ubyte)sa.array[0];
+        r.ptr[1] = cast(ubyte)sa.array[1];
+        r.ptr[2] = cast(ubyte)sa.array[2];
+        r.ptr[3] = cast(ubyte)sa.array[3];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(127, -128, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
+    int4 C = cast(int4) _mm_cvtepu8_epi32(A);
+    int[4] correct = [127, 128, 1, 255];
+    assert(C.array == correct);
+}
+
+/// Zero extend packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit integers.
+__m128i _mm_cvtepu8_epi64 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        alias ubyte16 = __vector(ubyte[16]);
+        return cast(__m128i)__builtin_ia32_pmovzxbq128(cast(ubyte16)a);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        // LDC arm64: this optimizes better than the loop below
+        byte16 sa = cast(byte16)a;
+        long2 r;
+        for (int n = 0; n < 2; ++n)
+            r.ptr[n] = cast(ubyte)sa.array[n];
+        return cast(__m128i)r;
+    }
+    else
+    {
+        // LDC x86: Generates pmovzxbq since LDC 1.1 -O0, a pshufb without SSE4.1
+        byte16 sa = cast(byte16)a;
+        long2 r;
+        r.ptr[0] = cast(ubyte)sa.array[0];
+        r.ptr[1] = cast(ubyte)sa.array[1];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(127, -2, 1, -1, 0, 2, -4, -8, 0, 0, 0, 0, 0, 0, 0, 0);
+    long2 C = cast(long2) _mm_cvtepu8_epi64(A);
+    long[2] correct = [127, 254];
+    assert(C.array == correct);
+}
+
+/// Conditionally multiply the packed double-precision (64-bit) floating-point elements
+/// in `a` and `b` using the high 4 bits in `imm8`, sum the two products, and conditionally
+/// store the sum in dst using the low 4 bits of `imm8`.
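+/// A scalar model of the semantics (illustrative sketch, not the implementation;
+/// `lane0`/`lane1` stand for the two result lanes):
+/// ---
+/// double sum = 0;
+/// if (imm8 & 0x10) sum += a.array[0] * b.array[0];
+/// if (imm8 & 0x20) sum += a.array[1] * b.array[1];
+/// double lane0 = (imm8 & 1) ? sum : 0.0;
+/// double lane1 = (imm8 & 2) ? sum : 0.0;
+/// ---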
+__m128d _mm_dp_pd(int imm8)(__m128d a, __m128d b) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return __builtin_ia32_dppd(a, b, imm8 & 0x33); + } + else static if (LDC_with_SSE41) + { + return __builtin_ia32_dppd(a, b, imm8 & 0x33); + } + else + { + __m128d zero = _mm_setzero_pd(); + __m128d temp = _mm_blend_pd!( (imm8 >>> 4) & 3)(zero, a * b); + double sum = temp.array[0] + temp.array[1]; + return _mm_blend_pd!(imm8 & 3)(zero, _mm_set1_pd(sum)); + } +} +unittest +{ + __m128d A = _mm_setr_pd(1.0, 2.0); + __m128d B = _mm_setr_pd(4.0, 8.0); + double2 R1 = _mm_dp_pd!(0x10 + 0x3 + 0x44)(A, B); + double2 R2 = _mm_dp_pd!(0x20 + 0x1 + 0x88)(A, B); + double2 R3 = _mm_dp_pd!(0x30 + 0x2 + 0x00)(A, B); + double[2] correct1 = [ 4.0, 4.0]; + double[2] correct2 = [16.0, 0.0]; + double[2] correct3 = [ 0.0, 20.0]; + assert(R1.array == correct1); + assert(R2.array == correct2); + assert(R3.array == correct3); +} + +/// Conditionally multiply the packed single-precision (32-bit) floating-point elements +/// in `a` and `b` using the high 4 bits in `imm8`, sum the four products, +/// and conditionally store the sum in result using the low 4 bits of `imm8`. +__m128 _mm_dp_ps(int imm8)(__m128 a, __m128 b) @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return __builtin_ia32_dpps(a, b, cast(ubyte)imm8); + } + else static if (LDC_with_SSE41) + { + return __builtin_ia32_dpps(a, b, cast(byte)imm8); + } + else + { + __m128 zero = _mm_setzero_ps(); + __m128 temp = _mm_blend_ps!( (imm8 >>> 4) & 15)(zero, a * b); + float sum = temp.array[0] + temp.array[1] + temp.array[2] + temp.array[3]; + return _mm_blend_ps!(imm8 & 15)(zero, _mm_set1_ps(sum)); + } +} +unittest +{ + __m128 A = _mm_setr_ps(1.0f, 2.0f, 4.0f, 8.0f); + __m128 B = _mm_setr_ps(9.0f, 7.0f, 5.0f, 3.0f); + float4 R1 = _mm_dp_ps!(0xf0 + 0xf)(A, B); + float4 R2 = _mm_dp_ps!(0x30 + 0x5)(A, B); + float4 R3 = _mm_dp_ps!(0x50 + 0xa)(A, B); + float[4] correct1 = [67.0f, 67.0f, 67.0f, 67.0f]; + float[4] correct2 = [23.0f, 0.0f, 23.0f, 0.0f]; + float[4] correct3 = [0.0f, 29.0f, 0.0f, 29.0f]; + assert(R1.array == correct1); + assert(R2.array == correct2); + assert(R3.array == correct3); +} + + +/// Extract a 32-bit integer from `a`, selected with `imm8`. +int _mm_extract_epi32 (__m128i a, const int imm8) pure @trusted +{ + return (cast(int4)a).array[imm8 & 3]; +} +unittest +{ + __m128i A = _mm_setr_epi32(1, 2, 3, 4); + assert(_mm_extract_epi32(A, 0) == 1); + assert(_mm_extract_epi32(A, 1 + 8) == 2); + assert(_mm_extract_epi32(A, 3 + 4) == 4); +} + +/// Extract a 64-bit integer from `a`, selected with `imm8`. +long _mm_extract_epi64 (__m128i a, const int imm8) pure @trusted +{ + long2 la = cast(long2)a; + return la.array[imm8 & 1]; +} +unittest +{ + __m128i A = _mm_setr_epi64(45, -67); + assert(_mm_extract_epi64(A, 0) == 45); + assert(_mm_extract_epi64(A, 1) == -67); + assert(_mm_extract_epi64(A, 2) == 45); +} + +/// Extract an 8-bit integer from `a`, selected with `imm8`. +/// Warning: the returned value is zero-extended to 32-bits. +int _mm_extract_epi8 (__m128i a, const int imm8) @trusted +{ + byte16 ba = cast(byte16)a; + return cast(ubyte) ba.array[imm8 & 15]; +} +unittest +{ + __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, 14, 15); + assert(_mm_extract_epi8(A, 7) == 7); + assert(_mm_extract_epi8(A, 13) == 255); + assert(_mm_extract_epi8(A, 7 + 16) == 7); +} + +/// Extract a single-precision (32-bit) floating-point element from `a`, selected with `imm8`. +/// Note: returns a 32-bit $(I integer). 
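+/// To recover the value as a float, reinterpret the bits rather than convert, e.g.
+/// `int r = _mm_extract_ps(a, 2); float f = *cast(float*)&r;` (a bit copy, not a
+/// numeric conversion).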
+int _mm_extract_ps (__m128 a, const int imm8) @trusted
+{
+    return (cast(int4)a).array[imm8 & 3];
+}
+unittest
+{
+    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, -4.0f);
+    assert(_mm_extract_ps(A, 0) == 0x3f800000);
+    assert(_mm_extract_ps(A, 1 + 8) == 0x40000000);
+    assert(_mm_extract_ps(A, 3 + 4) == cast(int)0xc0800000);
+}
+
+
+/// Round the packed double-precision (64-bit) floating-point elements in `a` down to an
+/// integer value, and store the results as packed double-precision floating-point elements.
+__m128d _mm_floor_pd (__m128d a) @trusted
+{
+    static if (LDC_with_ARM64)
+    {
+        // LDC arm64 acceptable since 1.8 -O2
+        long2 l = vcvtmq_s64_f64(a);
+        double2 r;
+        r.ptr[0] = l.array[0];
+        r.ptr[1] = l.array[1];
+        return r;
+    }
+    else
+    {
+        return _mm_round_pd!1(a);
+    }
+}
+unittest
+{
+    __m128d A = _mm_setr_pd(1.3, -2.12);
+    __m128d B = _mm_setr_pd(53.6, -2.7);
+    A = _mm_floor_pd(A);
+    B = _mm_floor_pd(B);
+    double[2] correctA = [1.0, -3.0];
+    double[2] correctB = [53.0, -3.0];
+    assert(A.array == correctA);
+    assert(B.array == correctB);
+}
+
+/// Round the packed single-precision (32-bit) floating-point elements in `a` down to an
+/// integer value, and store the results as packed single-precision floating-point elements.
+__m128 _mm_floor_ps (__m128 a) @trusted
+{
+    static if (LDC_with_ARM64)
+    {
+        // LDC arm64 acceptable since 1.8 -O1
+        int4 l = vcvtmq_s32_f32(a);
+        float4 r;
+        r.ptr[0] = l.array[0];
+        r.ptr[1] = l.array[1];
+        r.ptr[2] = l.array[2];
+        r.ptr[3] = l.array[3];
+        return r;
+    }
+    else
+    {
+        return _mm_round_ps!1(a);
+    }
+}
+unittest
+{
+    __m128 A = _mm_setr_ps(1.3f, -2.12f, 53.6f, -2.7f);
+    __m128 C = _mm_floor_ps(A);
+    float[4] correct = [1.0f, -3.0f, 53.0f, -3.0f];
+    assert(C.array == correct);
+}
+
+/// Round the lower double-precision (64-bit) floating-point element in `b` down to an
+/// integer value, store the result as a double-precision floating-point element in the
+/// lower element, and copy the upper element from `a` to the upper element.
+__m128d _mm_floor_sd (__m128d a, __m128d b) @trusted
+{
+    static if (LDC_with_ARM64)
+    {
+        a[0] = vcvtms_s64_f64(b[0]);
+        return a;
+    }
+    else
+    {
+        return _mm_round_sd!1(a, b);
+    }
+}
+unittest
+{
+    __m128d A = _mm_setr_pd(1.3, -2.12);
+    __m128d B = _mm_setr_pd(-53.1, -3.7);
+    __m128d C = _mm_floor_sd(A, B);
+    double[2] correct = [-54.0, -2.12];
+    assert(C.array == correct);
+}
+
+/// Round the lower single-precision (32-bit) floating-point element in `b` down to an
+/// integer value, store the result as a single-precision floating-point element in the
+/// lower element, and copy the upper 3 packed elements from `a` to the upper elements.
+__m128 _mm_floor_ss (__m128 a, __m128 b) @trusted
+{
+    static if (LDC_with_ARM64)
+    {
+        a[0] = vcvtms_s32_f32(b[0]);
+        return a;
+    }
+    else
+    {
+        return _mm_round_ss!1(a, b);
+    }
+}
+unittest
+{
+    __m128 A = _mm_setr_ps(1.3f, -2.12f, -4.5f, 1.1f);
+    __m128 B = _mm_setr_ps(-539.3f, -3.7f, 8.0f, 7.0f);
+    __m128 C = _mm_floor_ss(A, B);
+    float[4] correct = [-540.0f, -2.12f, -4.5f, 1.1f];
+    assert(C.array == correct);
+}
+
+/// Insert the 32-bit integer `i` into `a` at the location specified by `imm8[1:0]`.
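+///
+/// A minimal illustrative sketch (hand-written; the values follow from the
+/// semantics above):
+/// ---
+/// __m128i v = _mm_setr_epi32(10, 20, 30, 40);
+/// v = _mm_insert_epi32(v, 99, 1); // lane 1 becomes 99
+/// int[4] expected = [10, 99, 30, 40];
+/// assert((cast(int4)v).array == expected);
+/// ---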
+__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8) pure @trusted
+{
+    // GDC: nothing special to do, pinsrd generated with -O1 -msse4.1
+    // LDC x86: pinsrd since LDC 1.1 -O2 with -mattr=+sse4.1
+    // LDC arm64: ins.s since LDC 1.8 -O2
+    int4 ia = cast(int4)a;
+    ia.ptr[imm8 & 3] = i;
+    return cast(__m128i)ia;
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(1, 2, 3, 4);
+    int4 C = cast(int4) _mm_insert_epi32(A, 5, 2 + 4);
+    int[4] result = [1, 2, 5, 4];
+    assert(C.array == result);
+}
+
+/// Insert the 64-bit integer `i` into `a` at the location specified by `imm8[0]`.
+__m128i _mm_insert_epi64 (__m128i a, long i, const int imm8) pure @trusted
+{
+    // GDC: nothing special to do, pinsrq generated with -O1 -msse4.1
+    // LDC x86: always does something sensible.
+    long2 la = cast(long2)a;
+    la.ptr[imm8 & 1] = i;
+    return cast(__m128i)la;
+}
+unittest
+{
+    __m128i A = _mm_setr_epi64(1, 2);
+    long2 C = cast(long2) _mm_insert_epi64(A, 5, 1 + 2);
+    long[2] result = [1, 5];
+    assert(C.array == result);
+}
+
+/// Insert the lower 8-bit integer from `i` into `a` at the location specified by `imm8[3:0]`.
+__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8) @trusted
+{
+    // GDC: nothing special to do, pinsrb generated with -O1 -msse4.1
+    // LDC x86: doesn't generate pinsrb, maybe it's slower. ARM64 also spills to memory.
+    byte16 ba = cast(byte16)a;
+    ba.ptr[imm8 & 15] = cast(byte)i;
+    return cast(__m128i)ba;
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+    byte16 C = cast(byte16) _mm_insert_epi8(A, 30, 4 + 16);
+    byte[16] result = [0, 1, 2, 3, 30, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
+    assert(C.array == result);
+}
+
+
+/// Warning: despite the name, this does something quite different from `_mm_insert_epi32`!
+/// Copy `a` to `tmp`, then insert a single-precision (32-bit) floating-point element from `b`
+/// into `tmp` using the control in `imm8`. Store `tmp` to result using the mask in `imm8[3:0]`
+/// (elements are zeroed out when the corresponding bit is set).
+__m128 _mm_insert_ps(int imm8)(__m128 a, __m128 b) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return __builtin_ia32_insertps128(a, b, cast(ubyte)imm8);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_insertps128(a, b, cast(byte)imm8);
+    }
+    else
+    {
+        float4 tmp2 = a;
+        float tmp1 = b.array[(imm8 >> 6) & 3];
+        tmp2.ptr[(imm8 >> 4) & 3] = tmp1;
+        return _mm_blend_ps!(imm8 & 15)(tmp2, _mm_setzero_ps());
+    }
+}
+unittest
+{
+    __m128 A = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
+    __m128 B = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
+    __m128 C = _mm_insert_ps!(128 + (32 + 16) + 4)(A, B);
+    float[4] correct = [1.0f, 2.0f, 0.0f, 7.0f];
+    assert(C.array == correct);
+}
+
+
+/// Compare packed signed 32-bit integers in `a` and `b`, and return packed maximum values.
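+///
+/// The portable fallbacks below select lanes branchlessly with a comparison mask of
+/// all-ones or all-zeros; a scalar sketch of the same idea (illustrative only):
+/// ---
+/// int selectMax(int a, int b)
+/// {
+///     int greater = (a > b) ? -1 : 0;        // all-ones mask when a wins
+///     return (greater & a) | (~greater & b); // picks a or b without a branch
+/// }
+/// ---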
+__m128i _mm_max_epi32 (__m128i a, __m128i b) pure @trusted
+{
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmaxsd128(cast(int4)a, cast(int4)b);
+    }
+    else version(LDC)
+    {
+        // x86: pmaxsd since LDC 1.1 -O1
+        // ARM: smax.4s since LDC 1.8 -O1
+        int4 sa = cast(int4)a;
+        int4 sb = cast(int4)b;
+        static if (SIMD_COMPARISON_MASKS_16B)
+            int4 greater = sa > sb;
+        else
+            int4 greater = greaterMask!int4(sa, sb);
+        return cast(__m128i)( (greater & sa) | (~greater & sb) );
+    }
+    else
+    {
+        __m128i higher = _mm_cmpgt_epi32(a, b);
+        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
+        __m128i mask = _mm_and_si128(aTob, higher);
+        return _mm_xor_si128(b, mask);
+    }
+}
+unittest
+{
+    int4 R = cast(int4) _mm_max_epi32(_mm_setr_epi32(0x7fffffff,  1, -4,  7),
+                                      _mm_setr_epi32(        -4, -8,  9, -8));
+    int[4] correct = [0x7fffffff, 1, 9, 7];
+    assert(R.array == correct);
+}
+
+/// Compare packed signed 8-bit integers in `a` and `b`,
+/// and return packed maximum values.
+__m128i _mm_max_epi8 (__m128i a, __m128i b) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmaxsb128(cast(ubyte16)a, cast(ubyte16)b);
+    }
+    else version(LDC)
+    {
+        // x86: pmaxsb since LDC 1.1 -O1
+        // ARM64: smax.16b since LDC 1.8.0 -O1
+        byte16 sa = cast(byte16)a;
+        byte16 sb = cast(byte16)b;
+        static if (SIMD_COMPARISON_MASKS_16B)
+            byte16 greater = sa > sb;
+        else
+            byte16 greater = cast(byte16) greaterMask!byte16(sa, sb);
+        return cast(__m128i)( (greater & sa) | (~greater & sb) );
+    }
+    else
+    {
+        __m128i greater = _mm_cmpgt_epi8(a, b); // ones where a should be selected, b otherwise
+        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
+        __m128i mask = _mm_and_si128(aTob, greater);
+        return _mm_xor_si128(b, mask);
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(127,  1, -4, -8, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0);
+    __m128i B = _mm_setr_epi8(  4, -8,  9, -7, 0, -128, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0);
+    byte16 R = cast(byte16) _mm_max_epi8(A, B);
+    byte[16] correct =       [127,  1,  9, -7, 9,    7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0];
+    assert(R.array == correct);
+}
+
+/// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed maximum values.
+__m128i _mm_max_epu16 (__m128i a, __m128i b) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmaxuw128(cast(short8)a, cast(short8)b);
+    }
+    else version(LDC)
+    {
+        // x86: pmaxuw since LDC 1.1 -O1
+        // ARM64: umax.8h since LDC 1.8.0 -O1
+        // PERF: without sse4.1, LLVM 12 produces a very interesting
+        //     psubusw xmm0, xmm1
+        //     paddw   xmm0, xmm1
+        // sequence that could perhaps be used in the other min/max intrinsics too
+        ushort8 sa = cast(ushort8)a;
+        ushort8 sb = cast(ushort8)b;
+        static if (SIMD_COMPARISON_MASKS_16B)
+        {
+            // Note: doesn't work well with GDC, which prefers the builtin.
+            ushort8 greater = sa > sb;
+        }
+        else
+            ushort8 greater = cast(ushort8) greaterMask!ushort8(sa, sb);
+        return cast(__m128i)( (greater & sa) | (~greater & sb) );
+    }
+    else
+    {
+        b = _mm_subs_epu16(b, a);
+        b = _mm_add_epi16(b, a);
+        return b;
+    }
+}
+unittest
+{
+    short8 R = cast(short8) _mm_max_epu16(_mm_setr_epi16(32767,  1, -4, -8, 9,      7, 0, 57),
+                                          _mm_setr_epi16(   -4, -8,  9, -7, 0, -32768, 0,  0));
+    short[8] correct =                                  [   -4, -8, -4, -7, 9, -32768, 0, 57];
+    assert(R.array == correct);
+}
+
+/// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed maximum values.
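+///
+/// Without SSE4.1 there is no unsigned 32-bit vector compare, so the fallback below
+/// flips the sign bit of both operands and uses the signed compare; a scalar sketch
+/// of that mapping (illustrative only):
+/// ---
+/// bool unsignedGreater(uint a, uint b)
+/// {
+///     // x => x + 0x8000_0000 maps unsigned order onto signed order
+///     return cast(int)(a + 0x8000_0000) > cast(int)(b + 0x8000_0000);
+/// }
+/// ---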
+__m128i _mm_max_epu32 (__m128i a, __m128i b) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmaxud128(cast(int4)a, cast(int4)b);
+    }
+    else version(LDC)
+    {
+        // x86: pmaxud since LDC 1.1 -O1, also good without sse4.1
+        // ARM64: umax.4s since LDC 1.8.0 -O1
+        uint4 sa = cast(uint4)a;
+        uint4 sb = cast(uint4)b;
+        static if (SIMD_COMPARISON_MASKS_16B)
+            uint4 greater = sa > sb;
+        else
+            uint4 greater = cast(uint4) greaterMask!uint4(sa, sb);
+        return cast(__m128i)( (greater & sa) | (~greater & sb) );
+    }
+    else
+    {
+        // PERF: LLVM suggests replacing the _mm_add_epi32 with _mm_xor_si128, and the last xor with _mm_or_si128
+        /+
+            movdqa xmm2, xmmword ptr [-0x80000000, -0x80000000, -0x80000000, -0x80000000]
+            movdqa xmm3, xmm1
+            pxor xmm3, xmm2
+            pxor xmm2, xmm0
+            pcmpgtd xmm2, xmm3
+            pand xmm0, xmm2
+            pandn xmm2, xmm1
+            por xmm0, xmm2
+        +/
+        __m128i valueShift = _mm_set1_epi32(-0x80000000);
+        __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(a, valueShift), _mm_add_epi32(b, valueShift));
+        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
+        __m128i mask = _mm_and_si128(aTob, higher);
+        return _mm_xor_si128(b, mask);
+    }
+}
+unittest
+{
+    int4 R = cast(int4) _mm_max_epu32(_mm_setr_epi32(0x7fffffff,  1, 4, -7),
+                                      _mm_setr_epi32(        -4, -8, 9, -8));
+    int[4] correct = [-4, -8, 9, -7];
+    assert(R.array == correct);
+}
+
+/// Compare packed signed 32-bit integers in `a` and `b`, and return packed minimum values.
+__m128i _mm_min_epi32 (__m128i a, __m128i b) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pminsd128(cast(int4)a, cast(int4)b);
+    }
+    else version(LDC)
+    {
+        // x86: pminsd since LDC 1.1 -O1, also good without sse4.1
+        // ARM: smin.4s since LDC 1.8 -O1
+        int4 sa = cast(int4)a;
+        int4 sb = cast(int4)b;
+        static if (SIMD_COMPARISON_MASKS_16B)
+            int4 greater = sa > sb;
+        else
+            int4 greater = greaterMask!int4(sa, sb);
+        return cast(__m128i)( (~greater & sa) | (greater & sb) );
+    }
+    else
+    {
+        __m128i lower = _mm_cmplt_epi32(a, b); // ones where a should be selected, b otherwise
+        __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b
+        __m128i mask = _mm_and_si128(aTob, lower);
+        return _mm_xor_si128(b, mask);
+    }
+}
+unittest
+{
+    int4 R = cast(int4) _mm_min_epi32(_mm_setr_epi32(0x7fffffff,  1, -4,  7),
+                                      _mm_setr_epi32(        -4, -8,  9, -8));
+    int[4] correct = [-4, -8, -4, -8];
+    assert(R.array == correct);
+}
+
+/// Compare packed signed 8-bit integers in `a` and `b`,
+/// and return packed minimum values.
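+///
+/// A minimal illustrative sketch (hand-written; expected values follow from the
+/// semantics above):
+/// ---
+/// __m128i a = _mm_setr_epi8(5, -3, 0,  9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+/// __m128i b = _mm_setr_epi8(4, -2, 1, -9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+/// byte16 m = cast(byte16) _mm_min_epi8(a, b);
+/// assert(m.array[0] == 4 && m.array[3] == -9);
+/// ---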
+__m128i _mm_min_epi8 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pminsb128(cast(ubyte16)a, cast(ubyte16)b); + } + else version(LDC) + { + // x86: pminsb since LDC 1.1 -O1 + // ARM64: smin.16b since LDC 1.8.0 -O1 + byte16 sa = cast(byte16)a; + byte16 sb = cast(byte16)b; + static if (SIMD_COMPARISON_MASKS_16B) + byte16 greater = sa > sb; + else + byte16 greater = cast(byte16) greaterMask!byte16(sa, sb); + return cast(__m128i)( (~greater & sa) | (greater & sb) ); + } + else + { + __m128i lower = _mm_cmplt_epi8(a, b); // ones where a should be selected, b else + __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b + __m128i mask = _mm_and_si128(aTob, lower); + return _mm_xor_si128(b, mask); + } +} +unittest +{ + __m128i A = _mm_setr_epi8(127, 1, -4, -8, 9, 7, 0, 57, 0, 0, 0, 0, 0, 0, 0, 0); + __m128i B = _mm_setr_epi8( 4, -8, 9, -7, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + byte16 R = cast(byte16) _mm_min_epi8(A, B); + byte[16] correct = [ 4, -8, -4, -8, 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + assert(R.array == correct); +} + +/// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst. +__m128i _mm_min_epu16 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pminuw128(cast(short8)a, cast(short8)b); + } + else version(LDC) + { + // x86: pminuw since LDC 1.1 -O1, psubusw+psubw sequence without sse4.1 + // ARM64: umin.8h since LDC 1.8.0 -O1 + ushort8 sa = cast(ushort8)a; + ushort8 sb = cast(ushort8)b; + static if (SIMD_COMPARISON_MASKS_16B) + ushort8 greater = (sb > sa); + else + ushort8 greater = cast(ushort8) greaterMask!ushort8(sb, sa); + return cast(__m128i)( (greater & sa) | (~greater & sb) ); + } + else + { + __m128i c = _mm_subs_epu16(b, a); + b = _mm_sub_epi16(b, c); + return b; + } +} +unittest +{ + short8 R = cast(short8) _mm_min_epu16(_mm_setr_epi16(32767, 1, -4, -8, 9, 7, 0, 57), + _mm_setr_epi16( -4, -8, 9, -7, 0,-32768, 0, 0)); + short[8] correct = [32767, 1, 9, -8, 0, 7, 0, 0]; + assert(R.array == correct); +} + +/// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst. +__m128i _mm_min_epu32 (__m128i a, __m128i b) pure @trusted +{ + // PERF DMD + static if (GDC_with_SSE41) + { + return cast(__m128i) __builtin_ia32_pminud128(cast(int4)a, cast(int4)b); + } + else version(LDC) + { + // x86: pminud since LDC 1.1 -O1, also good without sse4.1 + // ARM64: umin.4s since LDC 1.8.0 -O1 + uint4 sa = cast(uint4)a; + uint4 sb = cast(uint4)b; + static if (SIMD_COMPARISON_MASKS_16B) + uint4 greater = sa > sb; + else + uint4 greater = cast(uint4) greaterMask!uint4(sa, sb); + return cast(__m128i)( (~greater & sa) | (greater & sb) ); + } + else + { + // PERF: same remark as in _mm_max_epu32 + __m128i valueShift = _mm_set1_epi32(-0x80000000); + __m128i higher = _mm_cmpgt_epi32(_mm_add_epi32(b, valueShift), _mm_add_epi32(a, valueShift)); + __m128i aTob = _mm_xor_si128(a, b); // a ^ (a ^ b) == b + __m128i mask = _mm_and_si128(aTob, higher); + return _mm_xor_si128(b, mask); + } +} +unittest +{ + int4 R = cast(int4) _mm_min_epu32(_mm_setr_epi32(0x7fffffff, 1, 4, -7), + _mm_setr_epi32( -4,-8, 9, -8)); + int[4] correct = [0x7fffffff, 1, 4, -8]; + assert(R.array == correct); +} + +/// Horizontally compute the minimum amongst the packed unsigned 16-bit integers in `a`, +/// store the minimum and index in return value, and zero the remaining bits. 
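+///
+/// A minimal illustrative sketch (hand-written): ties are broken in favor of the
+/// lowest index:
+/// ---
+/// __m128i v = _mm_setr_epi16(9, 4, 7, 4, 8, 8, 8, 8);
+/// short8 r = cast(short8) _mm_minpos_epu16(v);
+/// assert(r.array[0] == 4); // the minimum value
+/// assert(r.array[1] == 1); // its lowest index
+/// ---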
+__m128i _mm_minpos_epu16 (__m128i a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_phminposuw128(cast(short8)a);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        __m128i indices = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+        __m128i combinedLo = _mm_unpacklo_epi16(indices, a);
+        __m128i combinedHi = _mm_unpackhi_epi16(indices, a);
+        __m128i best = _mm_min_epu32(combinedLo, combinedHi);
+        best = _mm_min_epu32(best, _mm_srli_si128!8(best));
+        best = _mm_min_epu32(best, _mm_srli_si128!4(best));
+        short8 sbest = cast(short8)best;
+        short8 r = 0;
+        // Each 32-bit lane of best is (value << 16) | index, so the unsigned minimum
+        // naturally prioritizes the lower index in case of a tie; the two halves are
+        // swapped here to produce the [min, index, 0, ...] result layout.
+        r.ptr[0] = sbest.array[1];
+        r.ptr[1] = sbest.array[0];
+        return cast(__m128i)r;
+    }
+    else
+    {
+        short8 sa = cast(short8)a;
+        ushort min = 0xffff;
+        int index = 0;
+        for(int n = 0; n < 8; ++n)
+        {
+            ushort c = sa.array[n];
+            if (c < min)
+            {
+                min = c;
+                index = n;
+            }
+        }
+        short8 r;
+        r.ptr[0] = min;
+        r.ptr[1] = cast(short)index;
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi16(14, 15, 1, 2, -3, 4, 5, 6);
+    __m128i B = _mm_setr_epi16(14,  4, 4, 2, -3, 2, 5, 6);
+    short8 R1 = cast(short8) _mm_minpos_epu16(A);
+    short8 R2 = cast(short8) _mm_minpos_epu16(B);
+    short[8] correct1 = [1, 2, 0, 0, 0, 0, 0, 0];
+    short[8] correct2 = [2, 3, 0, 0, 0, 0, 0, 0];
+    assert(R1.array == correct1);
+    assert(R2.array == correct2);
+}
+
+/// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers
+/// in `a` compared to those in `b`, and store the 16-bit results in dst.
+/// Eight SADs are performed using one quadruplet from `b` and eight quadruplets from `a`.
+/// One quadruplet is selected from `b` starting at the offset specified in `imm8[1:0]`.
+/// Eight quadruplets are formed from sequential 8-bit integers selected from `a` starting
+/// at the offset specified in `imm8[2]`.
+__m128i _mm_mpsadbw_epu8(int imm8)(__m128i a, __m128i b) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(ubyte16)a, cast(ubyte16)b, cast(ubyte)imm8);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_mpsadbw128(cast(byte16)a, cast(byte16)b, cast(byte)imm8);
+    }
+    else
+    {
+        int a_offset = ((imm8 & 4) >> 2) * 4; // Yes, the two high-order quadruplets are unaddressable...
+        byte16 ba = cast(byte16)a;
+        short8 r;
+
+        // The quadruplet selected from b (imm8[1:0]), broadcast to both 64-bit SAD lanes.
+        __m128i comp_b = _mm_setr_epi32(b.array[imm8 & 3], 0, b.array[imm8 & 3], 0);
+
+        for (int j = 0; j < 8; j += 2)
+        {
+            int k = a_offset + j;
+            __m128i comp_a = _mm_setr_epi8(ba[k+0], ba[k+1], ba[k+2], ba[k+3],
+                                           0, 0, 0, 0,
+                                           ba[k+1], ba[k+2], ba[k+3], ba[k+4],
+                                           0, 0, 0, 0);
+            short8 diffs = cast(short8) _mm_sad_epu8(comp_a, comp_b); // reusing _mm_sad_epu8 saves instructions on both x86 and arm64
+            r.ptr[j]   = diffs.array[0];
+            r.ptr[j+1] = diffs.array[4];
+        }
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+    __m128i B = _mm_setr_epi8(9, 1, 2, 3, -1, -1, 0, -1, 5, 5, 5, 5, 12, 13, 14, 15);
+    short[8] correct0 = [9, 11, 13, 15, 17, 19, 21, 23];
+    short[8] correct1 = [763, 761, 759, 757, 755, 753, 751, 749];
+    short[8] correct4 = [17, 19, 21, 23, 25, 27, 31, 35];
+    short[8] correct5 = [755, 753, 751, 749, 747, 745, 743, 741];
+    short[8] correct7 = [32, 28, 24, 20, 16, 12, 8, 4];
+    short8 r1 = cast(short8) _mm_mpsadbw_epu8!1(A, B);
+    short8 r4 = cast(short8) _mm_mpsadbw_epu8!4(A, B);
+    short8 r5 = cast(short8) _mm_mpsadbw_epu8!5(A, B);
+    short8 r7 = cast(short8) _mm_mpsadbw_epu8!7(A, B);
+    short8 r8 = cast(short8) _mm_mpsadbw_epu8!8(A, B);
+    assert(r1.array == correct1);
+    assert(r4.array == correct4);
+    assert(r5.array == correct5);
+    assert(r7.array == correct7);
+    assert(r8.array == correct0);
+}
+
+/// Multiply the low signed 32-bit integers from each packed 64-bit element in `a` and `b`,
+/// and store the signed 64-bit results in dst.
+__m128i _mm_mul_epi32 (__m128i a, __m128i b) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_pmuldq128(cast(int4)a, cast(int4)b);
+    }
+    else static if (LDC_with_SSE41 && LDC_with_optimizations)
+    {
+        // For some reason, clang has the builtin but it's not in IntrinsicsX86.td
+        // Use IR instead.
+        // This generates pmuldq since LDC 1.2.0 -O0
+        enum ir = `
+            %ia = shufflevector <4 x i32> %0,<4 x i32> %0, <2 x i32> <i32 0, i32 2>
+            %ib = shufflevector <4 x i32> %1,<4 x i32> %1, <2 x i32> <i32 0, i32 2>
+            %la = sext <2 x i32> %ia to <2 x i64>
+            %lb = sext <2 x i32> %ib to <2 x i64>
+            %r = mul <2 x i64> %la, %lb
+            ret <2 x i64> %r`;
+        return cast(__m128i) LDCInlineIR!(ir, long2, int4, int4)(cast(int4)a, cast(int4)b);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        // 3 instructions since LDC 1.8 -O2
+        // But vmull_s32 had to be made a builtin, otherwise it wouldn't optimize to smull
+        int2 a_lo = vmovn_s64(cast(long2)a);
+        int2 b_lo = vmovn_s64(cast(long2)b);
+        return cast(__m128i) vmull_s32(a_lo, b_lo);
+    }
+    else
+    {
+        int4 ia = cast(int4)a;
+        int4 ib = cast(int4)b;
+        long2 r;
+        r.ptr[0] = cast(long)ia.array[0] * ib.array[0];
+        r.ptr[1] = cast(long)ia.array[2] * ib.array[2];
+        return cast(__m128i)r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
+    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
+    long2 R = cast(long2) _mm_mul_epi32(A, B);
+    long[2] correct = [cast(long)61616461 * 49716422, cast(long)4564061 * -121144];
+    assert(R.array == correct);
+}
+
+/// Multiply the packed 32-bit integers in `a` and `b`, producing intermediate 64-bit integers,
+/// and return the low 32 bits of the intermediate integers.
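+///
+/// A minimal illustrative sketch (hand-written) of the low-32-bit truncation:
+/// ---
+/// __m128i a = _mm_set1_epi32(65536);          // 2^16 in each lane
+/// int4 r = cast(int4) _mm_mullo_epi32(a, a);  // 2^32 truncates to 0 in each lane
+/// assert(r.array[0] == 0);
+/// ---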
+__m128i _mm_mullo_epi32 (__m128i a, __m128i b) pure @trusted
+{
+    // PERF DMD
+    // PERF GDC without SSE4.1 could be better
+    static if (GDC_with_SSE41)
+    {
+        int4 ia = cast(int4)a;
+        int4 ib = cast(int4)b;
+        // Note: older GDC doesn't have that op, but older GDC
+        // also has no support for -msse4.1 detection
+        return cast(__m128i)(ia * ib);
+    }
+    else version(LDC)
+    {
+        int4 ia = cast(int4)a;
+        int4 ib = cast(int4)b;
+        return cast(__m128i)(ia * ib);
+    }
+    else
+    {
+        // DMD doesn't accept the above
+        int4 ia = cast(int4)a;
+        int4 ib = cast(int4)b;
+        int4 r;
+        r.ptr[0] = ia.array[0] * ib.array[0];
+        r.ptr[1] = ia.array[1] * ib.array[1];
+        r.ptr[2] = ia.array[2] * ib.array[2];
+        r.ptr[3] = ia.array[3] * ib.array[3];
+        return r;
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(61616461, 1915324654, 4564061, 3);
+    __m128i B = _mm_setr_epi32(49716422, -915616216, -121144, 0);
+    int4 R = cast(int4) _mm_mullo_epi32(A, B);
+    int[4] correct = [cast(int)0xBF370D8E, cast(int)(1915324654 * -915616216), cast(int)(4564061 * -121144), 0];
+    assert(R.array == correct);
+}
+
+
+/// Convert packed signed 32-bit integers from `a` and `b`
+/// to packed 16-bit integers using unsigned saturation.
+__m128i _mm_packus_epi32 (__m128i a, __m128i b) pure @trusted
+{
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_packusdw128(cast(short8)a, cast(short8)b);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        int4 z = 0;
+        return cast(__m128i) vcombine_u16(vqmovn_u32(vmaxq_s32(z, cast(int4)a)),
+                                          vqmovn_u32(vmaxq_s32(z, cast(int4)b)));
+    }
+    else
+    {
+        // Shift into signed range, use signed saturation, then shift back.
+        __m128i i32768 = _mm_set1_epi32(32768);
+        __m128i s32768 = _mm_set1_epi16(-32768);
+        a = _mm_sub_epi32(a, i32768);
+        b = _mm_sub_epi32(b, i32768);
+        __m128i clampedSigned = _mm_packs_epi32(a, b);
+        return _mm_add_epi16(clampedSigned, s32768);
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(100000, -100000, 1000, 0);
+    short8 R = cast(short8) _mm_packus_epi32(A, A);
+    short[8] correct = [cast(short)65535, 0, 1000, 0, cast(short)65535, 0, 1000, 0];
+    assert(R.array == correct);
+}
+
+
+/// Round the packed double-precision (64-bit) floating-point elements in `a` using the
+/// rounding parameter, and store the results as packed double-precision floating-point elements.
+/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+__m128d _mm_round_pd(int rounding)(__m128d a) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return __builtin_ia32_roundpd(a, rounding);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_roundpd(a, rounding);
+    }
+    else
+    {
+        static if (rounding & _MM_FROUND_CUR_DIRECTION)
+        {
+            // Convert to 64-bit integers
+            long lo = _mm_cvtsd_si64(a);
+            a.ptr[0] = a.array[1];
+            long hi = _mm_cvtsd_si64(a);
+            return _mm_setr_pd(lo, hi);
+        }
+        else
+        {
+            version(GNU) pragma(inline, false); // otherwise unittests fail with optimizations
+
+            uint old = _MM_GET_ROUNDING_MODE();
+            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
+
+            // Convert to 64-bit integers
+            long lo = _mm_cvtsd_si64(a);
+            a.ptr[0] = a.array[1];
+            long hi = _mm_cvtsd_si64(a);
+
+            // Convert back to double to achieve the rounding
+            // The problem is that a 64-bit double can't represent all the values
+            // a 64-bit integer can (and vice-versa). So this function won't work for
+            // large values. (MAYDO: what range exactly?)
+            _MM_SET_ROUNDING_MODE(old);
+            return _mm_setr_pd(lo, hi);
+        }
+    }
+}
+unittest
+{
+    // tested in other intrinsics
+}
+
+/// Round the packed single-precision (32-bit) floating-point elements in `a` using the
+/// rounding parameter, and store the results as packed single-precision floating-point elements.
+/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+__m128 _mm_round_ps(int rounding)(__m128 a) @trusted
+{
+    // PERF ARM64: there is duplication because this isn't optimal for ARM64, so it is avoided externally
+    static if (GDC_or_LDC_with_SSE41)
+    {
+        return __builtin_ia32_roundps(a, rounding);
+    }
+    else
+    {
+        static if (rounding & _MM_FROUND_CUR_DIRECTION)
+        {
+            __m128i integers = _mm_cvtps_epi32(a);
+            return _mm_cvtepi32_ps(integers);
+        }
+        else
+        {
+            version(LDC) pragma(inline, false); // otherwise _MM_SET_ROUNDING_MODE and _mm_cvtps_epi32 get reordered
+            uint old = _MM_GET_ROUNDING_MODE();
+            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
+            scope(exit) _MM_SET_ROUNDING_MODE(old);
+
+            // Convert to 32-bit integers
+            __m128i integers = _mm_cvtps_epi32(a);
+
+            // Convert back to float to achieve the rounding
+            // The problem is that a 32-bit float can't represent all the values
+            // a 32-bit integer can (and vice-versa). So this function won't work for
+            // large values. (MAYDO: what range exactly?)
+            __m128 result = _mm_cvtepi32_ps(integers);
+
+            return result;
+        }
+    }
+}
+unittest
+{
+    // tested in other intrinsics
+}
+
+
+/// Round the lower double-precision (64-bit) floating-point element in `b` using the
+/// rounding parameter, store the result as a double-precision floating-point element
+/// in the lower element of result, and copy the upper element from `a` to the upper element of result.
+/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+__m128d _mm_round_sd(int rounding)(__m128d a, __m128d b) @trusted
+{
+    static if (GDC_with_SSE41)
+    {
+        return __builtin_ia32_roundsd(a, b, rounding);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_roundsd(a, b, rounding);
+    }
+    else
+    {
+        static if (rounding & _MM_FROUND_CUR_DIRECTION)
+        {
+            // Convert to 64-bit integer
+            long b0 = _mm_cvtsd_si64(b);
+            a.ptr[0] = b0;
+            return a;
+        }
+        else
+        {
+            version(GNU) pragma(inline, false); // otherwise unittests fail with optimizations
+
+            uint old = _MM_GET_ROUNDING_MODE();
+            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
+
+            // Convert to 64-bit integer
+            long b0 = _mm_cvtsd_si64(b);
+            a.ptr[0] = b0;
+
+            // Convert back to double to achieve the rounding
+            // The problem is that a 64-bit double can't represent all the values
+            // a 64-bit integer can (and vice-versa). So this function won't work for
+            // large values. (MAYDO: what range exactly?)
+            _MM_SET_ROUNDING_MODE(old);
+            return a;
+        }
+    }
+}
+unittest
+{
+    // tested in other intrinsics
+}
+
+
+/// Round the lower single-precision (32-bit) floating-point element in `b` using the
+/// rounding parameter, store the result as a single-precision floating-point element
+/// in the lower element of result, and copy the upper 3 packed elements from `a`
+/// to the upper elements of result.
+/// Rounding is done according to the rounding[3:0] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION                       // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+__m128 _mm_round_ss(int rounding)(__m128 a, __m128 b) @trusted
+{
+    static if (GDC_with_SSE41)
+    {
+        return __builtin_ia32_roundss(a, b, rounding);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_roundss(a, b, rounding);
+    }
+    else
+    {
+        static if (rounding & _MM_FROUND_CUR_DIRECTION)
+        {
+            int b0 = _mm_cvtss_si32(b);
+            a.ptr[0] = b0;
+            return a;
+        }
+        else version(GNU)
+        {
+            pragma(inline, false)
+            __m128 GDCworkaround() nothrow @nogc @trusted
+            {
+                uint old = _MM_GET_ROUNDING_MODE();
+                _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
+
+                // Convert to 32-bit integer
+                int b0 = _mm_cvtss_si32(b);
+                a.ptr[0] = b0;
+
+                // Convert back to float to achieve the rounding
+                // The problem is that a 32-bit float can't represent all the values
+                // a 32-bit integer can (and vice-versa). So this function won't work for
+                // large values. (MAYDO: what range exactly?)
+                _MM_SET_ROUNDING_MODE(old);
+                return a;
+            }
+            return GDCworkaround();
+        }
+        else
+        {
+            uint old = _MM_GET_ROUNDING_MODE();
+            _MM_SET_ROUNDING_MODE((rounding & 3) << 13);
+
+            // Convert to 32-bit integer
+            int b0 = _mm_cvtss_si32(b);
+            a.ptr[0] = b0;
+
+            // Convert back to float to achieve the rounding
+            // The problem is that a 32-bit float can't represent all the values
+            // a 32-bit integer can (and vice-versa). So this function won't work for
+            // large values. (MAYDO: what range exactly?)
+            _MM_SET_ROUNDING_MODE(old);
+            return a;
+        }
+    }
+}
+unittest
+{
+    // tested in other intrinsics
+}
+
+
+/// Load 128 bits of integer data from memory using a non-temporal memory hint.
+/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+__m128i _mm_stream_load_si128 (void* mem_addr) pure @trusted
+{
+    // PERF DMD D_SIMD
+    static if (GDC_with_SSE41)
+    {
+        return cast(__m128i) __builtin_ia32_movntdqa(cast(long2*)mem_addr);
+    }
+    else static if (LDC_with_InlineIREx && LDC_with_optimizations)
+    {
+        enum prefix = `!0 = !{ i32 1 }`;
+        enum ir = `
+            %r = load <4 x i32>, <4 x i32>* %0, !nontemporal !0
+            ret <4 x i32> %r`;
+        return cast(__m128i) LDCInlineIREx!(prefix, ir, "", int4, int4*)(cast(__m128i*)mem_addr);
+    }
+    else
+    {
+        return *cast(__m128i*)mem_addr; // regular move instead
+    }
+}
+unittest
+{
+    align(16) static immutable int[4] correct = [1, 2, 3, 4];
+    __m128i A = _mm_stream_load_si128(cast(__m128i*)(correct.ptr));
+    _mm_mfence();
+    assert(A.array == correct);
+}
+
+/// Return 1 if all bits in `a` are all 1's. Else return 0.
+int _mm_test_all_ones (__m128i a) @safe
+{
+    return _mm_testc_si128(a, _mm_set1_epi32(-1));
+}
+unittest
+{
+    __m128i A = _mm_set1_epi32(-1);
+    __m128i B = _mm_set_epi32(-1, -2, -1, -1);
+    assert(_mm_test_all_ones(A) == 1);
+    assert(_mm_test_all_ones(B) == 0);
+}
+
+/// Return 1 if all bits in `a` are all 0's. Else return 0.
+// This is a #BONUS since it was lacking in the Intel Intrinsics API.
+int _mm_test_all_zeros (__m128i a) @safe
+{
+    return _mm_testz_si128(a, _mm_set1_epi32(-1));
+}
+unittest
+{
+    __m128i A = _mm_set1_epi32(0);
+    __m128i B = _mm_set_epi32(0, 8, 0, 0);
+    assert(_mm_test_all_zeros(A) == 1);
+    assert(_mm_test_all_zeros(B) == 0);
+}
+
+/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`,
+/// and return 1 if the result is zero, otherwise return 0.
+int _mm_test_all_zeros (__m128i a, __m128i mask) @safe
+{
+    return _mm_testz_si128(a, mask); // it's really the same, but with a good name
+}
+
+/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `mask`, and set ZF to 1
+/// if the result is zero, otherwise set ZF to 0. Compute the bitwise NOT of `a` and then AND with
+/// `mask`, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and
+/// CF values are zero, otherwise return 0.
+int _mm_test_mix_ones_zeros (__m128i a, __m128i mask) @trusted
+{
+    return _mm_testnzc_si128(a, mask);
+}
+
+/// Compute the bitwise NOT of `a` and then AND with `b`, and return 1 if the
+/// result is zero, otherwise return 0.
+/// In other words, test if all bits masked by `b` are 1 in `a`.
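+///
+/// A minimal illustrative sketch (hand-written):
+/// ---
+/// __m128i val  = _mm_setr_epi32(0xff, 0, 0, 0);
+/// __m128i mask = _mm_setr_epi32(0x0f, 0, 0, 0);
+/// assert(_mm_testc_si128(val, mask) == 1); // every bit of mask is set in val
+/// ---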
+int _mm_testc_si128 (__m128i a, __m128i b) pure @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_ptestc128(cast(long2)a, cast(long2)b);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        // Acceptable since LDC 1.8 -O2
+        long2 s64 = vbicq_s64(cast(long2)b, cast(long2)a);
+        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+    }
+    else
+    {
+        __m128i c = ~a & b;
+        int[4] zero = [0, 0, 0, 0];
+        return c.array == zero;
+    }
+}
+unittest
+{
+    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
+    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x00);
+    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
+    assert(_mm_testc_si128(A, A) == 1);
+    assert(_mm_testc_si128(A, M1) == 0);
+    assert(_mm_testc_si128(A, M2) == 1);
+}
+
+/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
+/// and set ZF to 1 if the result is zero, otherwise set ZF to 0.
+/// Compute the bitwise NOT of `a` and then AND with `b`, and set CF to 1 if the
+/// result is zero, otherwise set CF to 0.
+/// Return 1 if both the ZF and CF values are zero, otherwise return 0.
+int _mm_testnzc_si128 (__m128i a, __m128i b) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_ptestnzc128(cast(long2)a, cast(long2)b);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        long2 s640 = vandq_s64(cast(long2)b, cast(long2)a);
+        long2 s641 = vbicq_s64(cast(long2)b, cast(long2)a);
+
+        return !( !(vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1))
+                | !(vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) );
+    }
+    else
+    {
+        __m128i c = a & b;
+        __m128i d = ~a & b;
+        int[4] zero = [0, 0, 0, 0];
+        return !( (c.array == zero) || (d.array == zero));
+    }
+}
+unittest
+{
+    __m128i A = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
+    __m128i M = _mm_setr_epi32(0x01, 0x40, 0x00, 0x00);
+    __m128i Z = _mm_setzero_si128();
+    assert(_mm_testnzc_si128(A, Z) == 0);
+    assert(_mm_testnzc_si128(A, M) == 1);
+    assert(_mm_testnzc_si128(A, A) == 0);
+}
+
+/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and `b`,
+/// and return 1 if the result is zero, otherwise return 0.
+/// In other words, test if all bits masked by `b` are 0 in `a`.
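+///
+/// A minimal illustrative sketch (hand-written):
+/// ---
+/// __m128i val  = _mm_setr_epi32(0xf0, 0, 0, 0);
+/// __m128i mask = _mm_setr_epi32(0x0f, 0, 0, 0);
+/// assert(_mm_testz_si128(val, mask) == 1); // val & mask is all zero
+/// ---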
+int _mm_testz_si128 (__m128i a, __m128i b) @trusted
+{
+    // PERF DMD
+    static if (GDC_with_SSE41)
+    {
+        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
+    }
+    else static if (LDC_with_SSE41)
+    {
+        return __builtin_ia32_ptestz128(cast(long2)a, cast(long2)b);
+    }
+    else static if (LDC_with_ARM64)
+    {
+        // Acceptable since LDC 1.8 -O2
+        long2 s64 = vandq_s64(cast(long2)a, cast(long2)b);
+        return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+    }
+    else
+    {
+        __m128i c = a & b;
+        int[4] zero = [0, 0, 0, 0];
+        return c.array == zero;
+    }
+}
+unittest
+{
+    __m128i A  = _mm_setr_epi32(0x01, 0x02, 0x04, 0xf8);
+    __m128i M1 = _mm_setr_epi32(0xfe, 0xfd, 0x00, 0x07);
+    __m128i M2 = _mm_setr_epi32(0x00, 0x00, 0x04, 0x00);
+    assert(_mm_testz_si128(A, A) == 0);
+    assert(_mm_testz_si128(A, M1) == 1);
+    assert(_mm_testz_si128(A, M2) == 0);
+}
+
diff --git a/src/VulkanRenderer b/src/VulkanRenderer
index d46741a..c42238a 160000
--- a/src/VulkanRenderer
+++ b/src/VulkanRenderer
@@ -1 +1 @@
-Subproject commit d46741a48033b5136fa189c1b80a574986e68f64
+Subproject commit c42238a456f5048c7d1b2d5ebd71ecf13bb10ece
diff --git a/src/dlib b/src/dlib
index c83ffab..493c17c 160000
--- a/src/dlib
+++ b/src/dlib
@@ -1 +1 @@
-Subproject commit c83ffabce69071a3e7a0af3f26aa420082eeda1f
+Subproject commit 493c17cba26952861ae4c5402f1676013b8317c6
diff --git a/src/gears/game.d b/src/gears/game.d
index 2b08381..5985462 100644
--- a/src/gears/game.d
+++ b/src/gears/game.d
@@ -1,4 +1,5 @@
 import dlib;
+/*
 public import vulkan : PlatformHandles;
 import vulkan : Destroy;
 import vulkan;
@@ -414,6 +415,7 @@ DrawRect(Game* g, f32 p0_x, f32 p0_y, f32 p1_x, f32 p1_y, Vec4 col)
     AddUIIndices(g);
 }
 
+/*
 // TODO: integrate this with vulkan again
 Model
 LoadModel(Game* g, string name)
@@ -697,3 +699,4 @@ ReadModel(Game* g, string name)
         const(char)[] mat_name = m3d.material[i].name[0 .. 
strlen(m3d.material[i].name)]; } } +*/ diff --git a/src/gears/game2.d b/src/gears/game2.d index cec3b92..7a80349 100644 --- a/src/gears/game2.d +++ b/src/gears/game2.d @@ -10,8 +10,6 @@ ImageView[256] TEXTURES; Buffer[256] MATERIALS; Buffer[256] MODEL_STATES; -DescIndices[DESC_SET_MAX] DESC_INDICES]; - struct GameState { RenderState rds; @@ -29,6 +27,7 @@ struct RenderState Pipeline[PID.Max] pipelines; DescSetLayout desc_layout_globals; DescSetLayout desc_layout_resources; + DescSetLayout desc_layout_state; DescSet[2] desc_set_globals; PipelineLayout pipeline_layout_pbr; @@ -41,13 +40,6 @@ struct RenderState Buffer globals_buffer; } -struct DescIndices -{ - u32 tex; - u32 mat; - u32 state; -} - struct ShaderGlobals { Vec4 ambient; @@ -57,41 +49,6 @@ struct ShaderGlobals f32 alpha = 0.0; } -struct MeshPart -{ - u32 mat; - u32 offset; - u32 length; - PushConst pc; - - alias pc this; -} - -struct ModelData -{ - MeshPart[] parts; - ModelState state; - Vertex[] v; - u32[] idx; - TextureData[] tex; -} - -struct TextureData -{ - u8[] name; - u8[] data; - u32 width; - u32 height; - u32 ch; -} - -struct Model -{ - Buffer v_buf; - Buffer i_buf; - MeshPart[] parts; -} - struct ModelRenderInfo { PushConst pc; @@ -105,15 +62,6 @@ struct ModelState Mat4 matrix; } -struct Material -{ - Vec4 ambient; - Vec4 diffuse; - Vec4 specular; - float shininess; - float alpha; -} - enum PBRMod : u32 { AlbedoValue = 0x0001, @@ -182,17 +130,14 @@ struct PushConst } } -struct Vertex -{ - Vec4 col; - Vec4 tangent; - Vec3 pos; - Vec3 normal; - Vec2 uv; -} - ModelData g_box; +void +RunCycle(GameState* g) +{ + +} + GameState InitGame(PlatformWindow* window) { @@ -225,7 +170,7 @@ Init(RenderState* rds, PlatformWindow* window) { binding: 2, descriptorType: DT.StorageImage, descriptorCount: 1, stageFlags: SS.All }, ]; - DescLayoutBinding[3] resource_bindings = [ + DescLayoutBinding[6] resource_bindings = [ { binding: 0, descriptorType: DT.Image, descriptorCount: 1, stageFlags: SS.All }, { binding: 1, descriptorType: DT.Image, descriptorCount: 1, stageFlags: SS.All }, { binding: 2, descriptorType: DT.Image, descriptorCount: 1, stageFlags: SS.All }, @@ -234,8 +179,12 @@ Init(RenderState* rds, PlatformWindow* window) { binding: 5, descriptorType: DT.Uniform, descriptorCount: 1, stageFlags: SS.All }, ]; + DescLayoutBinding[1] state_bindings = [ + { binding: 0, descriptorType: DT.Uniform, descriptorCount: 1, stageFlags: SS.All }, + ]; + Attribute[5] attributes = [ - { binding: 0, location: 0, format: FMT.RGBA_F32, offset: Vertex.col.offsetof }, + { binding: 0, location: 0, format: FMT.RGBA_F32, offset: Vertex.color.offsetof }, { binding: 0, location: 1, format: FMT.RGBA_F32, offset: Vertex.tangent.offsetof }, { binding: 0, location: 2, format: FMT.RGB_F32, offset: Vertex.pos.offsetof }, { binding: 0, location: 3, format: FMT.RGB_F32, offset: Vertex.normal.offsetof }, @@ -251,8 +200,9 @@ Init(RenderState* rds, PlatformWindow* window) rds.desc_layout_globals = CreateDescSetLayout(&rds.rd, global_bindings); rds.desc_layout_resources = CreateDescSetLayout(&rds.rd, resource_bindings); + rds.desc_layout_state = CreateDescSetLayout(&rds.rd, state_bindings); - rds.pipeline_layout_pbr = CreatePipelineLayout(&rds.rd, [rds.desc_layout_globals, rds.desc_layout_resources], PushConst.sizeof); + rds.pipeline_layout_pbr = CreatePipelineLayout(&rds.rd, [rds.desc_layout_globals, rds.desc_layout_resources, rds.desc_layout_state], PushConst.sizeof); foreach(i; 0 .. 
2) { @@ -273,8 +223,8 @@ Init(RenderState* rds, PlatformWindow* window) }; GfxPipelineInfo pbr_info = { - vertex_shader: LoadAssetData(&rds.frame_arenas[0], "shaders/pbr.vert.spv"), - frag_shader: LoadAssetData(&rds.frame_arenas[0], "shaders/pbr.frag.spv"), + vertex_shader: LoadFile(&rds.frame_arenas[0], "assets/shaders/pbr.vert.spv"), + frag_shader: LoadFile(&rds.frame_arenas[0], "assets/shaders/pbr.frag.spv"), input_rate: IR.Vertex, input_rate_stride: Vertex.sizeof, layout: rds.pipeline_layout_pbr, @@ -315,7 +265,7 @@ Init(RenderState* rds, PlatformWindow* window) CreateBuffer(&rds.rd, &rds.globals_buffer, BT.Uniform, ShaderGlobals.sizeof, false); - g_box = MakeBox + ModelData md = LoadGLTF(&rds.frame_arenas[0], "assets/models/DamagedHelmet.glb"); } PipelineID @@ -381,6 +331,7 @@ GetPBRMod(bool albedo = false, bool ambient = false, bool specular = false, bool } } +/* ModelData MakeBox(RenderState* rds, f32 width, f32 height, Vec4 col) { @@ -428,9 +379,9 @@ Model Upload(RenderState* rds, ModelData* data) { Model model; - u32[] tex_idx = Alloc!(&rds.frame_arenas[0], data.text.length); - u32[] mat_idx = Alloc!(&rds.frame_arenas[0], data.materials.length); - u32[] state_idx = Alloc!(&rds.frame_arenas[0], data.model_states.length); + u32[] tex_idx = Alloc!(u32)(&rds.frame_arenas[0], data.text.length); + u32[] mat_idx = Alloc!(u32)(&rds.frame_arenas[0], data.materials.length); + u32[] state_idx = Alloc!(u32)(&rds.frame_arenas[0], data.model_states.length); bool result = true; @@ -450,20 +401,18 @@ Upload(RenderState* rds, ModelData* data) { Buffer* buf = &rds.materials[rds.imat++]; CreateBuffer(&rds.rd, buf, BT.Uniform, Material.sizeof); - result = Transfer(&rds.rd, buf, ) + //result = Transfer(&rds.rd, buf, ) } for(u64 i = 0; i < data.model_states.length; i += 1) { Buffer* buf = &rds.model_states[rds.istate++]; - CreateBuffer(&rds) + //CreateBuffer(&rds) } model.parts = data.parts; } - -DescIndices* -GetDescIndices() +*/ unittest { diff --git a/src/shaders/structures.layout b/src/shaders/structures.layout index 5e18426..fafab1a 100644 --- a/src/shaders/structures.layout +++ b/src/shaders/structures.layout @@ -44,7 +44,8 @@ layout (set = 2, binding = 4) uniform MaterialData { float shininess; float alpha; } Material; -layout (set = 2, binding = 5) uniform ModelState { + +layout (set = 3, binding = 0) uniform ModelState { mat4 model_matrix; } State;